Example no. 1
    def peek(self, app, remote_filename, local_file, offset=0, size=None):
        job = app.execution
        assert 'ssh_remote_folder' in job, \
            "Missing attribute `ssh_remote_folder` on `Job` instance" \
            " passed to `PbsLrms.peek`."

        if size is None:
            size = sys.maxsize

        _filename_mapping = generic_filename_mapping(
            job.lrms_jobname, job.lrms_jobid, remote_filename)
        _remote_filename = os.path.join(
            job.ssh_remote_folder, _filename_mapping)

        try:
            self.transport.connect()
            remote_handler = self.transport.open(
                _remote_filename, mode='r', bufsize=-1)
            remote_handler.seek(offset)
            data = remote_handler.read(size)
        except Exception as ex:
            log.error("Could not read remote file '%s': %s: %s",
                      _remote_filename, ex.__class__.__name__, str(ex))
            # nothing was read, so `data` is undefined -- bail out here
            return

        try:
            local_file.write(data)
        except (TypeError, AttributeError):
            output_file = open(local_file, 'w+b')
            output_file.write(data)
            output_file.close()
        log.debug('... Done.')
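A minimal usage sketch of the `peek` call above (hypothetical: `lrms` stands for a backend instance with a connected transport, `app` for a submitted application, and `stdout.txt` is a made-up remote file name; none of these appear in the snippet itself):

    # Hypothetical usage sketch -- `lrms` and `app` are assumed to already exist.
    import sys

    # Dump the first 4 KiB of the job's remote stdout to the terminal;
    # `local_file` may be a writable file-like object ...
    lrms.peek(app, 'stdout.txt', sys.stdout, offset=0, size=4096)

    # ... or a plain path, in which case the snippet opens the file itself.
    lrms.peek(app, 'stdout.txt', '/tmp/stdout.head', offset=0, size=4096)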
Example no. 2
 def _cleanup_terminating_task(self, app, pid, termstatus=None):
     app.execution.state = Run.State.TERMINATING
     if termstatus is not None:
         app.execution.returncode = termstatus
     if pid in self.job_infos:
         self.job_infos[pid]['terminated'] = True
         if app.requested_memory is not None:
             assert (app.requested_memory
                     == self.job_infos[pid]['requested_memory'])
             self.available_memory += app.requested_memory
     wrapper_filename = posixpath.join(
         app.execution.lrms_execdir,
         ShellcmdLrms.WRAPPER_DIR,
         ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
     try:
         log.debug(
             "Reading resource utilization from wrapper file `%s` for task %s ...",
             wrapper_filename, app)
         with self.transport.open(wrapper_filename, 'r') as wrapper_file:
             outcome = self._parse_wrapper_output(wrapper_file)
             app.execution.update(outcome)
             if termstatus is None:
                 app.execution.returncode = outcome.returncode
     except Exception as err:
         msg = ("Could not open wrapper file `{0}` for task `{1}`: {2}"
                .format(wrapper_filename, app, err))
         log.warning("%s -- Termination status and resource utilization fields will not be set.", msg)
         raise gc3libs.exceptions.InvalidValue(msg)
     finally:
         self._delete_job_resource_file(pid)
Example no. 3
    def get_results(self, app, download_dir,
                    overwrite=False, changed_only=True):
        if app.output_base_url is not None:
            raise gc3libs.exceptions.DataStagingError(
                "Retrieval of output files to non-local destinations"
                " is not supported in the ShellCmd backend.")

        self.transport.connect()
        # Make list of files to copy, in the form of (remote_path,
        # local_path) pairs.  This entails walking the
        # `Application.outputs` list to expand wildcards and
        # directory references.
        stageout = list()
        for remote_relpath, local_url in app.outputs.iteritems():
            if local_url.scheme in ['swift', 'swt', 'swifts', 'swts']:
                continue
            local_relpath = local_url.path
            if remote_relpath == gc3libs.ANY_OUTPUT:
                remote_relpath = ''
                local_relpath = ''
            stageout += _make_remote_and_local_path_pair(
                self.transport, app, remote_relpath,
                download_dir, local_relpath)

        # copy back all files, renaming them to adhere to the
        # ArcLRMS convention
        log.debug("Downloading job output into '%s' ...", download_dir)
        for remote_path, local_path in stageout:
            # ignore missing files (this is what ARC does too)
            self.transport.get(remote_path, local_path,
                               ignore_nonexisting=True,
                               overwrite=overwrite,
                               changed_only=changed_only)
        return
Example no. 4
 def _cleanup_terminating_task(self, app, pid, termstatus=None):
     app.execution.state = Run.State.TERMINATING
     if termstatus is not None:
         app.execution.returncode = termstatus
     if pid in self.job_infos:
         self.job_infos[pid]['terminated'] = True
         if app.requested_memory is not None:
             assert (app.requested_memory == self.job_infos[pid]
                     ['requested_memory'])
             self.available_memory += app.requested_memory
     wrapper_filename = posixpath.join(app.execution.lrms_execdir,
                                       ShellcmdLrms.WRAPPER_DIR,
                                       ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
     try:
         log.debug(
             "Reading resource utilization from wrapper file `%s` for task %s ...",
             wrapper_filename, app)
         with self.transport.open(wrapper_filename, 'r') as wrapper_file:
             outcome = self._parse_wrapper_output(wrapper_file)
             app.execution.update(outcome)
             if termstatus is None:
                 app.execution.returncode = outcome.returncode
     except Exception as err:
         msg = (
             "Could not open wrapper file `{0}` for task `{1}`: {2}".format(
                 wrapper_filename, app, err))
         log.warning(
             "%s -- Termination status and resource utilization fields will not be set.",
             msg)
         raise gc3libs.exceptions.InvalidValue(msg)
     finally:
         self._delete_job_resource_file(pid)
Example no. 5
    def peek(self, app, remote_filename, local_file, offset=0, size=None):
        job = app.execution
        assert 'ssh_remote_folder' in job, \
            "Missing attribute `ssh_remote_folder` on `Job` instance" \
            " passed to `PbsLrms.peek`."

        if size is None:
            size = sys.maxsize

        _filename_mapping = generic_filename_mapping(job.lrms_jobname,
                                                     job.lrms_jobid,
                                                     remote_filename)
        _remote_filename = os.path.join(job.ssh_remote_folder,
                                        _filename_mapping)

        try:
            self.transport.connect()
            remote_handler = self.transport.open(_remote_filename,
                                                 mode='r',
                                                 bufsize=-1)
            remote_handler.seek(offset)
            data = remote_handler.read(size)
        except Exception as ex:
            log.error("Could not read remote file '%s': %s: %s",
                      _remote_filename, ex.__class__.__name__, str(ex))
            # nothing was read, so `data` is undefined -- bail out here
            return

        try:
            local_file.write(data)
        except (TypeError, AttributeError):
            output_file = open(local_file, 'w+b')
            output_file.write(data)
            output_file.close()
        log.debug('... Done.')
Example no. 6
 def _parse_stat_output(self, stdout):
     """
     Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
     """
     jobstatus = dict()
     if stdout.strip() == '':
         # if stdout is empty and `squeue -j` exitcode is 0, then
         # the job has recently completed;
         #
         # if the job has been removed from the controllers'
         # memory, then `squeue -j` exits with code 1
         jobstatus['state'] = Run.State.TERMINATING
     else:
         # parse stdout
         jobid, state, reason = stdout.split('^')
         log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                   state)
         if state in ['PENDING', 'CONFIGURING']:
             # XXX: see above for a discussion of whether 'CONFIGURING'
             # should be grouped with 'RUNNING' or not; here it's
             # likely the correct choice to group it with 'PENDING' as
             # the "configuring" phase may last a few minutes during
             # which the job is not yet really running.
             jobstatus['state'] = Run.State.SUBMITTED
         elif state in ['RUNNING', 'COMPLETING']:
             jobstatus['state'] = Run.State.RUNNING
         elif state in ['SUSPENDED']:
             jobstatus['state'] = Run.State.STOPPED
         elif state in ['COMPLETED', 'CANCELLED', 'FAILED',
                        'NODE_FAIL', 'PREEMPTED', 'TIMEOUT']:
             jobstatus['state'] = Run.State.TERMINATING
         else:
             jobstatus['state'] = Run.State.UNKNOWN
     return jobstatus
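As a quick illustration of the mapping above, a made-up `^`-separated `squeue` line for a running job and the field the parser extracts (the job id is hypothetical):

    # Hypothetical input for the parser above: one '^'-separated squeue line.
    sample_stdout = "1234^RUNNING^None"
    jobid, state, reason = sample_stdout.split('^')
    # state == 'RUNNING', which the branches above map to Run.State.RUNNING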
Example no. 7
    def execute_command(self, cmdline):
        """
        Scan the given command-line and return a predefined result if
        *any* word in command position matches one of the keys in the
        `expected_answer` argument to the class constructor.

        Note that the parsing of command-line is based on regular
        expressions and is thus only an approximation at ``sh``
        syntax.  It will *certainly* fail on some command-lines, but
        there is no way around this short of writing a complete ``sh``
        parser just for this function.  (And no, Python's module
        `shlex` will not do the job -- been there, done that.)
        """

        log.debug("scanning command-line <<<%s>>>", cmdline)

        for match in self._COMMAND_RE.finditer(cmdline):
            cmd = match.group("cmd")
            if cmd in self.expected_answer:
                reply = self.expected_answer[cmd]
                log.debug("returning programmed reply for '%s': %s", cmd, reply)
                return reply

        # if everything else failed, do run the command-line ...
        return LocalTransport.execute_command(self, cmdline)
Example no. 8
 def validate_data(self, data_file_list):
     """
     Supported protocols: file, gsiftp, srm, http, https
     """
     for url in data_file_list:
         log.debug("Resource %s: checking URL '%s' ..." % (self.name, url))
         if url.scheme not in ['srm', 'lfc', 'file', 'http', 'gsiftp', 'https']:
             return False
     return True
Example no. 9
 def cancel_job(self, app):
     controller, job = self._get_job_and_controller(app.execution.lrms_jobid)
     try:
         log.debug("Calling arc.JobController.Cancel(job)")
         if not controller.CancelJob(job):
             raise gc3libs.exceptions.LRMSError('arc.JobController.Cancel returned False')
     except Exception as ex:
         gc3libs.log.error('Failed while killing job. Error type %s, message %s' % (ex.__class__,str(ex)))
         raise gc3libs.exceptions.LRMSError('Failed while killing job. Error type %s, message %s' % (ex.__class__,str(ex)))
Example no. 10
 def _update_job_resource_file(self, pid, resources):
     """
     Update file in `self.resource_dir/PID` with `resources`.
     """
     self.transport.connect()
     # XXX: We should check for exceptions!
     log.debug("Updating resource file for pid %s", pid)
     with self.transport.open(posixpath.join(self.resource_dir, str(pid)),
                              'wb') as fp:
         pickle.dump(resources, fp, -1)
Example no. 11
 def _update_job_resource_file(self, pid, resources):
     """
     Update file in `self.resource_dir/PID` with `resources`.
     """
     self.transport.connect()
     # XXX: We should check for exceptions!
     log.debug("Updating resource file for pid %s", pid)
     with self.transport.open(
             posixpath.join(self.resource_dir, str(pid)), 'wb') as fp:
         pickle.dump(resources, fp, -1)
Example no. 12
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug("Process with PID %s found."
                      " Checking its running status ...", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
                # if `requested_walltime` is set, enforce it as a
                # running time limit
                if app.requested_walltime is not None:
                    exit_code2, stdout2, stderr2 = self.transport.execute_command(
                        "ps -p %d -o etimes=" % pid)
                    if exit_code2 != 0:
                        # job terminated already, do cleanup and return
                        self._cleanup_terminating_task(app, pid)
                        return app.execution.state
                    cancel = False
                    elapsed = Duration(stdout2.strip() + 'seconds')
                    if elapsed > self.max_walltime:
                        log.warning("Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.",
                                    app, elapsed.to_timedelta(), self.max_walltime, self.name)
                        cancel = True
                    if elapsed > app.requested_walltime:
                        log.warning("Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.",
                                    app, elapsed.to_timedelta(), app.requested_walltime)
                        cancel = True
                    if cancel:
                        self.cancel_job(app)
                        # set signal to SIGTERM in termination status
                        self._cleanup_terminating_task(app, pid, termstatus=(15, -1))
                        return app.execution.state
        else:
            log.debug(
                "Process with PID %d not found,"
                " assuming task %s has finished running.",
                pid, app)
            self._cleanup_terminating_task(app, pid)

        self._get_persisted_resource_state()
        return app.execution.state
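A small sketch of the elapsed-time comparison used above, assuming `gc3libs.quantity.Duration` is importable and accepts the same `'<seconds>seconds'` string form that the code builds from the `ps -o etimes=` output; the concrete values are made up:

    # Hypothetical sketch of the walltime check; values are made up.
    from gc3libs.quantity import Duration

    elapsed = Duration('7210' + 'seconds')   # e.g. parsed from `ps -o etimes=`
    requested = Duration('7200seconds')      # e.g. a 2-hour `requested_walltime`
    if elapsed > requested:
        pass  # the backend above would cancel the job and mark it TERMINATING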
Example no. 13
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug(
                "Process with PID %s found."
                " Checking its running status", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
        else:
            log.debug(
                "Process with PID %d not found."
                " Checking wrapper file ...", pid)
            app.execution.state = Run.State.TERMINATING
            if pid in self.job_infos:
                self.job_infos[pid]['terminated'] = True
                assert (app.requested_memory == self.job_infos[pid]
                        ['requested_memory'])
                if app.requested_memory:
                    self.available_memory += app.requested_memory
            wrapper_filename = posixpath.join(
                app.execution.lrms_execdir, ShellcmdLrms.WRAPPER_DIR,
                ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
            try:
                wrapper_file = self.transport.open(wrapper_filename, 'r')
            except Exception as err:
                self._delete_job_resource_file(pid)
                raise gc3libs.exceptions.InvalidValue(
                    "Could not open wrapper file '%s' for task '%s': %s" %
                    (wrapper_filename, app, err),
                    do_log=True)
            try:
                outcome = self._parse_wrapper_output(wrapper_file)
                app.execution.returncode = \
                    Run.shellexit_to_returncode(int(outcome.ReturnCode))
                self._delete_job_resource_file(pid)
            finally:
                wrapper_file.close()

        self._get_persisted_resource_state()
        return app.execution.state
Example no. 14
 def __run_command_and_parse_output(self, cmd, parser, kind='accounting'):
     log.debug("Checking remote job %s info with `%s` ...", kind, cmd)
     exit_code, stdout, stderr = self.transport.execute_command(cmd)
     if exit_code == 0:
         return parser(stdout, stderr)
     else:
         raise gc3libs.exceptions.AuxiliaryCommandError(
             "Failed running %s command `%s`:"
             " exit code: %d, stderr: '%s'"
             % (kind, cmd, exit_code, stderr),
             do_log=True)
Example no. 15
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug("Process with PID %s found."
                      " Checking its running status", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
        else:
            log.debug(
                "Process with PID %d not found."
                " Checking wrapper file ...", pid)
            app.execution.state = Run.State.TERMINATING
            if pid in self.job_infos:
                self.job_infos[pid]['terminated'] = True
                assert (app.requested_memory
                        == self.job_infos[pid]['requested_memory'])
                if app.requested_memory:
                    self.available_memory += app.requested_memory
            wrapper_filename = posixpath.join(
                app.execution.lrms_execdir,
                ShellcmdLrms.WRAPPER_DIR,
                ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
            try:
                wrapper_file = self.transport.open(wrapper_filename, 'r')
            except Exception as err:
                self._delete_job_resource_file(pid)
                raise gc3libs.exceptions.InvalidValue(
                    "Could not open wrapper file '%s' for task '%s': %s"
                    % (wrapper_filename, app, err), do_log=True)
            try:
                outcome = self._parse_wrapper_output(wrapper_file)
                app.execution.returncode = \
                    Run.shellexit_to_returncode(int(outcome.ReturnCode))
                self._delete_job_resource_file(pid)
            finally:
                wrapper_file.close()

        self._get_persisted_resource_state()
        return app.execution.state
Example no. 16
 def __run_command_and_parse_output(self, cmd, parser, kind='accounting'):
     log.debug("Checking remote job %s info with `%s` ...", kind, cmd)
     exit_code, stdout, stderr = self.transport.execute_command(cmd)
     if exit_code == 0:
         return parser(stdout, stderr)
     else:
         raise gc3libs.exceptions.AuxiliaryCommandError(
             "Failed running %s command `%s`:"
             " exit code: %d, stderr: '%s'" %
             (kind, cmd, exit_code, stderr),
             do_log=True)
Example no. 17
 def free(self, app):
     controller, job = self._get_job_and_controller(app.execution.lrms_jobid)
     log.debug("Calling JobController.CleanJob")
     if not controller.CleanJob(job):
         log.error("arc1.JobController.CleanJob returned False for ARC job ID '%s'",
                   app.execution.lrms_jobid)
     # XXX: this is necessary as the other component of arc library seems to refer to the job.xml file
     # remove Job from job.xml file
     log.debug("Removing job '%s' from jobfile '%s'",
               app, gc3libs.Default.ARC_JOBLIST_LOCATION)
     job.RemoveJobsFromFile(gc3libs.Default.ARC_JOBLIST_LOCATION, [job.IDFromEndpoint])
Example no. 18
    def _iterjobs(self):
        """
        Iterate over all jobs.
        """

        self._get_JobSupervisor_and_JobController()

        for c in self._controllers:
            log.debug("Calling JobController.GetJobInformation() ...")
            c.GetJobInformation()
            log.debug('... controller returned %d jobs' % len(c.GetJobs()))
        return itertools.chain(* [c.GetJobs() for c in self._controllers])
Example no. 19
 def _delete_job_resource_file(self, pid):
     """
     Delete `self.resource_dir/PID` file
     """
     self.transport.connect()
     log.debug("Deleting resource file for pid %s ...", pid)
     pidfile = posixpath.join(self.resource_dir, str(pid))
     try:
         self.transport.remove(pidfile)
     except Exception as err:
         log.debug("Ignored error deleting file `%s`: %s: %s", pidfile,
                   err.__class__.__name__, err)
Example no. 20
 def _delete_job_resource_file(self, pid):
     """
     Delete `self.resource_dir/PID` file
     """
     self.transport.connect()
     log.debug("Deleting resource file for pid %s ...", pid)
     pidfile = posixpath.join(self.resource_dir, str(pid))
     try:
         self.transport.remove(pidfile)
     except Exception as err:
         log.debug(
             "Ignored error deleting file `%s`: %s: %s",
             pidfile, err.__class__.__name__, err)
Example no. 21
 def update_job_state(self, app):
     """
     Advance `app`'s status to the next one
     in the normal execution graph.
     """
     log.debug("No-Op backend updating state of Task %s ...", app)
     transitions = self.transition_graph[app.execution.state]
     log.debug("Task %s transitions: %s.", app, str.join(", ", [
         ("with probability %g to state %s" % (prob, state))
         for prob, state in transitions.items() if prob > 0
     ]))
     dice = random()
     log.debug("Rolled dice, got %g result", dice)
     for prob, state in sorted(transitions.items()):
         if dice < prob:
             log.debug(
                 "Task %s transitions to state '%s'", app, state)
             # update resource state based on old and new app state
             if app.execution.state == Run.State.SUBMITTED:
                 self.queued -= 1
                 self.user_queued -= 1
             if app.execution.state == Run.State.RUNNING:
                 self.user_run -= 1
             if state == Run.State.RUNNING:
                 self.user_run += 1
             if state == Run.State.TERMINATING:
                 self.free_slots += app.requested_cores
                 if app.requested_memory:
                     self.available_memory += app.requested_memory
             # set the new app state
             app.execution.state = state
             break
         else:
             dice -= prob
     return app.execution.state
Example no. 22
 def _parse_stat_output(self, stdout, stderr):
     """
     Parse output of ``squeue --noheader -o %i:%T:%r``.
     """
     state = Run.State.UNKNOWN
     for line in stdout.split('\n'):
         line = line.strip()
         # sites might wrap basic SLURM commands like `squeue` or
         # `sacct` to provide additional information to users; we
         # need to tell the actual SLURM output from the sites' own
         # info; fortunately, SLURM's `--format` option allows
         # arbitrary string prefixes which we can leverage to tag
         # the interesting output lines.
         if line.startswith('GC3Pie^'):
             # parse stdout
             _, job_id, job_state_code, reason = stdout.split('^')
             log.debug("translating SLURM state `%s` to gc3libs.Run.State",
                       job_state_code)
             if job_state_code in ['PENDING', 'CONFIGURING']:
                 # XXX: see comments in `count_jobs` for a discussion
                 # of whether 'CONFIGURING' should be grouped with
                 # 'RUNNING' or not; here it's likely the correct
                 # choice to group it with 'PENDING' as the
                 # "configuring" phase may last a few minutes during
                 # which the job is not yet really running.
                 state = Run.State.SUBMITTED
             elif job_state_code in ['RUNNING', 'COMPLETING']:
                 state = Run.State.RUNNING
             elif job_state_code in ['SUSPENDED']:
                 state = Run.State.STOPPED
             elif job_state_code in [
                     'COMPLETED', 'CANCELLED', 'FAILED', 'NODE_FAIL',
                     'PREEMPTED', 'TIMEOUT'
             ]:
                 state = Run.State.TERMINATING
             else:
                 state = Run.State.UNKNOWN
             break
     else:
         # No `GC3Pie^` line found in output:
         #
         # * If stdout is empty and `squeue -j` exitcode is 0, then
         # the job has recently completed (but we still need to
         # call `sacct` to reap the termination status).
         #
         # * If the job has been removed from the controllers'
         # memory, then `squeue -j` exits with code 1.
         state = Run.State.TERMINATING
     return self._stat_result(state, None)  # no term status info
Example no. 23
    def _get_targets(self):
        """
        Wrapper around `arc.TargetGenerator.GetTargets()`.
        """
        # tg = arc.TargetGenerator(self._usercfg, 1)
        # return tg.FoundTargets()
        # This method should spawn the ldapsearch to update the ExecutionTarget information
        log.debug('Calling arc.TargetGenerator.RetrieveExecutionTargets')

        self._get_JobSupervisor_and_JobController()

        self._target_generator.RetrieveExecutionTargets()

        log.debug('Calling arc.TargetGenerator.GetExecutionTargets()')
        return self._target_generator.GetExecutionTargets()
Example no. 24
File: pbs.py Project: fliem/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     # parse `qstat` output
     pbs_status = stdout.split()[4]
     log.debug("translating PBS/Torque's `qstat` code"
               " '%s' to gc3libs.Run.State", pbs_status)
     if pbs_status in ['Q', 'W']:
         state = Run.State.SUBMITTED
     elif pbs_status in ['R']:
         state = Run.State.RUNNING
     elif pbs_status in ['S', 'H', 'T'] or 'qh' in pbs_status:
         state = Run.State.STOPPED
     elif pbs_status in ['C', 'E', 'F']:
         state = Run.State.TERMINATING
     else:
         state = Run.State.UNKNOWN
     return self._stat_result(state, None)  # no term status info
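For reference, a sketch of the single `qstat` line the parser above expects; the field values are made up, but column index 4 is the state code:

    # Hypothetical `qstat` output line: Job id, Name, User, Time Use, S, Queue
    sample_stdout = "12345.pbsserver  myjob  alice  00:01:02  R  batch"
    assert sample_stdout.split()[4] == 'R'   # mapped to Run.State.RUNNING above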
Example no. 25
 def _read_job_resource_file(self, pid):
     """
     Get resource information on job with pid `pid`, if it
     exists. Returns None if it does not exist.
     """
     self.transport.connect()
     log.debug("Reading resource file for pid %s", pid)
     jobinfo = None
     fname = posixpath.join(self.resource_dir, str(pid))
     with self.transport.open(fname, 'rb') as fp:
         try:
             jobinfo = pickle.load(fp)
         except Exception as ex:
             log.error("Unable to read remote resource file %s: %s", fname,
                       ex)
             raise
     return jobinfo
Example no. 26
 def _parse_stat_output(self, stdout, stderr):
     # parse `qstat` output
     pbs_status = stdout.split()[4]
     log.debug(
         "translating PBS/Torque's `qstat` code"
         " '%s' to gc3libs.Run.State", pbs_status)
     if pbs_status in ['Q', 'W']:
         state = Run.State.SUBMITTED
     elif pbs_status in ['R']:
         state = Run.State.RUNNING
     elif pbs_status in ['S', 'H', 'T'] or 'qh' in pbs_status:
         state = Run.State.STOPPED
     elif pbs_status in ['C', 'E', 'F']:
         state = Run.State.TERMINATING
     else:
         state = Run.State.UNKNOWN
     return self._stat_result(state, None)  # no term status info
Example no. 27
 def _read_job_resource_file(self, pid):
     """
     Get resource information on job with pid `pid`, if it
     exists. Returns None if it does not exist.
     """
     self.transport.connect()
     log.debug("Reading resource file for pid %s", pid)
     jobinfo = None
     fname = posixpath.join(self.resource_dir, str(pid))
     with self.transport.open(fname, 'rb') as fp:
         try:
             jobinfo = pickle.load(fp)
         except Exception as ex:
             log.error("Unable to read remote resource file %s: %s",
                       fname, ex)
             raise
     return jobinfo
Example no. 28
File: sge.py Project: uzh/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     ge_status_code = stdout.split()[4]
     log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State", ge_status_code)
     if ge_status_code in ["s", "S", "T"] or ge_status_code.startswith("h"):
         state = Run.State.STOPPED
     elif "qw" in ge_status_code:
         state = Run.State.SUBMITTED
     elif "r" in ge_status_code or "R" in ge_status_code or "t" in ge_status_code:
         state = Run.State.RUNNING
     elif ge_status_code == "E":  # error condition
         state = Run.State.TERMINATING
     else:
         log.warning("unknown SGE job status '%s', returning `UNKNOWN`", ge_status_code)
         state = Run.State.UNKNOWN
     # to get the exit status information we'll have to parse
     # `qacct` output so put ``None`` here
     return self._stat_result(state, None)
Example no. 29
    def get_resource_status(self):
        self.updated = False
        try:
            self.running_kernel
        except AttributeError:
            self._gather_machine_specs()

        self.job_infos = self._get_persisted_resource_state()
        used_memory = self._compute_used_memory(self.job_infos)
        self.available_memory = self.total_memory - used_memory
        self.updated = True
        log.debug(
            "Recovered resource information from files in %s:"
            " available memory: %s, memory used by jobs: %s",
            self.resource_dir,
            self.available_memory.to_str('%g%s', unit=Memory.MB, conv=float),
            used_memory.to_str('%g%s', unit=Memory.MB, conv=float))
        return self
Example no. 30
    def get_resource_status(self):
        self.updated = False
        try:
            self.running_kernel
        except AttributeError:
            self._gather_machine_specs()

        self.job_infos = self._get_persisted_resource_state()
        used_memory = self._compute_used_memory(self.job_infos)
        self.available_memory = self.total_memory - used_memory
        self.updated = True
        log.debug("Recovered resource information from files in %s:"
                  " available memory: %s, memory used by jobs: %s",
                  self.resource_dir,
                  self.available_memory.to_str('%g%s',
                                               unit=Memory.MB,
                                               conv=float),
                  used_memory.to_str('%g%s', unit=Memory.MB, conv=float))
        return self
Example no. 31
    def _parse_stat_output(self, stdout):
        job_status = stdout.split()[4]
        log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
                  job_status)

        jobstatus = dict()
        if job_status in ['s', 'S', 'T'] or job_status.startswith('h'):
            jobstatus['state'] = Run.State.STOPPED
        elif 'qw' in job_status:
            jobstatus['state'] = Run.State.SUBMITTED
        elif 'r' in job_status or 'R' in job_status or 't' in job_status:
            jobstatus['state'] = Run.State.RUNNING
        elif job_status == 'E':  # error condition
            jobstatus['state'] = Run.State.TERMINATING
        else:
            log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                        job_status)
            jobstatus['state'] = Run.State.UNKNOWN
        return jobstatus
Example no. 32
    def _parse_stat_output(self, stdout):
        job_status = stdout.split()[4]
        log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
                  job_status)

        jobstatus = dict()
        if job_status in ['s', 'S', 'T'] or job_status.startswith('h'):
            jobstatus['state'] = Run.State.STOPPED
        elif 'qw' in job_status:
            jobstatus['state'] = Run.State.SUBMITTED
        elif 'r' in job_status or 'R' in job_status or 't' in job_status:
            jobstatus['state'] = Run.State.RUNNING
        elif job_status == 'E':  # error condition
            jobstatus['state'] = Run.State.TERMINATING
        else:
            log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                        job_status)
            jobstatus['state'] = Run.State.UNKNOWN
        return jobstatus
Example no. 33
    def _parse_stat_output(self, stdout):
        # check that passed object obeys contract

        # parse `qstat` output
        job_status = stdout.split()[4]
        jobstatus = dict()
        log.debug("translating PBS/Torque's `qstat` code "
                  "'%s' to gc3libs.Run.State", job_status)
        if job_status in ['Q', 'W']:
            jobstatus['state'] = Run.State.SUBMITTED
        elif job_status in ['R']:
            jobstatus['state'] = Run.State.RUNNING
        elif job_status in ['S', 'H', 'T'] or 'qh' in job_status:
            jobstatus['state'] = Run.State.STOPPED
        elif job_status in ['C', 'E', 'F']:
            jobstatus['state'] = Run.State.TERMINATING
        else:
            jobstatus['state'] = Run.State.UNKNOWN

        return jobstatus
Example no. 34
 def _get_persisted_resource_state(self):
     """
     Get information on total resources from the files stored in
     `self.resource_dir`. Returns a dictionary {PID: {key: value}}
     with information for each job that is associated with a running
     process.
     """
     self.transport.connect()
     pidfiles = self.transport.listdir(self.resource_dir)
     log.debug("Checking status of the following PIDs: %s",
               str.join(", ", pidfiles))
     job_infos = {}
     for pid in pidfiles:
         job = self._read_job_resource_file(pid)
         if job:
             job_infos[pid] = job
         else:
             # Process not found, ignore it
             continue
     return job_infos
Example no. 35
 def _get_persisted_resource_state(self):
     """
     Get information on total resources from the files stored in
     `self.resource_dir`. Returns a dictionary {PID: {key: value}}
     with information for each job that is associated with a running
     process.
     """
     self.transport.connect()
     pidfiles = self.transport.listdir(self.resource_dir)
     log.debug("Checking status of the following PIDs: %s",
               str.join(", ", pidfiles))
     job_infos = {}
     for pid in pidfiles:
         job = self._read_job_resource_file(pid)
         if job:
             job_infos[pid] = job
         else:
             # Process not found, ignore it
             continue
     return job_infos
Example no. 36
    def get_results(self,
                    app,
                    download_dir,
                    overwrite=False,
                    changed_only=True):
        if app.output_base_url is not None:
            raise gc3libs.exceptions.UnrecoverableDataStagingError(
                "Retrieval of output files to non-local destinations"
                " is not supported (yet).")

        job = app.execution
        try:
            self.transport.connect()
            # Make list of files to copy, in the form of (remote_path,
            # local_path) pairs.  This entails walking the
            # `Application.outputs` list to expand wildcards and
            # directory references.
            stageout = list()
            for remote_relpath, local_url in app.outputs.items():
                local_relpath = local_url.path
                if remote_relpath == gc3libs.ANY_OUTPUT:
                    remote_relpath = ''
                    local_relpath = ''
                stageout += _make_remote_and_local_path_pair(
                    self.transport, job, remote_relpath, download_dir,
                    local_relpath)

            # copy back all files, renaming them to adhere to the
            # ArcLRMS convention
            log.debug("Downloading job output into '%s' ...", download_dir)
            for remote_path, local_path in stageout:
                # ignore missing files (this is what ARC does too)
                self.transport.get(remote_path,
                                   local_path,
                                   ignore_nonexisting=True,
                                   overwrite=overwrite,
                                   changed_only=changed_only)
            return

        except:
            raise
Example no. 37
File: sge.py Project: imcf/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     ge_status_code = stdout.split()[4]
     log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
               ge_status_code)
     if (ge_status_code in ['s', 'S', 'T']
             or ge_status_code.startswith('h')):
         state = Run.State.STOPPED
     elif 'qw' in ge_status_code:
         state = Run.State.SUBMITTED
     elif ('r' in ge_status_code or 'R' in ge_status_code
           or 't' in ge_status_code):
         state = Run.State.RUNNING
     elif ge_status_code == 'E':  # error condition
         state = Run.State.TERMINATING
     else:
         log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                     ge_status_code)
         state = Run.State.UNKNOWN
     # to get the exit status information we'll have to parse
     # `qacct` output so put ``None`` here
     return self._stat_result(state, None)
Example no. 38
 def _lsf_state_to_gc3pie_state(stat):
     log.debug("Translating LSF's `bjobs` status '%s' to gc3libs.Run.State ...", stat)
     try:
         return {
         # LSF 'stat' mapping:
             'PEND'  : Run.State.SUBMITTED,
             'RUN'   : Run.State.RUNNING,
             'PSUSP' : Run.State.STOPPED,
             'USUSP' : Run.State.STOPPED,
             'SSUSP' : Run.State.STOPPED,
             # DONE = successful termination
             'DONE'  : Run.State.TERMINATING,
             # EXIT = job was killed / exit forced
             'EXIT'  : Run.State.TERMINATING,
             # ZOMBI = job "killed" and unreachable
             'ZOMBI' : Run.State.TERMINATING,
             'UNKWN' : Run.State.UNKNOWN,
             }[stat]
     except KeyError:
         log.warning("Unknown LSF job status '%s', returning `UNKNOWN`", stat)
         return Run.State.UNKNOWN
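A brief usage sketch of the translation above (assuming the function is reachable as a plain function or staticmethod and that `Run` is in scope, as in the snippet):

    # Hypothetical usage of the LSF state translation above.
    assert _lsf_state_to_gc3pie_state('RUN') == Run.State.RUNNING
    assert _lsf_state_to_gc3pie_state('DONE') == Run.State.TERMINATING
    assert _lsf_state_to_gc3pie_state('BOGUS') == Run.State.UNKNOWN  # falls back with a warning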
Example no. 39
    def _parse_stat_output(self, stdout):
        # check that passed object obeys contract

        # parse `qstat` output
        job_status = stdout.split()[4]
        jobstatus = dict()
        log.debug(
            "translating PBS/Torque's `qstat` code "
            "'%s' to gc3libs.Run.State", job_status)
        if job_status in ['Q', 'W']:
            jobstatus['state'] = Run.State.SUBMITTED
        elif job_status in ['R']:
            jobstatus['state'] = Run.State.RUNNING
        elif job_status in ['S', 'H', 'T'] or 'qh' in job_status:
            jobstatus['state'] = Run.State.STOPPED
        elif job_status in ['C', 'E', 'F']:
            jobstatus['state'] = Run.State.TERMINATING
        else:
            jobstatus['state'] = Run.State.UNKNOWN

        return jobstatus
Example no. 40
    def _get_job_and_controller(self, jobid):
        """
        Return a pair `(c, j)` where `j` is the `arc.Job` object
        corresponding to the given `jobid` and `c` is the
        corresponding `arc.JobController`.
        """

        """
        jobmaster = arc.JobSupervisor(usercfg, []);
        jobcontrollers = jobmaster.GetJobControllers();
        """

        self._iterjobs()

        for c in self._controllers:
            log.debug("Calling JobController.GetJobs in get_job_and_controller")
            jl = c.GetJobs()
            for j in jl:
                if j.JobID.str() == jobid:
                    # found, clean remote job sessiondir
                    return (c, j)
        raise KeyError("No job found with job ID '%s'" % jobid)
Example no. 41
    def submit_job(self, app):
        """
        Transition `app`'s status to `Run.State.SUBMITTED` if possible.

        Note that this method still checks that `app`'s requirements
        are compatible with what this resource was instantiated with,
        and that conversely the resource still has enough free
        cores/memory/etc to host a new application.  So, submission to
        a No-Op resource may still fail!
        """
        free_slots = self.free_slots - app.requested_cores
        if free_slots <= 0:
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s already running maximum allowed number of jobs"
                " (%s). Increase 'max_cores' to raise." %
                (self.name, self.max_cores))

        if (app.requested_memory and
                 self.available_memory < app.requested_memory):
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s does not have enough available memory:"
                " %s requested, but only %s available."
                % (self.name,
                   app.requested_memory.to_str('%g%s', unit=Memory.MB),
                   self.available_memory.to_str('%g%s', unit=Memory.MB),)
            )

        log.debug("Faking execution of command '%s' ...",
                  str.join(" ", app.arguments))

        # Update application and current resources
        app.execution.lrms_jobid = id(app)
        self.free_slots = free_slots
        if app.requested_memory:
            self.available_memory -= app.requested_memory
        self.queued += 1
        self.user_queued += 1

        return app
Example no. 42
 def cancel_job(self, app):
     job = app.execution
     try:
         self.transport.connect()
         cmd = self._cancel_command(job.lrms_jobid)
         exit_code, stdout, stderr = self.transport.execute_command(cmd)
         if exit_code != 0:
             # XXX: It is possible that 'qdel' fails because job
             # has been already completed thus the cancel_job
             # behaviour should be tolerant to these errors.
             log.error("Failed executing remote command '%s'; exit status %d", cmd, exit_code)
             log.debug("  remote command returned STDOUT '%s'", stdout)
             log.debug("  remote command returned STDERR '%s'", stderr)
             if exit_code == 127:
                 # command was not executed, time to signal an exception
                 raise gc3libs.exceptions.LRMSError(
                     "Cannot execute remote command '%s'" " -- See DEBUG level log for details" % (cmd,)
                 )
         return job
     except:
         log.critical("Failure checking status")
         raise
Example no. 43
File: slurm.py Project: imcf/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     """
     Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
     """
     state = Run.State.UNKNOWN
     if stdout.strip() == '':
         # If stdout is empty and `squeue -j` exitcode is 0, then
         # the job has recently completed (but we still need to
         # call `sacct` to reap the termination status).
         #
         # If the job has been removed from the controllers'
         # memory, then `squeue -j` exits with code 1.
         state = Run.State.TERMINATING
     else:
         # parse stdout
         job_id, job_state_code, reason = stdout.split('^')
         log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                   job_state_code)
         if job_state_code in ['PENDING', 'CONFIGURING']:
             # XXX: see comments in `count_jobs` for a discussion
             # of whether 'CONFIGURING' should be grouped with
             # 'RUNNING' or not; here it's likely the correct
             # choice to group it with 'PENDING' as the
             # "configuring" phase may last a few minutes during
             # which the job is not yet really running.
             state = Run.State.SUBMITTED
         elif job_state_code in ['RUNNING', 'COMPLETING']:
             state = Run.State.RUNNING
         elif job_state_code in ['SUSPENDED']:
             state = Run.State.STOPPED
         elif job_state_code in [
                 'COMPLETED', 'CANCELLED', 'FAILED', 'NODE_FAIL',
                 'PREEMPTED', 'TIMEOUT'
         ]:
             state = Run.State.TERMINATING
         else:
             state = Run.State.UNKNOWN
     return self._stat_result(state, None)  # no term status info
Example no. 44
    def peek(self, app, remote_filename, local_file, offset=0, size=None):

        job = app.execution

        assert 'lrms_jobid' in job, \
            "Missing attribute `lrms_jobid` on `Job` instance passed to `ArcLrms.peek`."

        controller, j = self._get_job_and_controller(job.lrms_jobid)

        if size is None:
            size = sys.maxint

        # `local_file` could be a file name (string) or a file-like
        # object, as per function docstring; ensure `local_file_name`
        # is the local path
        try:
            local_file_name = local_file.name
        except AttributeError:
            local_file_name = local_file

        source_url = arc.URL(job.lrms_jobid + '/' + remote_filename)
        destination_url = arc.URL(local_file_name)

        # download file
        log.debug("Arc1Lrms.peek(): Downloading remote file '%s' into local file '%s' ..."
                  % (remote_filename, local_file_name))
        if not controller.ARCCopyFile(source_url, destination_url):
            log.warning("Failed downloading '%s' to '%s'"
                        % (source_url.str(), destination_url.str()))
        log.debug("Arc1LRMS.peek(): arc.JobController.ARCCopyFile: completed")
Example no. 45
 def _lsf_state_to_gc3pie_state(stat):
     log.debug("Translating LSF's `bjobs` status '%s' to"
               " gc3libs.Run.State ...", stat)
     try:
         return {
             # LSF 'stat' mapping:
             'PEND': Run.State.SUBMITTED,
             'RUN': Run.State.RUNNING,
             'PSUSP': Run.State.STOPPED,
             'USUSP': Run.State.STOPPED,
             'SSUSP': Run.State.STOPPED,
             # DONE = successful termination
             'DONE': Run.State.TERMINATING,
             # EXIT = job was killed / exit forced
             'EXIT': Run.State.TERMINATING,
             # ZOMBI = job "killed" and unreachable
             'ZOMBI': Run.State.TERMINATING,
             'UNKWN': Run.State.UNKNOWN,
         }[stat]
     except KeyError:
         log.warning(
             "Unknown LSF job status '%s', returning `UNKNOWN`", stat)
         return Run.State.UNKNOWN
Example no. 46
    def execute_command(self, cmdline):
        """
        Scan the given command-line and return a predefined result if
        *any* word in command position matches one of the keys in the
        `expected_answer` argument to the class constructor.

        Note that the parsing of command-line is based on regular
        expressions and is thus only an approximation at ``sh``
        syntax.  It will *certainly* fail on some command-lines, but
        there is no way around this short of writing a complete ``sh``
        parser just for this function.  (And no, Python's module
        `shlex` will not do the job -- been there, done that.)
        """

        log.debug("scanning command-line <<<%s>>>", cmdline)

        for match in self._COMMAND_RE.finditer(cmdline):
            cmd = match.group("cmd")
            if cmd in self.expected_answer:
                return self.expected_answer[cmd]

        # if everything else failed, do run the command-line ...
        return LocalTransport.execute_command(self, cmdline)
Example no. 47
 def _parse_stat_output(self, stdout, stderr):
     """
     Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
     """
     state = Run.State.UNKNOWN
     if stdout.strip() == '':
         # If stdout is empty and `squeue -j` exitcode is 0, then
         # the job has recently completed (but we still need to
         # call `sacct` to reap the termination status).
         #
         # If the job has been removed from the controllers'
         # memory, then `squeue -j` exits with code 1.
         state = Run.State.TERMINATING
     else:
         # parse stdout
         job_id, job_state_code, reason = stdout.split('^')
         log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                   job_state_code)
         if job_state_code in ['PENDING', 'CONFIGURING']:
             # XXX: see comments in `count_jobs` for a discussion
             # of whether 'CONFIGURING' should be grouped with
             # 'RUNNING' or not; here it's likely the correct
             # choice to group it with 'PENDING' as the
             # "configuring" phase may last a few minutes during
             # which the job is not yet really running.
             state = Run.State.SUBMITTED
         elif job_state_code in ['RUNNING', 'COMPLETING']:
             state = Run.State.RUNNING
         elif job_state_code in ['SUSPENDED']:
             state = Run.State.STOPPED
         elif job_state_code in ['COMPLETED', 'CANCELLED', 'FAILED',
                                 'NODE_FAIL', 'PREEMPTED', 'TIMEOUT']:
             state = Run.State.TERMINATING
         else:
             state = Run.State.UNKNOWN
     return self._stat_result(state, None)  # no term status info
Example no. 48
    def _parse_stat_output(stdout):
        # LSF `bjobs -l` uses a LDIF-style continuation lines, wherein
        # a line is truncated at 79 characters and continues upon the
        # next one; continuation lines start with a fixed amount of
        # whitespace.  Join continuation lines, so that we can work on
        # a single block of text.
        lines = [ ]
        for line in stdout.split('\n'):
            if len(line) == 0:
                continue
            if line.startswith(LsfLrms._CONTINUATION_LINE_START):
                lines[-1] += line[len(LsfLrms._CONTINUATION_LINE_START):]
            else:
                lines.append(line)

        # now rebuild stdout by joining the reconstructed lines
        stdout = str.join('\n', lines)

        jobstatus = gc3libs.utils.Struct()
        # XXX: this only works if the current status is the first one
        # reported in STDOUT ...
        match = LsfLrms._status_re.search(stdout)
        if match:
            stat = match.group('state')
            jobstatus.state = LsfLrms._lsf_state_to_gc3pie_state(stat)
            if stat == 'DONE':
                # DONE = success
                jobstatus.exit_status = 0
            elif stat == 'EXIT':
                # EXIT = job exited with exit code != 0
                match = LsfLrms._unsuccessful_exit_re.search(stdout)
                if match:
                    log.debug("LSF says: '%s'", match.group(0))
                    jobstatus.exit_status = int(match.group('exit_status'))
        assert 'state' in jobstatus
        return jobstatus
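The continuation-line joining above can be tried standalone; the 21-space prefix below is an assumption standing in for `LsfLrms._CONTINUATION_LINE_START`, whose real value is not shown in the snippet:

    # Hypothetical standalone sketch of the LDIF-style line re-joining above.
    CONTINUATION_LINE_START = ' ' * 21   # assumed prefix; the real constant may differ

    raw = ("Job <123>, User <alice>, Status <DONE>, Queue <normal>, Comman\n"
           + CONTINUATION_LINE_START + "d <./run.sh>")
    lines = []
    for line in raw.split('\n'):
        if len(line) == 0:
            continue
        if line.startswith(CONTINUATION_LINE_START):
            lines[-1] += line[len(CONTINUATION_LINE_START):]
        else:
            lines.append(line)
    # lines == ["Job <123>, User <alice>, Status <DONE>, Queue <normal>, Command <./run.sh>"]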
Example no. 49
 def _parse_stat_output(self, stdout):
     """
     Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
     """
     jobstatus = dict()
     if stdout.strip() == '':
         # if stdout is empty and `squeue -j` exitcode is 0, then
         # the job has recently completed;
         #
         # if the job has been removed from the controllers'
         # memory, then `squeue -j` exits with code 1
         jobstatus['state'] = Run.State.TERMINATING
     else:
         # parse stdout
         jobid, state, reason = stdout.split('^')
         log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                   state)
         if state in ['PENDING', 'CONFIGURING']:
             # XXX: see above for a discussion of whether 'CONFIGURING'
             # should be grouped with 'RUNNING' or not; here it's
             # likely the correct choice to group it with 'PENDING' as
             # the "configuring" phase may last a few minutes during
             # which the job is not yet really running.
             jobstatus['state'] = Run.State.SUBMITTED
         elif state in ['RUNNING', 'COMPLETING']:
             jobstatus['state'] = Run.State.RUNNING
         elif state in ['SUSPENDED']:
             jobstatus['state'] = Run.State.STOPPED
         elif state in [
                 'COMPLETED', 'CANCELLED', 'FAILED', 'NODE_FAIL',
                 'PREEMPTED', 'TIMEOUT'
         ]:
             jobstatus['state'] = Run.State.TERMINATING
         else:
             jobstatus['state'] = Run.State.UNKNOWN
     return jobstatus
Example no. 50
 def cancel_job(self, app):
     job = app.execution
     try:
         self.transport.connect()
         cmd = self._cancel_command(job.lrms_jobid)
         exit_code, stdout, stderr = self.transport.execute_command(cmd)
         if exit_code != 0:
             # XXX: It is possible that 'qdel' fails because job
             # has been already completed thus the cancel_job
             # behaviour should be tolerant to these errors.
             log.error(
                 "Failed executing remote command '%s'; exit status %d",
                 cmd, exit_code)
             log.debug("  remote command returned STDOUT '%s'", stdout)
             log.debug("  remote command returned STDERR '%s'", stderr)
             if exit_code == 127:
                 # command was not executed, time to signal an exception
                 raise gc3libs.exceptions.LRMSError(
                     "Cannot execute remote command '%s'"
                     " -- See DEBUG level log for details" % (cmd, ))
         return job
     except:
         log.critical('Failure checking status')
         raise
Example no. 51
File: sge.py Project: imcf/gc3pie
    def get_resource_status(self):
        try:
            self.transport.connect()

            _command = ("%s -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    " exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_stdout, stderr))

            _command = ("%s -F -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_F_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    " exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_F_stdout, stderr))

            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            slots = compute_nr_of_slots(qstat_F_stdout)
            self.free_slots = int(slots['global']['available'])
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name,
                self.free_slots,
                self.user_run,
                self.user_queued,
                self.queued,
            )
            return self

        except Exception as ex:
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s", ex.__class__.__name__,
                      str(ex))
            raise
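The helper `compute_nr_of_slots` used above is defined elsewhere in the SGE backend. A minimal, standalone sketch of the kind of computation it performs, assuming a typical `qstat -F` queue-instance line of the form `all.q@node01  BIP  0/2/8  0.50  lx-amd64`, where the third column is `resv/used/tot.`; the column layout, sample data and function name are assumptions, not GC3Pie code:

    import re

    # Queue-instance lines look like:
    #   all.q@node01   BIP   0/2/8   0.50   lx-amd64
    _QUEUE_LINE = re.compile(
        r'^(?P<queue>\S+@\S+)\s+\S+\s+'
        r'(?P<resv>\d+)/(?P<used>\d+)/(?P<total>\d+)')

    def available_slots_sketch(qstat_F_output):
        """Sum free slots (total - used) over all queue instances."""
        free = 0
        for line in qstat_F_output.split('\n'):
            match = _QUEUE_LINE.match(line)
            if match:
                free += int(match.group('total')) - int(match.group('used'))
        return free

    sample = (
        "queuename              qtype resv/used/tot. load_avg arch     states\n"
        "---------------------------------------------------------------------\n"
        "all.q@node01           BIP   0/2/8          0.50     lx-amd64\n"
        "all.q@node02           BIP   0/0/8          0.10     lx-amd64\n")
    assert available_slots_sketch(sample) == 14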
Esempio n. 52
0
    def get_resource_status(self):
        self.updated = False
        try:
            self.transport.connect()

            _command = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'" %
                        self._squeue)
            log.debug("Running `%s`...", _command)
            exitcode, stdout, stderr = self.transport.execute_command(_command)
            if exitcode != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SLURM backend failed executing '%s':"
                    " exit code: %d; stdout: '%s', stderr: '%s'" %
                    (_command, exitcode, stdout, stderr))

            log.debug("Computing updated values for total/available slots ...")
            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(stdout, self._username)
            self.total_run = total_running
            self.free_slots = -1
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " total running: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name,
                self.free_slots,
                self.total_run,
                self.user_run,
                self.user_queued,
                self.queued,
            )
            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__,
                      str(ex),
                      exc_info=True)
            raise
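Likewise, `count_jobs` is defined elsewhere in the SLURM backend. A minimal, standalone sketch of counting jobs from the `'%i^%T^%u^%U^%r^%R'` output requested above (job id, state, user name, user id, reason, node list); the function name and the exact set of states counted as "running" are illustrative assumptions:

    RUNNING_STATES = frozenset(['RUNNING', 'COMPLETING'])

    def count_squeue_jobs(squeue_output, username):
        """Return (total_running, total_queued, own_running, own_queued)."""
        total_running = total_queued = own_running = own_queued = 0
        for line in squeue_output.strip().split('\n'):
            if not line:
                continue
            jobid, state, user, uid, reason, nodelist = line.split('^')
            if state in RUNNING_STATES:
                total_running += 1
                if user == username:
                    own_running += 1
            else:
                total_queued += 1
                if user == username:
                    own_queued += 1
        return (total_running, total_queued, own_running, own_queued)

    sample = ("100^RUNNING^alice^1000^None^node01\n"
              "101^PENDING^bob^1001^Priority^(null)")
    assert count_squeue_jobs(sample, 'alice') == (1, 1, 1, 0)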
Esempio n. 53
0
    def update_job_state(self, app):
        try:
            job = app.execution
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning("Unknown batch job status,"
                                " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error("Failed while running the `qstat`/`bjobs` command."
                          " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug("Retrieving accounting information using command"
                          " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug(
                                "The primary job accounting command"
                                " returned no information; trying"
                                " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            try:
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug("  remote command returned stdout: '%s'", stdout)
                    log.debug("  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state
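The `stat_failed_at` / `accounting_delay` logic above can be isolated into a small helper that records when status queries started failing and only gives up once a grace period has elapsed. A minimal, standalone sketch of that pattern (class and method names are illustrative, not GC3Pie API):

    import time

    class StatFailureTracker(object):
        """Track how long status queries have been failing for a job."""

        def __init__(self, accounting_delay=300):
            self.accounting_delay = accounting_delay  # seconds
            self.first_failure_at = None

        def record_failure(self):
            """Return True if the job should now be declared lost."""
            now = time.time()
            if self.first_failure_at is None:
                # first failure: start the grace period and retry later
                self.first_failure_at = now
                return False
            # give up only after the accounting delay has elapsed
            return (now - self.first_failure_at) > self.accounting_delay

        def record_success(self):
            """Reset the tracker after a successful status query."""
            self.first_failure_at = None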
Esempio n. 54
0
    def get_resource_status(self):
        """
        Get dynamic information out of the LSF subsystem.

        return self

        dynamic information required (at least those):
        total_queued
        free_slots
        user_running
        user_queued
        """

        try:
            self.transport.connect()

            # Run `lshosts` to get the list of available nodes and
            # their number of cores; used to compute self.max_cores.
            # lshosts output format:
            # ($nodeid,$OStype,$model,$cpuf,$ncpus,$maxmem,$maxswp)
            _command = ('%s -w' % self._lshosts)
            exit_code, stdout, stderr = self.transport.execute_command(
                _command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                lhosts_output = stdout.strip().split('\n')
                # Remove Header
                lhosts_output.pop(0)
            else:
                lhosts_output = []

            # compute self.max_cores
            self.max_cores = 0
            for line in lhosts_output:
                # HOST_NAME      type    model  cpuf ncpus maxmem maxswp server RESOURCES  # noqa
                (hostname, h_type, h_model, h_cpuf, h_ncpus) = \
                    line.strip().split()[0:5]
                try:
                    self.max_cores += int(h_ncpus)
                except ValueError:
                    # h_ncpus == '-'
                    pass

            # Run `bjobs -u all -w` to get information about all users'
            # jobs; used to compute `used_cores`, `self.queued`,
            # `self.user_run` and `self.user_queued`.
            #
            # bjobs output format:
            # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
            _command = ('%s -u all -w' % self._bjobs)
            log.debug("Runing `%s`... ", _command)
            exit_code, stdout, stderr = \
                self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                bjobs_output = stdout.strip().split('\n')
                # Remove Header
                bjobs_output.pop(0)
            else:
                bjobs_output = []

            # user running/queued
            used_cores = 0
            self.queued = 0
            self.user_queued = 0
            self.user_run = 0

            queued_statuses = [
                'PEND', 'PSUSP', 'USUSP', 'SSUSP', 'WAIT', 'ZOMBI'
            ]
            for line in bjobs_output:
                # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
                (jobid, user, stat, queue, from_h, exec_h) = \
                    line.strip().split()[0:6]
                # to compute the number of cores allocated per each job
                # we use the output format of EXEC_HOST field
                # e.g.: 1*cpt178:2*cpt151
                for node in exec_h.split(':'):
                    try:
                        # multi core
                        (cores, n_name) = node.split('*')
                    except ValueError:
                        # single core
                        cores = 1
                    try:
                        cores = int(cores)
                    except ValueError:
                        # cores == '-': no slot count for this node
                        continue
                    used_cores += cores

                if stat in queued_statuses:
                    self.queued += 1
                if user == self._username:
                    if stat in queued_statuses:
                        self.user_queued += 1
                    else:
                        self.user_run += 1

            self.free_slots = self.max_cores - used_cores

            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s", ex.__class__.__name__,
                      str(ex))
            raise
Esempio n. 55
0
    def submit_job(self, app):
        """
        Run an `Application` instance as a local process.

        :see: `LRMS.submit_job`
        """
        # Update current resource usage to check how many jobs are
        # currently running there.  Please note that, for consistency
        # with other backends, this updated information is not kept!
        try:
            self.transport.connect()
        except gc3libs.exceptions.TransportError as ex:
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to access shellcmd resource at %s: %s" %
                (self.frontend, str(ex)))

        job_infos = self._get_persisted_resource_state()
        free_slots = self.max_cores - self._compute_used_cores(job_infos)
        available_memory = self.total_memory - \
            self._compute_used_memory(job_infos)

        if self.free_slots == 0 or free_slots == 0:
            # XXX: We shouldn't check for self.free_slots !
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s already running maximum allowed number of jobs"
                " (%s). Increase 'max_cores' to raise." %
                (self.name, self.max_cores))

        if app.requested_memory and \
                (available_memory < app.requested_memory or
                 self.available_memory < app.requested_memory):
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s does not have enough available memory:"
                " %s requested, but only %s available." % (
                    self.name,
                    app.requested_memory.to_str('%g%s', unit=Memory.MB),
                    available_memory.to_str('%g%s', unit=Memory.MB),
                ))

        log.debug("Executing local command '%s' ...",
                  str.join(" ", app.arguments))

        # Check if spooldir is a valid directory
        if not self.spooldir:
            ex, stdout, stderr = self.transport.execute_command(
                'cd "$TMPDIR" && pwd')
            if ex != 0 or not stdout.strip().startswith('/'):
                log.debug(
                    "Unable to recover a valid absolute path for spooldir."
                    " Using `/var/tmp`.")
                self.spooldir = '/var/tmp'
            else:
                self.spooldir = stdout.strip()

        # determine execution directory
        exit_code, stdout, stderr = self.transport.execute_command(
            "mktemp -d %s " % posixpath.join(self.spooldir, 'gc3libs.XXXXXX'))
        if exit_code != 0:
            log.error("Error creating temporary directory on host %s: %s",
                      self.frontend, stderr)
            log.debug('Freeing resources used by failed application')
            self.free(app)
            raise gc3libs.exceptions.LRMSSubmitError(
                "Error creating temporary directory on host %s: %s",
                self.frontend, stderr)

        execdir = stdout.strip()
        app.execution.lrms_execdir = execdir

        # Copy input files to remote dir
        for local_path, remote_path in app.inputs.items():
            if local_path.scheme != 'file':
                continue
            remote_path = posixpath.join(execdir, remote_path)
            remote_parent = os.path.dirname(remote_path)
            try:
                if (remote_parent not in ['', '.']
                        and not self.transport.exists(remote_parent)):
                    log.debug("Making remote directory '%s'", remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'", local_path.path,
                          remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:
                log.critical(
                    "Copying input file '%s' to remote host '%s' failed",
                    local_path.path, self.frontend)
                log.debug('Cleaning up failed application')
                self.free(app)
                raise

        # try to ensure that a local executable really has
        # execute permissions, but ignore failures (might be a
        # link to a file we do not own)
        if app.arguments[0].startswith('./'):
            try:
                self.transport.chmod(
                    posixpath.join(execdir, app.arguments[0][2:]), 0o755)
                # os.chmod(app.arguments[0], 0755)
            except:
                log.error("Failed setting execution flag on remote file '%s'",
                          posixpath.join(execdir, app.arguments[0]))

        # set up redirection
        redirection_arguments = ''
        if app.stdin is not None:
            # stdin = open(app.stdin, 'r')
            redirection_arguments += " <%s" % app.stdin

        if app.stdout is not None:
            redirection_arguments += " >%s" % app.stdout
            stdout_dir = os.path.dirname(app.stdout)
            if stdout_dir:
                self.transport.makedirs(posixpath.join(execdir, stdout_dir))

        if app.join:
            redirection_arguments += " 2>&1"
        else:
            if app.stderr is not None:
                redirection_arguments += " 2>%s" % app.stderr
                stderr_dir = os.path.dirname(app.stderr)
                if stderr_dir:
                    self.transport.makedirs(posixpath.join(
                        execdir, stderr_dir))

        # set up environment
        env_commands = []
        for k, v in app.environment.iteritems():
            env_commands.append("export {k}={v};".format(k=sh_quote_safe(k),
                                                         v=sh_quote_unsafe(v)))

        # Create the directory in which pid, output and wrapper script
        # files will be stored
        wrapper_dir = posixpath.join(execdir, ShellcmdLrms.WRAPPER_DIR)

        if not self.transport.isdir(wrapper_dir):
            try:
                self.transport.makedirs(wrapper_dir)
            except:
                log.error("Failed creating remote folder '%s'" % wrapper_dir)
                self.free(app)
                raise

        # Set up scripts to download/upload the swift/http files
        downloadfiles = []
        uploadfiles = []
        wrapper_downloader_filename = posixpath.join(
            wrapper_dir, ShellcmdLrms.WRAPPER_DOWNLOADER)

        for url, outfile in app.inputs.items():
            if url.scheme in [
                    'swift', 'swifts', 'swt', 'swts', 'http', 'https'
            ]:
                downloadfiles.append(
                    "python '%s' download '%s' '%s'" %
                    (wrapper_downloader_filename, str(url), outfile))

        for infile, url in app.outputs.items():
            if url.scheme in ['swift', 'swt', 'swifts', 'swts']:
                uploadfiles.append(
                    "python '%s' upload '%s' '%s'" %
                    (wrapper_downloader_filename, str(url), infile))
        if downloadfiles or uploadfiles:
            # Also copy the downloader.
            with open(
                    resource_filename(Requirement.parse("gc3pie"),
                                      "gc3libs/etc/downloader.py")) as fd:
                wrapper_downloader = self.transport.open(
                    wrapper_downloader_filename, 'w')
                wrapper_downloader.write(fd.read())
                wrapper_downloader.close()

        # Build the paths of the wrapper pidfile, output file and script
        pidfilename = posixpath.join(wrapper_dir, ShellcmdLrms.WRAPPER_PID)
        wrapper_output_filename = posixpath.join(
            wrapper_dir, ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        wrapper_script_fname = posixpath.join(wrapper_dir,
                                              ShellcmdLrms.WRAPPER_SCRIPT)

        try:
            # Create the wrapper script
            wrapper_script = self.transport.open(wrapper_script_fname, 'w')
            commands = (r"""#!/bin/sh
                echo $$ >{pidfilename}
                cd {execdir}
                exec {redirections}
                {environment}
                {downloadfiles}
                '{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
                rc=$?
                {uploadfiles}
                rc2=$?
                if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi
                """.format(
                pidfilename=pidfilename,
                execdir=execdir,
                time_cmd=self.time_cmd,
                wrapper_out=wrapper_output_filename,
                fmt=ShellcmdLrms.TIMEFMT,
                redirections=redirection_arguments,
                environment=str.join('\n', env_commands),
                downloadfiles=str.join('\n', downloadfiles),
                uploadfiles=str.join('\n', uploadfiles),
                command=(str.join(' ', (sh_quote_unsafe(arg)
                                        for arg in app.arguments))),
            ))
            wrapper_script.write(commands)
            wrapper_script.close()
            #log.info("Wrapper script: <<<%s>>>", commands)
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        try:
            self.transport.chmod(wrapper_script_fname, 0o755)

            # Execute the script in background
            self.transport.execute_command(wrapper_script_fname, detach=True)
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        # Just after the script has been started the pidfile should be
        # filled in with the correct pid.
        #
        # However, the script may not have been able to write the
        # pidfile yet, so we have to wait a little bit for it...
        pidfile = None
        for retry in gc3libs.utils.ExponentialBackoff():
            try:
                pidfile = self.transport.open(pidfilename, 'r')
                break
            except gc3libs.exceptions.TransportError as ex:
                if '[Errno 2]' in str(ex):  # no such file or directory
                    time.sleep(retry)
                    continue
                else:
                    raise
        if pidfile is None:
            # XXX: probably self.free(app) should go here as well
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to get PID file of submitted process from"
                " execution directory `%s`: %s" % (execdir, pidfilename))
        pid = pidfile.read().strip()
        try:
            pid = int(pid)
        except ValueError:
            # XXX: probably self.free(app) should go here as well
            pidfile.close()
            raise gc3libs.exceptions.LRMSSubmitError(
                "Invalid pid `%s` in pidfile %s." % (pid, pidfilename))
        pidfile.close()

        # Update application and current resources
        app.execution.lrms_jobid = pid
        # We don't need to update free_slots since its value is
        # checked at runtime.
        if app.requested_memory:
            self.available_memory -= app.requested_memory
        self.job_infos[pid] = {
            'requested_cores': app.requested_cores,
            'requested_memory': app.requested_memory,
            'execution_dir': execdir,
            'terminated': False,
        }
        self._update_job_resource_file(pid, self.job_infos[pid])
        return app
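The loop that waits for the pidfile spaces out its retries with `gc3libs.utils.ExponentialBackoff`. A minimal, simplified stand-in for such an iterator (not the actual GC3Pie implementation), shown only to illustrate the retry pattern:

    class ExponentialBackoffSketch(object):
        """Yield a bounded number of increasing wait times (in seconds)."""

        def __init__(self, base=0.05, factor=2.0, max_retries=8):
            self.base = base
            self.factor = factor
            self.max_retries = max_retries

        def __iter__(self):
            delay = self.base
            for _ in range(self.max_retries):
                yield delay
                delay *= self.factor

    # usage sketch (requires `import time`): retry opening a file that
    # may not have been written yet, sleeping longer after each attempt
    #
    #     for delay in ExponentialBackoffSketch():
    #         try:
    #             pidfile = open('/tmp/example.pid')   # hypothetical path
    #             break
    #         except IOError:
    #             time.sleep(delay)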
Esempio n. 56
0
    def submit_job(self, app):
        """This method will create a remote directory to store job's
        sandbox, and will copy the sandbox in there.
        """
        job = app.execution

        # Create the remote directory.
        self.transport.connect()
        cmd = ("mkdir -p {0};"
               " mktemp -d {0}/batch_job.XXXXXXXXXX".format(self.spooldir))
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code != 0:
            raise gc3libs.exceptions.SpoolDirError(
                "Cannot create temporary job working directory"
                " on resource '%s'; command '%s' exited"
                " with code: %d and stderr: '%s'." %
                (self.name, cmd, exit_code, stderr))
        ssh_remote_folder = stdout.split('\n')[0]

        # Copy the input file(s) to remote directory.
        for local_path, remote_path in list(app.inputs.items()):
            remote_path = os.path.join(ssh_remote_folder, remote_path)
            remote_parent = os.path.dirname(remote_path)
            try:
                if remote_parent not in ['', '.']:
                    log.debug("Making remote directory '%s'", remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'", local_path.path,
                          remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:
                log.critical(
                    "Copying input file '%s' to remote cluster '%s' failed",
                    local_path.path, self.frontend)
                raise

        if app.arguments[0].startswith('./'):
            gc3libs.log.debug("Making remote path '%s' executable.",
                              app.arguments[0])
            self.transport.chmod(
                os.path.join(ssh_remote_folder, app.arguments[0]), 0o755)

        # if STDOUT/STDERR should be saved in a directory, ensure it
        # exists (see Issue 495 for details)
        for dest in (app.stdout, app.stderr):
            if dest:
                destdir = os.path.dirname(dest)
                if destdir:
                    self.transport.makedirs(
                        posixpath.join(ssh_remote_folder, destdir))

        try:
            sub_cmd, aux_script = self._submit_command(app)
            if aux_script != '':
                # create temporary script name
                script_filename = ('./script.%s.sh' % uuid.uuid4())
                # save script to a temporary file and submit that one instead
                local_script_file = tempfile.NamedTemporaryFile(mode='wt')
                local_script_file.write('#!/bin/sh\n')
                # Add preamble file
                prologue = self.get_prologue_script(app)
                if prologue:
                    local_script_file.write(prologue)

                local_script_file.write(aux_script)

                # Add epilogue files
                epilogue = self.get_epilogue_script(app)
                if epilogue:
                    local_script_file.write(epilogue)

                local_script_file.flush()
                # upload script to remote location
                self.transport.put(
                    local_script_file.name,
                    os.path.join(ssh_remote_folder, script_filename))
                # set execution mode on remote script
                self.transport.chmod(
                    os.path.join(ssh_remote_folder, script_filename), 0o755)
                # cleanup
                local_script_file.close()
                if os.path.exists(local_script_file.name):
                    os.unlink(local_script_file.name)
            else:
                # we still need a script name even if there is no
                # script to submit
                script_filename = ''

            # Submit it
            exit_code, stdout, stderr = self.transport.execute_command(
                "/bin/sh -c %s" %
                sh_quote_safe('cd %s && %s %s' %
                              (ssh_remote_folder, sub_cmd, script_filename)))

            if exit_code != 0:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command 'cd %s && %s %s' on resource"
                    " '%s'; exit code: %d, stderr: '%s'." %
                    (ssh_remote_folder, sub_cmd, script_filename, self.name,
                     exit_code, stderr))

            jobid = self._parse_submit_output(stdout)
            log.debug('Job submitted with jobid: %s', jobid)

            job.execution_target = self.frontend

            job.lrms_jobid = jobid
            job.lrms_jobname = jobid
            try:
                if app.jobname:
                    job.lrms_jobname = app.jobname
            except:
                pass

            if 'stdout' in app:
                job.stdout_filename = app.stdout
            else:
                job.stdout_filename = '%s.o%s' % (job.lrms_jobname, jobid)
            if app.join:
                job.stderr_filename = job.stdout_filename
            else:
                if 'stderr' in app:
                    job.stderr_filename = app.stderr
                else:
                    job.stderr_filename = '%s.e%s' % (job.lrms_jobname, jobid)
            job.history.append('Submitted to %s @ %s, got jobid %s' %
                               (self._batchsys_name, self.name, jobid))
            job.history.append(
                "Submission command output:\n"
                "  === stdout ===\n%s"
                "  === stderr ===\n%s"
                "  === end ===\n" % (stdout, stderr), 'pbs', 'qsub')
            job.ssh_remote_folder = ssh_remote_folder

            return job

        except:
            log.critical(
                "Failure submitting job to resource '%s' - "
                "see log file for errors", self.name)
            raise
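The `_parse_submit_output` hook used above extracts the job id from the submission command's stdout. A minimal sketch covering two common cases, assuming PBS-style output such as `12345.server.example.org` and SLURM-style output such as `Submitted batch job 12345`; the patterns and the function name are assumptions about typical scheduler output, not GC3Pie code:

    import re

    def parse_submit_output_sketch(stdout):
        """Extract a job id from typical qsub/sbatch output."""
        stdout = stdout.strip()
        # SLURM: "Submitted batch job 12345"
        match = re.search(r'Submitted batch job (\d+)', stdout)
        if match:
            return match.group(1)
        # PBS/Torque: first whitespace-delimited token, e.g. "12345.server"
        if stdout:
            return stdout.split()[0]
        raise ValueError("No job id found in submission output")

    assert parse_submit_output_sketch('Submitted batch job 4242\n') == '4242'
    assert parse_submit_output_sketch('987.server.example.org\n') == '987.server.example.org'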
Esempio n. 57
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug(
                "Process with PID %s found."
                " Checking its running status ...", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
                # if `requested_walltime` is set, enforce it as a
                # running time limit
                if app.requested_walltime is not None:
                    exit_code2, stdout2, stderr2 = self.transport.execute_command(
                        "ps -p %d -o etime=" % pid)
                    if exit_code2 != 0:
                        # job terminated already, do cleanup and return
                        self._cleanup_terminating_task(app, pid)
                        return app.execution.state
                    cancel = False
                    elapsed = _parse_time_duration(stdout2.strip())
                    if elapsed > self.max_walltime:
                        log.warning(
                            "Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.",
                            app, elapsed.to_timedelta(), self.max_walltime,
                            self.name)
                        cancel = True
                    if elapsed > app.requested_walltime:
                        log.warning(
                            "Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.",
                            app, elapsed.to_timedelta(),
                            app.requested_walltime)
                        cancel = True
                    if cancel:
                        self.cancel_job(app)
                        # set signal to SIGTERM in termination status
                        self._cleanup_terminating_task(app,
                                                       pid,
                                                       termstatus=(15, -1))
                        return app.execution.state
        else:
            log.debug(
                "Process with PID %d not found,"
                " assuming task %s has finished running.", pid, app)
            self._cleanup_terminating_task(app, pid)

        self._get_persisted_resource_state()
        return app.execution.state
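The elapsed-time check above relies on `_parse_time_duration` to interpret the `ps -o etime=` output, which has the form `[[dd-]hh:]mm:ss`. A minimal, standalone sketch that returns the duration in seconds (the GC3Pie helper evidently returns a richer duration object, since the code above calls `.to_timedelta()` on it); the function name is illustrative:

    def parse_etime_seconds(etime):
        """Convert a `ps -o etime=` value such as '1-02:03:04' to seconds."""
        etime = etime.strip()
        days = 0
        if '-' in etime:
            day_part, etime = etime.split('-', 1)
            days = int(day_part)
        parts = [int(p) for p in etime.split(':')]
        while len(parts) < 3:        # pad missing hours (and minutes)
            parts.insert(0, 0)
        hours, minutes, seconds = parts
        return ((days * 24 + hours) * 60 + minutes) * 60 + seconds

    assert parse_etime_seconds('05:42') == 342
    assert parse_etime_seconds('1-02:03:04') == 93784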
Esempio n. 58
0
    def update_job_state(self, app):
        job = app.execution
        try:
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: {ex}".format(ex=ex))

        self.transport.connect()

        cmd = self._stat_command(job)
        try:
            state, termstatus = self.__run_command_and_parse_output(
                cmd, self._parse_stat_output, 'status')
            if state != Run.State.TERMINATING:
                # no need to go further and parse acct info; also,
                # exit status is not relevant in this case
                job.state = state
                log.debug("Task %s state set to %s", app, state)
                return state
        except gc3libs.exceptions.AuxiliaryCommandError:
            # use the special state value ``None`` to signal that
            # the "status" command failed; we might need this
            # after the "acct" command has run
            state, termstatus = None, None
        assert state is None or state == Run.State.TERMINATING
        log.debug(
            "Job status command gave state `%s`"
            " and termination status `%s` for task %s", state, termstatus, app)

        # In some batch systems, jobs disappear from qstat
        # output as soon as they are finished. In these cases,
        # we have to check some *accounting* command to check
        # the exit status.
        acctinfo = {}
        for cmd_fn, parse_fn in [
                # this is the regular sacct/qacct/bjobs command
            (self._acct_command, self._parse_acct_output),
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro where `tracejob`
                # does not work but if `job_history_enable=True`,
                # then we can actually access information about
                # finished jobs with `qstat -x -f`.
            (self._secondary_acct_command, self._parse_secondary_acct_output),
        ]:
            cmd = cmd_fn(job)
            # `._secondary_acct_command` returns ``None`` if no
            # "secondary" accounting method is defined -- skip to the
            # next iteration, if any
            if cmd is None:
                continue
            try:
                acctinfo = self.__run_command_and_parse_output(
                    cmd, parse_fn, 'accounting')
                # use info from the first acct command that succeeds
                if acctinfo:
                    log.debug("Gathered accounting info %r for task %s",
                              acctinfo, app)
                    break
            except gc3libs.exceptions.AuxiliaryCommandError:
                log.debug("Accounting command `%s` failed.", cmd)
                # try next one
                pass
            except gc3libs.exceptions.UnexpectedJobState as ex:
                log.debug(
                    "Unexpected output from accounting command `%s`: %s.", cmd,
                    ex)
                # try next one
                pass

        # if no termination status is known and the acct
        # command provided one, use it
        if 'termstatus' in acctinfo:
            # if we have a termination status, then the job has terminated
            state = Run.State.TERMINATING
            if termstatus is None:
                termstatus = acctinfo['termstatus']
            else:
                # this should not happen!  but one never knows how new
                # versions of the software may break old habits and
                # parsing rules, so better fail loudly here so we get
                # a bug report and a chance to fix...
                assert termstatus == acctinfo['termstatus'], (
                    "Status and accounting commands disagree"
                    " on job termination status: {1} vs {2}".format(
                        termstatus, acctinfo['termstatus']))

        if termstatus is None:
            # No *stat command and no *acct command returned correctly.
            try:
                job.stat_failed_at
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()
                return job.state

            if (time.time() - job.stat_failed_at) <= self.accounting_delay:
                # do nothing, let's try later...
                return job.state
            else:
                # accounting info should be there, if it's not
                # then job is definitely lost
                job.state = Run.State.UNKNOWN
                raise gc3libs.exceptions.LRMSError(
                    "Could not retrieve status information for task {app}".
                    format(app=app))

        # if we got to this point the job is in TERMINATING state
        # and we know at least the termination status
        assert state == Run.State.TERMINATING

        job.state = state
        job.returncode = termstatus
        job.update(acctinfo)

        return state
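Throughout this last example, the status and accounting commands share one "run, check exit code, parse" pattern, wrapped by `__run_command_and_parse_output`. A minimal, standalone sketch of that pattern over a generic `run_command` callable; the callable, the exception class and the names below are illustrative stand-ins, not GC3Pie API:

    class AuxiliaryCommandFailed(Exception):
        """Raised when a status/accounting command cannot be run or parsed."""

    def run_and_parse(run_command, cmd, parse_fn, what):
        """Run `cmd` via `run_command`, check its exit code, parse stdout."""
        exit_code, stdout, stderr = run_command(cmd)
        if exit_code != 0:
            raise AuxiliaryCommandFailed(
                "%s command '%s' failed with exit code %d: %s"
                % (what, cmd, exit_code, stderr))
        return parse_fn(stdout)

    # usage sketch with a fake runner standing in for the SSH transport
    fake_runner = lambda cmd: (0, '12345^RUNNING^None', '')
    state = run_and_parse(fake_runner, 'squeue ...',
                          lambda out: out.split('^')[1], 'status')
    assert state == 'RUNNING'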