Example #1
0
 def _cleanup_terminating_task(self, app, pid, termstatus=None):
     """
     Mark task `app` as TERMINATING and release the resources
     accounted to the process with the given `pid`.

     If `termstatus` is given, it is stored as the task's return code;
     otherwise the return code recorded in the wrapper output file is
     used.  Resource-usage data parsed from the wrapper output file is
     merged into `app.execution` in either case.

     :raises gc3libs.exceptions.InvalidValue:
       if the wrapper output file cannot be opened or parsed.
     """
     app.execution.state = Run.State.TERMINATING
     if termstatus is not None:
         app.execution.returncode = termstatus
     if pid in self.job_infos:
         # flag the job as done and return its memory to the pool
         self.job_infos[pid]['terminated'] = True
         if app.requested_memory is not None:
             assert (app.requested_memory == self.job_infos[pid]
                     ['requested_memory'])
             self.available_memory += app.requested_memory
     wrapper_filename = posixpath.join(app.execution.lrms_execdir,
                                       ShellcmdLrms.WRAPPER_DIR,
                                       ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
     try:
         log.debug(
             "Reading resource utilization from wrapper file `%s` for task %s ...",
             wrapper_filename, app)
         with self.transport.open(wrapper_filename, 'r') as wrapper_file:
             outcome = self._parse_wrapper_output(wrapper_file)
             app.execution.update(outcome)
             if termstatus is None:
                 # caller supplied no termination status, so take the
                 # one recorded in the wrapper output
                 app.execution.returncode = outcome.returncode
     except Exception as err:
         msg = (
             "Could not open wrapper file `{0}` for task `{1}`: {2}".format(
                 wrapper_filename, app, err))
         log.warning(
             "%s -- Termination status and resource utilization fields will not be set.",
             msg)
         raise gc3libs.exceptions.InvalidValue(msg)
     finally:
         # remove the per-job resource file whether or not the wrapper
         # output could be read
         self._delete_job_resource_file(pid)
Example #2
0
    def free(self, app):
        """
        Delete the temporary directory where a child process has run.

        The directory is removed recursively with all its content.  On
        success, the `lrms_execdir` attribute in `app.execution` is
        reset to `None`, which makes any further call to this method on
        the same application a no-op.
        """
        try:
            execdir = app.execution.lrms_execdir
            if execdir is not None:
                self.transport.connect()
                self.transport.remove_tree(execdir)
                app.execution.lrms_execdir = None
        except Exception as err:
            log.warning("Could not remove directory '%s': %s: %s",
                        app.execution.lrms_execdir, err.__class__.__name__, err)

        try:
            self._delete_job_resource_file(app.execution.lrms_jobid)
        except AttributeError:
            # no `lrms_jobid` yet: the submit step most likely failed
            # early, so there is no resource file to delete -- ignore
            pass
Example #3
0
 def _cleanup_terminating_task(self, app, pid, termstatus=None):
     """
     Move task `app` to state TERMINATING and release the resources
     accounted to process `pid`.

     If `termstatus` is given, it becomes the task's return code;
     otherwise the return code is taken from the wrapper output file,
     whose parsed contents are merged into `app.execution` as well.

     :raises gc3libs.exceptions.InvalidValue:
       if the wrapper output file cannot be opened or parsed.
     """
     app.execution.state = Run.State.TERMINATING
     if termstatus is not None:
         app.execution.returncode = termstatus
     if pid in self.job_infos:
         # mark job as done and give its reserved memory back
         self.job_infos[pid]['terminated'] = True
         if app.requested_memory is not None:
             assert (app.requested_memory
                     == self.job_infos[pid]['requested_memory'])
             self.available_memory += app.requested_memory
     wrapper_filename = posixpath.join(
         app.execution.lrms_execdir,
         ShellcmdLrms.WRAPPER_DIR,
         ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
     try:
         log.debug(
             "Reading resource utilization from wrapper file `%s` for task %s ...",
             wrapper_filename, app)
         with self.transport.open(wrapper_filename, 'r') as wrapper_file:
             outcome = self._parse_wrapper_output(wrapper_file)
             app.execution.update(outcome)
             if termstatus is None:
                 # no termination status supplied by caller: use the
                 # one recorded in the wrapper output
                 app.execution.returncode = outcome.returncode
     except Exception as err:
         msg = ("Could not open wrapper file `{0}` for task `{1}`: {2}"
                .format(wrapper_filename, app, err))
         log.warning("%s -- Termination status and resource utilization fields will not be set.", msg)
         raise gc3libs.exceptions.InvalidValue(msg)
     finally:
         # always remove the per-job resource file
         self._delete_job_resource_file(pid)
Example #4
0
    def free(self, app):
        """
        Remove the temporary working directory of task `app`.

        The directory is deleted recursively, together with all of its
        contents.  If deletion succeeds, `app.execution.lrms_execdir`
        is set to `None`, so repeated invocations on the same
        application do nothing.
        """
        execution = app.execution
        try:
            if execution.lrms_execdir is not None:
                self.transport.connect()
                self.transport.remove_tree(execution.lrms_execdir)
                execution.lrms_execdir = None
        except Exception as ex:
            log.warning("Could not remove directory '%s': %s: %s",
                        execution.lrms_execdir, ex.__class__.__name__, ex)

        try:
            self._delete_job_resource_file(execution.lrms_jobid)
        except AttributeError:
            # `lrms_jobid` was never assigned (the submit step probably
            # failed before it got that far); nothing to clean up
            pass
Example #5
0
    def free(self, app):
        """
        Recursively delete the remote folder where task `app` ran.

        Failures are reported as warnings in the log and otherwise
        ignored.
        """
        execution = app.execution
        try:
            self.transport.connect()
            self.transport.remove_tree(execution.ssh_remote_folder)
        except Exception as ex:
            log.warning("Failed removing remote folder '%s': %s: %s",
                        execution.ssh_remote_folder, ex.__class__, ex)
Example #6
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.

        As a side effect, a task found RUNNING longer than either the
        resource's `max_walltime` or its own `requested_walltime` is
        cancelled, and a task whose process has disappeared is moved
        to TERMINATING.

        :return: the (possibly updated) ``app.execution.state``
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        # look the PID up in the process table; `grep` exits non-zero
        # if no line matches, i.e. the process is gone
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug("Process with PID %s found."
                      " Checking its running status ...", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
                # if `requested_walltime` is set, enforce it as a
                # running time limit
                if app.requested_walltime is not None:
                    # `ps -o etimes=` prints the elapsed run time in
                    # seconds, with no header line
                    exit_code2, stdout2, stderr2 = self.transport.execute_command(
                        "ps -p %d -o etimes=" % pid)
                    if exit_code2 != 0:
                        # job terminated already, do cleanup and return
                        self._cleanup_terminating_task(app, pid)
                        return app.execution.state
                    cancel = False
                    elapsed = Duration(stdout2.strip() + 'seconds')
                    if elapsed > self.max_walltime:
                        log.warning("Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.",
                                    app, elapsed.to_timedelta(), self.max_walltime, self.name)
                        cancel = True
                    if elapsed > app.requested_walltime:
                        log.warning("Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.",
                                    app, elapsed.to_timedelta(), app.requested_walltime)
                        cancel = True
                    if cancel:
                        self.cancel_job(app)
                        # set signal to SIGTERM in termination status
                        self._cleanup_terminating_task(app, pid, termstatus=(15, -1))
                        return app.execution.state
        else:
            log.debug(
                "Process with PID %d not found,"
                " assuming task %s has finished running.",
                pid, app)
            self._cleanup_terminating_task(app, pid)

        self._get_persisted_resource_state()
        return app.execution.state
Example #7
0
    def free(self, app):
        """
        Remove the remote folder where task `app` was run.

        Failures are logged at warning level and otherwise ignored.
        """
        job = app.execution
        try:
            self.transport.connect()
            self.transport.remove_tree(job.ssh_remote_folder)
        except Exception as err:
            # catch `Exception` rather than using a bare `except:` so
            # that KeyboardInterrupt/SystemExit still propagate (and to
            # match the error-handling style of the sibling backends)
            log.warning("Failed removing remote folder '%s': %s: %s",
                        job.ssh_remote_folder, err.__class__, err)
        return
Example #8
0
    def free(self, app):
        """
        Delete the remote execution folder of task `app`.

        Errors are reported in the log at warning level but never
        propagated to the caller.
        """
        job = app.execution
        try:
            self.transport.connect()
            self.transport.remove_tree(job.ssh_remote_folder)
        except Exception as err:
            # narrow the bare `except:` to `Exception` so that
            # KeyboardInterrupt/SystemExit are not swallowed; `err`
            # replaces the `sys.exc_info()` pair in the log message
            log.warning("Failed removing remote folder '%s': %s: %s",
                        job.ssh_remote_folder,
                        err.__class__,
                        err)
        return
Example #9
0
File: sge.py Project: uzh/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     """
     Translate the status field of SGE's `qstat` output into a
     `_stat_result` carrying the matching `Run.State`.
     """
     code = stdout.split()[4]
     log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State", code)
     if code.startswith("h") or code in ("s", "S", "T"):
         state = Run.State.STOPPED
     elif "qw" in code:
         state = Run.State.SUBMITTED
     elif any(flag in code for flag in ("r", "R", "t")):
         state = Run.State.RUNNING
     elif code == "E":
         # `E` flags an error condition
         state = Run.State.TERMINATING
     else:
         log.warning("unknown SGE job status '%s', returning `UNKNOWN`", code)
         state = Run.State.UNKNOWN
     # exit status is only available from `qacct` output, so report
     # ``None`` here
     return self._stat_result(state, None)
Example #10
0
 def _parse_acct_output(self, stdout):
     # Some LSF installations have a very slow `bacct`, so `bjobs` is
     # preferred and `bacct` is only a fallback.  Since the user may
     # configure either command, dispatch on the configured command
     # name so the matching output parser is used.
     if self._bacct.startswith('bjobs'):
         return self.__parse_acct_output_w_bjobs(stdout)
     if not self._bacct.startswith('bacct'):
         log.warning(
             "Unknown acct command `%s`. Assuming its output is compatible"
             " with `bacct`" % self._bacct)
     return self.__parse_acct_output_w_bacct(stdout)
Example #11
0
 def _parse_acct_output(self, stdout):
     # Work around very slow `bacct` on some LSF installations: GC3Pie
     # prefers `bjobs` and falls back to `bacct`.  Because the user can
     # configure either command, pick the parser that understands the
     # configured command's output.
     acct_cmd = self._bacct
     if acct_cmd.startswith('bacct'):
         parse = self.__parse_acct_output_w_bacct
     elif acct_cmd.startswith('bjobs'):
         parse = self.__parse_acct_output_w_bjobs
     else:
         log.warning(
             "Unknown acct command `%s`. Assuming its output is compatible"
             " with `bacct`" % acct_cmd)
         parse = self.__parse_acct_output_w_bacct
     return parse(stdout)
Example #12
0
    def _parse_stat_output(self, stdout):
        """
        Translate the status field of SGE's `qstat` output into a
        dictionary with a single ``state`` entry holding the matching
        `Run.State` value.
        """
        code = stdout.split()[4]
        log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
                  code)

        if code in ('s', 'S', 'T') or code.startswith('h'):
            state = Run.State.STOPPED
        elif 'qw' in code:
            state = Run.State.SUBMITTED
        elif any(c in code for c in ('r', 'R', 't')):
            state = Run.State.RUNNING
        elif code == 'E':
            # `E` flags an error condition
            state = Run.State.TERMINATING
        else:
            log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                        code)
            state = Run.State.UNKNOWN
        return {'state': state}
Example #13
0
    def _parse_stat_output(self, stdout):
        """
        Parse `qstat` standard output into a job-status dictionary.

        The returned dictionary carries one key, ``state``, mapping
        the SGE status letters onto a `gc3libs.Run.State` value.
        """
        ge_code = stdout.split()[4]
        log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
                  ge_code)

        jobstatus = dict()
        if ge_code.startswith('h') or ge_code in ('s', 'S', 'T'):
            jobstatus['state'] = Run.State.STOPPED
        elif 'qw' in ge_code:
            jobstatus['state'] = Run.State.SUBMITTED
        elif set(ge_code) & set('rRt'):
            jobstatus['state'] = Run.State.RUNNING
        elif ge_code == 'E':
            # `E` flags an error condition
            jobstatus['state'] = Run.State.TERMINATING
        else:
            log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                        ge_code)
            jobstatus['state'] = Run.State.UNKNOWN
        return jobstatus
Example #14
0
    def __init__(self, name,
                 # these are inherited from the base LRMS class
                 architecture, max_cores, max_cores_per_job,
                 max_memory_per_core, max_walltime, auth,
                 # these are specific to the ARC1 backend
                 arc_ldap,
                 frontend=None,
                 lost_job_timeout=gc3libs.Default.ARC_LOST_JOB_TIMEOUT,
                 **extra_args):
        """
        Initialize an ARC1 resource named `name`.

        If `frontend` is not given, it is derived from the host name
        part of the `arc_ldap` URL (when that is set).

        :raises gc3libs.exceptions.LRMSError:
          if the `arc` Python module could not be imported.
        :raises gc3libs.exceptions.ConfigurationError:
          if `arc_ldap` cannot be parsed as a URL.
        """
        log.warning(
            "The ARC1 backend (used in resource '%s') is deprecated"
            " and will be removed in a future release."
            " Consider changing your configuration.",
            name)

        # check if arc module has been imported
        if not have_arc_module:
            raise gc3libs.exceptions.LRMSError(
                "Could not import `arc` module, disable ARC1 resources.")

        # init base class
        LRMS.__init__(
            self, name,
            architecture, max_cores, max_cores_per_job,
            max_memory_per_core, max_walltime, auth)

        # ARC1-specific setup
        self.lost_job_timeout = lost_job_timeout
        self.arc_ldap = arc_ldap
        if frontend is None:
            if self.arc_ldap is not None:
                # extract frontend information from arc_ldap entry
                try:
                    resource_url = gc3libs.url.Url(arc_ldap)
                    self.frontend = resource_url.hostname
                except Exception as err:
                    # `except ... as` is valid on both Python 2.6+ and
                    # Python 3 (the old comma syntax was not); likewise
                    # `%s` on `err` replaces the removed `err.message`
                    raise gc3libs.exceptions.ConfigurationError(
                        "Configuration error: resource '%s' has no valid 'arc_ldap' setting: %s: %s"
                        % (name, err.__class__.__name__, err))
            else:
                self.frontend = None
Example #15
0
File: sge.py Project: imcf/gc3pie
 def _parse_stat_output(self, stdout, stderr):
     """
     Map the SGE `qstat` status field onto a `_stat_result` holding
     the corresponding `Run.State`.
     """
     ge_status_code = stdout.split()[4]
     log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
               ge_status_code)
     # default to UNKNOWN; the chain below overrides it for every
     # recognized status code
     state = Run.State.UNKNOWN
     if (ge_status_code.startswith('h')
             or ge_status_code in ('s', 'S', 'T')):
         state = Run.State.STOPPED
     elif 'qw' in ge_status_code:
         state = Run.State.SUBMITTED
     elif set('rRt') & set(ge_status_code):
         state = Run.State.RUNNING
     elif ge_status_code == 'E':
         # `E` flags an error condition
         state = Run.State.TERMINATING
     else:
         log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                     ge_status_code)
     # the exit status can only be recovered from `qacct` output, so
     # report ``None`` here
     return self._stat_result(state, None)
Example #16
0
 def _lsf_state_to_gc3pie_state(stat):
     """
     Return the `Run.State` corresponding to LSF `bjobs` status code
     `stat`; unrecognized codes map to ``UNKNOWN``.
     """
     log.debug("Translating LSF's `bjobs` status '%s' to gc3libs.Run.State ...", stat)
     mapping = {
         'PEND': Run.State.SUBMITTED,
         'RUN': Run.State.RUNNING,
         # the three *SUSP codes are all flavors of "suspended"
         'PSUSP': Run.State.STOPPED,
         'USUSP': Run.State.STOPPED,
         'SSUSP': Run.State.STOPPED,
         # DONE = successful termination
         'DONE': Run.State.TERMINATING,
         # EXIT = job was killed / exit forced
         'EXIT': Run.State.TERMINATING,
         # ZOMBI = job "killed" and unreachable
         'ZOMBI': Run.State.TERMINATING,
         'UNKWN': Run.State.UNKNOWN,
     }
     if stat not in mapping:
         log.warning("Unknown LSF job status '%s', returning `UNKNOWN`", stat)
         return Run.State.UNKNOWN
     return mapping[stat]
Example #17
0
    def peek(self, app, remote_filename, local_file, offset=0, size=None):
        """
        Download `remote_filename` from the remote side of task `app`
        into `local_file`.

        :param app: task whose job output is inspected.
        :param str remote_filename: path of the remote file, relative
          to the job directory.
        :param local_file: a local path name (string) or a file-like
          object with a ``name`` attribute.
        :param int offset: currently unused; kept for interface
          compatibility.
        :param size: maximum number of bytes to read; defaults to "no
          limit".  NOTE(review): the value is computed but never passed
          to `ARCCopyFile`, which copies the whole file -- confirm
          whether partial reads are expected here.
        """
        job = app.execution

        # `'lrms_jobid' in job` replaces the Python-2-only
        # `job.has_key('lrms_jobid')`
        assert 'lrms_jobid' in job, \
            "Missing attribute `lrms_jobid` on `Job` instance passed to `ArcLrms.peek`."

        controller, j = self._get_job_and_controller(job.lrms_jobid)

        if size is None:
            # `sys.maxsize` exists on both Python 2.6+ and Python 3,
            # unlike the removed `sys.maxint`
            size = sys.maxsize

        # `local_file` could be a file name (string) or a file-like
        # object, as per function docstring; ensure `local_file_name`
        # is the local path.  (This stanza was accidentally duplicated
        # in an earlier revision.)
        try:
            local_file_name = local_file.name
        except AttributeError:
            local_file_name = local_file

        source_url = arc.URL(job.lrms_jobid + '/' + remote_filename)
        destination_url = arc.URL(local_file_name)

        # download file
        log.debug("Arc1Lrms.peek(): Downloading remote file '%s' into local file '%s' ..."
                  % (remote_filename, local_file_name))
        if not controller.ARCCopyFile(source_url, destination_url):
            log.warning("Failed downloading '%s' to '%s'"
                        % (source_url.str(), destination_url.str()))
        log.debug("Arc1LRMS.peek(): arc.JobController.ARCCopyFile: completed")
Example #18
0
 def _lsf_state_to_gc3pie_state(stat):
     """
     Translate LSF `bjobs` status code `stat` into the matching
     `gc3libs.Run.State`; unrecognized codes yield ``UNKNOWN``.
     """
     log.debug("Translating LSF's `bjobs` status '%s' to"
               " gc3libs.Run.State ...", stat)
     lsf_to_gc3pie = {
         'PEND': Run.State.SUBMITTED,
         'RUN': Run.State.RUNNING,
         # all *SUSP codes denote a suspended job
         'PSUSP': Run.State.STOPPED,
         'USUSP': Run.State.STOPPED,
         'SSUSP': Run.State.STOPPED,
         # DONE = successful termination
         'DONE': Run.State.TERMINATING,
         # EXIT = job was killed / exit forced
         'EXIT': Run.State.TERMINATING,
         # ZOMBI = job "killed" and unreachable
         'ZOMBI': Run.State.TERMINATING,
         'UNKWN': Run.State.UNKNOWN,
     }
     if stat not in lsf_to_gc3pie:
         log.warning(
             "Unknown LSF job status '%s', returning `UNKNOWN`", stat)
         return Run.State.UNKNOWN
     return lsf_to_gc3pie[stat]
Example #19
0
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning(
                        "Unknown batch job status,"
                        " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.exitcode = int(jobstatus['exit_status'])
                    # XXX: we should set the `signal` part accordingly
                    job.signal = 0

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
Example #20
0
    def update_job_state(self, app):
        """
        Query the batch system for the status of the job behind task
        `app` and update `app.execution` accordingly.

        The job state is first taken from the ``*stat`` command; if
        that reports the job as TERMINATING (or fails), the accounting
        command(s) are consulted for the final status.  If neither
        source yields an answer within `self.accounting_delay`
        seconds, the job is declared lost.

        :return: the (possibly updated) `Run.State` of the job.
        :raises gc3libs.exceptions.InvalidArgument:
          if `app.execution` has no `lrms_jobid` attribute (i.e., the
          task was never submitted).
        :raises gc3libs.exceptions.LRMSError:
          if both status and accounting commands keep failing past the
          accounting delay.
        """
        try:
            job = app.execution
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning(
                        "Unknown batch job status,"
                        " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    # translate the shell-style exit status into the
                    # composite GC3Pie return code
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error(
                    "Failed while running the `qstat`/`bjobs` command."
                    " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug(
                    "Retrieving accounting information using command"
                    " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug("The primary job accounting command"
                                      " returned no information; trying"
                                      " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            try:
                # `job.stat_failed_at` raises `AttributeError` the
                # first time around -- see handler below
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug(
                        "  remote command returned stdout: '%s'", stdout)
                    log.debug(
                        "  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state
Example #21
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.

        Side effects: a task found RUNNING longer than either the
        resource's `max_walltime` or its own `requested_walltime` is
        cancelled; a task whose process has disappeared is cleaned up
        and moved towards termination.

        :return: the (possibly updated) ``app.execution.state``
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        # probe the process table; `grep` exits non-zero when no line
        # matches, i.e. the process no longer exists
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug(
                "Process with PID %s found."
                " Checking its running status ...", pid)
            # Process exists. Check the status
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
                # if `requested_walltime` is set, enforce it as a
                # running time limit
                if app.requested_walltime is not None:
                    # `ps -o etime=` prints elapsed run time without a
                    # header line
                    exit_code2, stdout2, stderr2 = self.transport.execute_command(
                        "ps -p %d -o etime=" % pid)
                    if exit_code2 != 0:
                        # job terminated already, do cleanup and return
                        self._cleanup_terminating_task(app, pid)
                        return app.execution.state
                    cancel = False
                    elapsed = _parse_time_duration(stdout2.strip())
                    if elapsed > self.max_walltime:
                        log.warning(
                            "Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.",
                            app, elapsed.to_timedelta(), self.max_walltime,
                            self.name)
                        cancel = True
                    if elapsed > app.requested_walltime:
                        log.warning(
                            "Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.",
                            app, elapsed.to_timedelta(),
                            app.requested_walltime)
                        cancel = True
                    if cancel:
                        self.cancel_job(app)
                        # set signal to SIGTERM in termination status
                        self._cleanup_terminating_task(app,
                                                       pid,
                                                       termstatus=(15, -1))
                        return app.execution.state
        else:
            log.debug(
                "Process with PID %d not found,"
                " assuming task %s has finished running.", pid, app)
            self._cleanup_terminating_task(app, pid)

        self._get_persisted_resource_state()
        return app.execution.state
Example #22
0
    def get_results(self, app, download_dir, overwrite=False):
        """
        Retrieve the output files of task `app` into `download_dir`.

        Files are first downloaded into a temporary directory (ARC
        refuses to download into an already-existing one) and then
        moved to their final destination.

        :param app: the task whose output should be retrieved.
        :param str download_dir: destination directory.
        :param bool overwrite: if ``False`` (default), raise an error
          when any entry to be downloaded already exists in
          `download_dir`.

        :raises gc3libs.exceptions.UnrecoverableDataStagingError:
          if any file fails to download, or (when `overwrite` is
          ``False``) a destination entry already exists.
        """
        jobid = app.execution.lrms_jobid

        # XXX: can raise encoding/decoding error if `download_dir`
        # is not ASCII, but the ARClib bindings don't accept
        # Python `unicode` strings.
        download_dir = str(download_dir)

        c, j = self._get_job_and_controller(jobid)

        # as ARC complains when downloading to an already-existing
        # directory, make a temporary directory for downloading files;
        # then move files to their final destination and delete the
        # temporary location.
        tmp_download_dir = tempfile.mkdtemp(suffix='.d', dir=download_dir)

        log.debug("Downloading %s output into temporary location '%s' ...", app, tmp_download_dir)

        # Get a list of downloadable files
        download_file_list = c.GetDownloadFiles(j.JobID)

        source_url = arc.URL(j.JobID.str())
        destination_url = arc.URL(tmp_download_dir)

        source_path_prefix = source_url.Path()
        destination_path_prefix = destination_url.Path()

        errors = 0
        for remote_file in download_file_list:
            source_url.ChangePath(os.path.join(source_path_prefix, remote_file))
            destination_url.ChangePath(os.path.join(destination_path_prefix, remote_file))
            if not c.ARCCopyFile(source_url, destination_url):
                log.warning("Failed downloading '%s' to '%s'",
                            source_url.str(), destination_url.str())
                errors += 1
        if errors > 0:
            # remove temporary download location
            shutil.rmtree(tmp_download_dir, ignore_errors=True)
            raise gc3libs.exceptions.UnrecoverableDataStagingError(
                "Failed downloading remote folder of job '%s' into '%s'."
                " There were %d errors, reported at the WARNING level in log files."
                % (jobid, download_dir, errors))

        log.debug("Moving %s output into download location '%s' ...", app, download_dir)
        entries = os.listdir(tmp_download_dir)
        if not overwrite:
            # raise an early error before we start mixing files from
            # the old and new download directories
            for entry in entries:
                dst = os.path.join(download_dir, entry)
                # check the full destination path `dst` -- the previous
                # code tested the bare `entry` name, which resolved
                # relative to the process' current working directory
                # and so never detected an actual clash
                if os.path.exists(dst):
                    # remove temporary download location
                    shutil.rmtree(tmp_download_dir, ignore_errors=True)
                    raise gc3libs.exceptions.UnrecoverableDataStagingError(
                        "Entry '%s' in download directory '%s' already exists,"
                        " and no overwriting was requested."
                        % (entry, download_dir))
        # move all entries to the final destination
        for entry in entries:
            src = os.path.join(tmp_download_dir, entry)
            dst = os.path.join(download_dir, entry)
            if os.path.isdir(dst):
                # `os.rename` cannot replace a non-empty directory
                shutil.rmtree(dst)
            os.rename(src, dst)

        # remove temporary download location (XXX: is it correct to ignore errors here?)
        shutil.rmtree(tmp_download_dir, ignore_errors=True)

        app.execution.download_dir = download_dir
        return
# ---- Example #23 ----
    def update_job_state(self, app):
        """
        Query the remote batch system and update the state of `app`'s job.

        The lookup proceeds in stages:

        1. Run the *stat* command (``qstat``/``squeue``/``bjobs``/...)
           and, on success, copy the parsed fields onto ``app.execution``.
        2. If *stat* failed, or reported state ``TERMINATING``, fall back
           to the *accounting* command to recover the final exit status;
           if that raises `AuxiliaryCommandError`, a *secondary*
           accounting command is tried (works around PBSPro installs
           where ``tracejob`` fails but ``qstat -x -f`` works).
        3. If neither command yields an answer, remember the time of the
           first failure in ``job.stat_failed_at`` and keep returning the
           last known state until ``self.accounting_delay`` seconds have
           elapsed, after which the job is declared lost.

        :param app: task whose ``execution`` object carries the
            batch-system job ID in attribute ``lrms_jobid``.
        :return: the (possibly updated) ``Run.State`` of the job;
            ``Run.State.UNKNOWN`` if no command gave usable output.
        :raise gc3libs.exceptions.InvalidArgument: if ``app.execution``
            has no ``lrms_jobid`` attribute (task never submitted).
        :raise gc3libs.exceptions.LRMSError: if the job has been missing
            from both *stat* and accounting output for longer than
            ``self.accounting_delay`` seconds.
        """
        try:
            job = app.execution
            # EAFP probe: merely touching the attribute raises
            # `AttributeError` when the job was never submitted.
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                # *stat* succeeded: merge every parsed field into the
                # execution object, then pick out state and exit status.
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning("Unknown batch job status,"
                                " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error("Failed while running the `qstat`/`bjobs` command."
                          " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug("Retrieving accounting information using command"
                          " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug(
                                "The primary job accounting command"
                                " returned no information; trying"
                                " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            try:
                # If `job.stat_failed_at` is unset, reading it raises
                # `AttributeError` (handled below): that attribute doubles
                # as a "first failure seen" sentinel and timestamp.
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug("  remote command returned stdout: '%s'", stdout)
                    log.debug("  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            # Log for diagnosis, then propagate unchanged (includes the
            # `LRMSError` raised above once `accounting_delay` expires).
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state