Example #1
0
File: sge.py Project: uzh/gc3pie
 def _parse_acct_output(self, stdout, stderr):
     """
     Parse the output of SGE's `qacct` command into a dictionary.

     Every non-empty line of `stdout` that is not a ``===`` separator
     is read as a whitespace-separated key/value pair.  Keys listed in
     `self._qacct_keyval_mapping` are stored under the mapped name
     after applying the associated conversion function (``None`` is
     stored if the conversion fails); any other key is kept as a
     string under ``"sge_" + key``.  A key with no value is recorded
     with an empty string.  Finally, the ``exitcode`` entry is replaced
     by a ``termstatus`` entry computed by `Run.shellexit_to_returncode`.

     :param str stdout: standard output of the `qacct` command.
     :param str stderr: standard error of the `qacct` command (unused).
     :return: dictionary of the parsed accounting values.
     :raise AssertionError: if no exit code was found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split("\n"):
         # skip empty and header lines
         line = line.strip()
         if line == "" or "===" in line:
             continue
         # extract key/value pairs from `qacct` output; splitting on
         # any whitespace run keeps a value-less line from aborting
         # the whole parse with an unpacking error
         fields = line.split(None, 1)
         key = fields[0]
         value = fields[1].strip() if len(fields) > 1 else ""
         if key == "failed" and value:
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             acctinfo[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             acctinfo["sge_" + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value,
                 key,
                 err.__class__.__name__,
                 str(err),
             )
             acctinfo[dest] = None
     if "exitcode" not in acctinfo:
         # raised explicitly (instead of using `assert`) so the check
         # survives `python -O`; the exception type is unchanged
         raise AssertionError(
             "Could not extract exit code from `qacct` output")
     acctinfo["termstatus"] = Run.shellexit_to_returncode(acctinfo.pop("exitcode"))
     return acctinfo
Example #2
0
    def peek(self, app, remote_filename, local_file, offset=0, size=None):
        """
        Copy part of a remote job file into `local_file`.

        The remote path is built by joining `app.execution.ssh_remote_folder`
        with the result of `generic_filename_mapping` for `remote_filename`.

        :param app: object whose `execution` attribute describes the job.
        :param remote_filename: name of the file to read, before mapping.
        :param local_file: object with a `write` method, or else a path
            that is opened in ``'w+b'`` mode.
        :param int offset: position to seek to before reading.
        :param size: maximum amount to read; `None` means `sys.maxsize`.
        :raise Exception: any error raised while reading the remote file
            is logged and then re-raised.
        """
        job = app.execution
        assert 'ssh_remote_folder' in job, \
            "Missing attribute `ssh_remote_folder` on `Job` instance" \
            " passed to `PbsLrms.peek`."

        if size is None:
            size = sys.maxsize

        _filename_mapping = generic_filename_mapping(
            job.lrms_jobname, job.lrms_jobid, remote_filename)
        _remote_filename = os.path.join(
            job.ssh_remote_folder, _filename_mapping)

        try:
            self.transport.connect()
            remote_handler = self.transport.open(
                _remote_filename, mode='r', bufsize=-1)
            try:
                remote_handler.seek(offset)
                data = remote_handler.read(size)
            finally:
                # do not leak the remote file handle
                remote_handler.close()
        except Exception as ex:
            log.error("Could not read remote file '%s': %s: %s",
                      _remote_filename, ex.__class__.__name__, str(ex))
            # there is no data to hand over: propagate the real error
            # instead of failing later on an unbound `data` variable
            raise

        try:
            local_file.write(data)
        except (TypeError, AttributeError):
            # `local_file` is not writable as an object: use it as a path
            with open(local_file, 'w+b') as output_file:
                output_file.write(data)
        log.debug('... Done.')
Example #3
0
 def _parse_acct_output(self, stdout):
     """
     Parse the output of the `qacct` command into a dictionary.

     Every non-empty line of `stdout` that is not a ``===`` separator
     is read as a whitespace-separated key/value pair.  Keys listed in
     `self._qacct_keyval_mapping` are stored under the mapped name
     after applying the associated conversion function (``None`` is
     stored if the conversion fails); any other key is kept as a
     string under ``'sge_' + key``.  A key with no value is recorded
     with an empty string.

     :param str stdout: standard output of the accounting command.
     :return: dictionary of the parsed accounting values.
     """
     jobstatus = dict()
     for line in stdout.split('\n'):
         # skip empty and header lines
         line = line.strip()
         if line == '' or '===' in line:
             continue
         # extract key/value pairs from `qacct` output; splitting on
         # any whitespace run keeps a value-less line from aborting
         # the whole parse with an unpacking error
         fields = line.split(None, 1)
         key = fields[0]
         value = fields[1].strip() if len(fields) > 1 else ''
         if key == 'failed' and value:
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             jobstatus[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             jobstatus['sge_' + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value, key, err.__class__.__name__, str(err))
             jobstatus[dest] = None
     return jobstatus
Example #4
0
    def cancel_job(self, app):
        """
        Kill the process backing `app` and delete its resource file.

        The process ID is read from `app.execution.lrms_jobid`.  When the
        `kill` command fails, a `ps` probe tells apart a process that
        exists but could not be killed from a PID that matches no
        process; both cases are only logged.  The job resource file is
        deleted in every case.

        :param app: object whose `execution.lrms_jobid` holds the PID.
        :raise gc3libs.exceptions.InvalidArgument: if `lrms_jobid`
            cannot be converted to an integer.
        """
        try:
            pid = int(app.execution.lrms_jobid)
        except ValueError:
            raise gc3libs.exceptions.InvalidArgument(
                "Invalid field `lrms_jobid` in Job '%s':"
                " expected a number, got '%s' (%s) instead" %
                (app, app.execution.lrms_jobid, type(
                    app.execution.lrms_jobid)))

        self.transport.connect()
        exit_code, stdout, kill_stderr = self.transport.execute_command(
            'kill %d' % pid)
        # XXX: should we check that the process actually died?
        if exit_code != 0:
            # Error killing the process. It may not exist or we don't
            # have permission to kill it.
            probe_exit_code, _probe_out, _probe_err = \
                self.transport.execute_command(
                    "ps ax | grep -E '^ *%d '" % pid)
            if probe_exit_code == 0:
                # The PID refers to an existing process, but we
                # couldn't kill it.  Report the diagnostics of `kill`
                # itself, not the output of the `ps` probe.
                log.error("Could not kill job '%s': %s", pid, kill_stderr)
            else:
                # The PID refers to a non-existing process.
                log.error(
                    "Could not kill job '%s'. It refers to non-existent"
                    " local process %s.", app, app.execution.lrms_jobid)
        self._delete_job_resource_file(pid)
Example #5
0
    def peek(self, app, remote_filename, local_file, offset=0, size=None):
        """
        Copy part of a remote job file into `local_file`.

        The remote path is built by joining `app.execution.ssh_remote_folder`
        with the result of `generic_filename_mapping` for `remote_filename`.

        :param app: object whose `execution` attribute describes the job.
        :param remote_filename: name of the file to read, before mapping.
        :param local_file: object with a `write` method, or else a path
            that is opened in ``'w+b'`` mode.
        :param int offset: position to seek to before reading.
        :param size: maximum amount to read; `None` means `sys.maxsize`.
        :raise Exception: any error raised while reading the remote file
            is logged and then re-raised.
        """
        job = app.execution
        assert 'ssh_remote_folder' in job, \
            "Missing attribute `ssh_remote_folder` on `Job` instance" \
            " passed to `PbsLrms.peek`."

        if size is None:
            size = sys.maxsize

        _filename_mapping = generic_filename_mapping(job.lrms_jobname,
                                                     job.lrms_jobid,
                                                     remote_filename)
        _remote_filename = os.path.join(job.ssh_remote_folder,
                                        _filename_mapping)

        try:
            self.transport.connect()
            remote_handler = self.transport.open(_remote_filename,
                                                 mode='r',
                                                 bufsize=-1)
            try:
                remote_handler.seek(offset)
                data = remote_handler.read(size)
            finally:
                # do not leak the remote file handle
                remote_handler.close()
        except Exception as ex:
            log.error("Could not read remote file '%s': %s: %s",
                      _remote_filename, ex.__class__.__name__, str(ex))
            # there is no data to hand over: propagate the real error
            # instead of failing later on an unbound `data` variable
            raise

        try:
            local_file.write(data)
        except (TypeError, AttributeError):
            # `local_file` is not writable as an object: use it as a path
            with open(local_file, 'w+b') as output_file:
                output_file.write(data)
        log.debug('... Done.')
Example #6
0
    def cancel_job(self, app):
        """
        Kill the process backing `app` and delete its resource file.

        The process ID is read from `app.execution.lrms_jobid`.  When the
        `kill` command fails, a `ps` probe tells apart a process that
        exists but could not be killed from a PID that matches no
        process; both cases are only logged.  The job resource file is
        deleted in every case.

        :param app: object whose `execution.lrms_jobid` holds the PID.
        :raise gc3libs.exceptions.InvalidArgument: if `lrms_jobid`
            cannot be converted to an integer.
        """
        try:
            pid = int(app.execution.lrms_jobid)
        except ValueError:
            raise gc3libs.exceptions.InvalidArgument(
                "Invalid field `lrms_jobid` in Job '%s':"
                " expected a number, got '%s' (%s) instead"
                % (app, app.execution.lrms_jobid,
                   type(app.execution.lrms_jobid)))

        self.transport.connect()
        exit_code, stdout, kill_stderr = self.transport.execute_command(
            'kill %d' % pid)
        # XXX: should we check that the process actually died?
        if exit_code != 0:
            # Error killing the process. It may not exist or we don't
            # have permission to kill it.
            probe_exit_code, _probe_out, _probe_err = \
                self.transport.execute_command(
                    "ps ax | grep -E '^ *%d '" % pid)
            if probe_exit_code == 0:
                # The PID refers to an existing process, but we
                # couldn't kill it.  Report the diagnostics of `kill`
                # itself, not the output of the `ps` probe.
                log.error("Could not kill job '%s': %s", pid, kill_stderr)
            else:
                # The PID refers to a non-existing process.
                log.error(
                    "Could not kill job '%s'. It refers to non-existent"
                    " local process %s.", app, app.execution.lrms_jobid)
        self._delete_job_resource_file(pid)
Example #7
0
File: sge.py Project: imcf/gc3pie
 def _parse_acct_output(self, stdout, stderr):
     """
     Parse the output of SGE's `qacct` command into a dictionary.

     Every non-empty line of `stdout` that is not a ``===`` separator
     is read as a whitespace-separated key/value pair.  Keys listed in
     `self._qacct_keyval_mapping` are stored under the mapped name
     after applying the associated conversion function (``None`` is
     stored if the conversion fails); any other key is kept as a
     string under ``'sge_' + key``.  A key with no value is recorded
     with an empty string.  Finally, the ``exitcode`` entry is replaced
     by a ``termstatus`` entry computed by `Run.shellexit_to_returncode`.

     :param str stdout: standard output of the `qacct` command.
     :param str stderr: standard error of the `qacct` command (unused).
     :return: dictionary of the parsed accounting values.
     :raise AssertionError: if no exit code was found in `stdout`.
     """
     acctinfo = {}
     for line in stdout.split('\n'):
         # skip empty and header lines
         line = line.strip()
         if line == '' or '===' in line:
             continue
         # extract key/value pairs from `qacct` output; splitting on
         # any whitespace run keeps a value-less line from aborting
         # the whole parse with an unpacking error
         fields = line.split(None, 1)
         key = fields[0]
         value = fields[1].strip() if len(fields) > 1 else ''
         if key == 'failed' and value:
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             acctinfo[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             acctinfo['sge_' + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value, key, err.__class__.__name__, str(err))
             acctinfo[dest] = None
     if 'exitcode' not in acctinfo:
         # raised explicitly (instead of using `assert`) so the check
         # survives `python -O`; the exception type is unchanged
         raise AssertionError(
             "Could not extract exit code from `qacct` output")
     acctinfo['termstatus'] = Run.shellexit_to_returncode(
         acctinfo.pop('exitcode'))
     return acctinfo
Example #8
0
 def _parse_acct_output(self, stdout):
     """
     Parse the output of the `qacct` command into a dictionary.

     Every non-empty line of `stdout` that is not a ``===`` separator
     is read as a whitespace-separated key/value pair.  Keys listed in
     `self._qacct_keyval_mapping` are stored under the mapped name
     after applying the associated conversion function (``None`` is
     stored if the conversion fails); any other key is kept as a
     string under ``'sge_' + key``.  A key with no value is recorded
     with an empty string.

     :param str stdout: standard output of the accounting command.
     :return: dictionary of the parsed accounting values.
     """
     jobstatus = dict()
     for line in stdout.split('\n'):
         # skip empty and header lines
         line = line.strip()
         if line == '' or '===' in line:
             continue
         # extract key/value pairs from `qacct` output; splitting on
         # any whitespace run keeps a value-less line from aborting
         # the whole parse with an unpacking error
         fields = line.split(None, 1)
         key = fields[0]
         value = fields[1].strip() if len(fields) > 1 else ''
         if key == 'failed' and value:
             # value may be, e.g., "100 : assumedly after job"
             value = value.split()[0]
         try:
             dest, conv = self._qacct_keyval_mapping[key]
             jobstatus[dest] = conv(value)
         except KeyError:
             # no conversion by default -- keep it a string
             jobstatus['sge_' + key] = value
         except (ValueError, TypeError) as err:
             log.error(
                 "Cannot parse value '%s' for qacct parameter '%s': %s: %s",
                 value, key, err.__class__.__name__, str(err))
             jobstatus[dest] = None
     return jobstatus
Example #9
0
 def free(self, app):
     """
     Clean up the ARC job attached to `app`.

     Calls `CleanJob` on the job's controller (a `False` result is only
     logged) and then removes the job from the job list file named by
     `gc3libs.Default.ARC_JOBLIST_LOCATION`.
     """
     arc_job_id = app.execution.lrms_jobid
     controller, job = self._get_job_and_controller(arc_job_id)

     log.debug("Calling JobController.CleanJob")
     cleaned = controller.CleanJob(job)
     if not cleaned:
         log.error("arc1.JobController.CleanJob returned False for ARC job ID '%s'",
                   arc_job_id)

     # XXX: other parts of the ARC library seem to rely on the job.xml
     # file, so the job has to be dropped from it as well
     joblist = gc3libs.Default.ARC_JOBLIST_LOCATION
     log.debug("Removing job '%s' from jobfile '%s'", app, joblist)
     job.RemoveJobsFromFile(joblist, [job.IDFromEndpoint])
Example #10
0
File: sge.py Project: imcf/gc3pie
    def get_resource_status(self):
        """
        Refresh this resource's job and slot counters from `qstat`.

        Runs ``qstat -U <user>`` and ``qstat -F -U <user>`` through the
        transport, then sets `queued`, `user_run`, `user_queued` (from
        `count_jobs`), `free_slots` (from `compute_nr_of_slots`) and
        `used_quota` (always -1) on this object.

        :return: this object, with updated counters.
        :raise gc3libs.exceptions.LRMSError: if either command exits
            with a non-zero status.  Any other error is logged and
            re-raised unchanged.
        """
        try:
            self.transport.connect()

            # both listings are obtained the same way; run them in order
            # and stop at the first failure
            outputs = []
            for _command in ("%s -U %s" % (self._qstat, self._username),
                             "%s -F -U %s" % (self._qstat, self._username)):
                log.debug("Running `%s`...", _command)
                exit_code, stdout, stderr \
                    = self.transport.execute_command(_command)
                if exit_code != 0:
                    # cannot continue
                    raise gc3libs.exceptions.LRMSError(
                        "SGE backend failed executing '%s':"
                        " exit code: %d; stdout: '%s'; stderr: '%s'." %
                        (_command, exit_code, stdout, stderr))
                outputs.append(stdout)
            qstat_stdout, qstat_F_stdout = outputs

            # the total number of running jobs is not used by this backend
            (_total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            slots = compute_nr_of_slots(qstat_F_stdout)
            self.free_slots = int(slots['global']['available'])
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name,
                self.free_slots,
                self.user_run,
                self.user_queued,
                self.queued,
            )
            return self

        except Exception as ex:
            log.error("Error querying remote LRMS, see debug log for details.")
            # include the traceback, as promised by the message above
            log.debug("Error querying LRMS: %s: %s", ex.__class__.__name__,
                      str(ex), exc_info=True)
            raise
Example #11
0
 def terminated(self):
     """
     Move this task's output file into the local `pictures` directory.

     The file is looked up at `self.output_dir`/`self.output_file_name`;
     if it is missing, an error is logged and nothing else happens.
     The `pictures` directory is created when it does not exist yet.
     """
     # where the output is expected on the local filesystem
     produced = join(self.output_dir, self.output_file_name)
     if exists(produced):
         # make sure there is a directory to collect the file into
         if not exists('pictures'):
             os.mkdir('pictures')
         # the trailing slash ensures `shutil.move` raises an error if
         # the destination exists but is not a directory
         move(produced, 'pictures/')
     else:
         # nothing to collect: report it and give up
         log.error("Expected output file `%s` from %s does not exists!",
                   produced, self)
Example #12
0
    def get_resource_status(self):
        """
        Refresh this resource's job and slot counters from `qstat`.

        Runs ``qstat -U <user>`` and ``qstat -F -U <user>`` through the
        transport, then sets `queued`, `user_run`, `user_queued` (from
        `count_jobs`), `free_slots` (from `compute_nr_of_slots`) and
        `used_quota` (always -1) on this object.

        :return: this object, with updated counters.
        :raise gc3libs.exceptions.LRMSError: if either command exits
            with a non-zero status.  Any other error is logged and
            re-raised unchanged.
        """
        try:
            self.transport.connect()

            # both listings are obtained the same way; run them in order
            # and stop at the first failure
            outputs = []
            for _command in ("%s -U %s" % (self._qstat, self._username),
                             "%s -F -U %s" % (self._qstat, self._username)):
                log.debug("Running `%s`...", _command)
                exit_code, stdout, stderr \
                    = self.transport.execute_command(_command)
                if exit_code != 0:
                    # cannot continue
                    raise gc3libs.exceptions.LRMSError(
                        "SGE backend failed executing '%s':"
                        " exit code: %d; stdout: '%s'; stderr: '%s'." %
                        (_command, exit_code, stdout, stderr))
                outputs.append(stdout)
            qstat_stdout, qstat_F_stdout = outputs

            # the total number of running jobs is not used by this backend
            (_total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            slots = compute_nr_of_slots(qstat_F_stdout)
            self.free_slots = int(slots['global']['available'])
            self.used_quota = -1

            log.info("Updated resource '%s' status:"
                     " free slots: %d,"
                     " own running jobs: %d,"
                     " own queued jobs: %d,"
                     " total queued jobs: %d",
                     self.name,
                     self.free_slots,
                     self.user_run,
                     self.user_queued,
                     self.queued,
                     )
            return self

        except Exception as ex:
            log.error("Error querying remote LRMS, see debug log for details.")
            # include the traceback, as promised by the message above
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__, str(ex), exc_info=True)
            raise
Example #13
0
    def get_resource_status(self):
        """
        Refresh this resource's job counters from `squeue` output.

        Sets `queued`, `user_run`, `user_queued` and `total_run` from the
        result of `count_jobs`; `free_slots` and `used_quota` are set
        to -1.

        :return: this object, with updated counters.
        :raise gc3libs.exceptions.LRMSError: if the `squeue` command
            exits with a non-zero status.  Any other error is logged
            and re-raised unchanged.
        """
        self.updated = False
        try:
            self.transport.connect()

            squeue_cmd = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'"
                          % self._squeue)
            log.debug("Running `%s`...", squeue_cmd)
            status, listing, errors = \
                self.transport.execute_command(squeue_cmd)
            if status != 0:
                # without the job listing there is nothing to count
                failure = ("SLURM backend failed executing '%s':"
                           " exit code: %d; stdout: '%s', stderr: '%s'"
                           % (squeue_cmd, status, listing, errors))
                raise gc3libs.exceptions.LRMSError(failure)

            log.debug("Computing updated values for total/available slots ...")
            running, queued, own_running, own_queued = \
                count_jobs(listing, self._username)
            self.queued = queued
            self.user_run = own_running
            self.user_queued = own_queued
            self.total_run = running
            self.free_slots = -1
            self.used_quota = -1

            log.info("Updated resource '%s' status:"
                     " free slots: %d,"
                     " total running: %d,"
                     " own running jobs: %d,"
                     " own queued jobs: %d,"
                     " total queued jobs: %d",
                     self.name, self.free_slots, self.total_run,
                     self.user_run, self.user_queued, self.queued)
            return self

        except Exception as err:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      err.__class__.__name__, str(err), exc_info=True)
            raise
Example #14
0
 def _read_job_resource_file(self, pid):
     """
     Load the pickled resource information stored for process `pid`.

     The file is looked up under `self.resource_dir`, named after the
     PID, and opened through the transport.  Errors raised while
     unpickling are logged and re-raised.

     NOTE(review): an earlier docstring promised a `None` return when
     the file does not exist, but nothing here handles a failing
     `open` -- confirm the intended contract.
     """
     self.transport.connect()
     log.debug("Reading resource file for pid %s", pid)
     resource_path = posixpath.join(self.resource_dir, str(pid))
     info = None
     with self.transport.open(resource_path, 'rb') as stream:
         try:
             # NOTE: unpickling is only safe if the remote file is trusted
             info = pickle.load(stream)
         except Exception as err:
             log.error("Unable to read remote resource file %s: %s",
                       resource_path, err)
             raise
     return info
Example #15
0
 def _read_job_resource_file(self, pid):
     """
     Load the pickled resource information stored for process `pid`.

     The file is looked up under `self.resource_dir`, named after the
     PID, and opened through the transport.  Errors raised while
     unpickling are logged and re-raised.

     NOTE(review): an earlier docstring promised a `None` return when
     the file does not exist, but nothing here handles a failing
     `open` -- confirm the intended contract.
     """
     self.transport.connect()
     log.debug("Reading resource file for pid %s", pid)
     resource_path = posixpath.join(self.resource_dir, str(pid))
     info = None
     with self.transport.open(resource_path, 'rb') as stream:
         try:
             # NOTE: unpickling is only safe if the remote file is trusted
             info = pickle.load(stream)
         except Exception as err:
             log.error("Unable to read remote resource file %s: %s",
                       resource_path, err)
             raise
     return info
Example #16
0
    def get_resource_status(self):
        """
        Refresh this resource's job counters from ``qstat -a`` output.

        Sets `queued`, `user_run`, `user_queued` and `total_run` from the
        result of `count_jobs`; `free_slots` and `used_quota` are set
        to -1.

        :return: this object, with updated counters.
        :raise gc3libs.exceptions.LRMSError: if the `qstat` command
            exits with a non-zero status.  Any other error is logged
            and re-raised unchanged.
        """
        self.updated = False
        try:
            self.transport.connect()

            qstat_cmd = '%s -a' % self._qstat
            log.debug("Running `%s`...", qstat_cmd)
            status, listing, errors = \
                self.transport.execute_command(qstat_cmd)
            if status != 0:
                # without the job listing there is nothing to count
                failure = ("PBS backend failed executing '%s':"
                           " exit code: %d; stdout: '%s', stderr: '%s'"
                           % (qstat_cmd, status, listing, errors))
                raise gc3libs.exceptions.LRMSError(failure)

            log.debug("Computing updated values for total/available slots ...")
            running, queued, own_running, own_queued = \
                count_jobs(listing, self._username)
            self.queued = queued
            self.user_run = own_running
            self.user_queued = own_queued
            self.total_run = running
            self.free_slots = -1
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " total running: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name, self.free_slots, self.total_run,
                self.user_run, self.user_queued, self.queued)
            return self

        except Exception as err:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      err.__class__.__name__, str(err), exc_info=True)
            raise
Example #17
0
 def cancel_job(self, app):
     """
     Run the batch system's cancel command for the job attached to `app`.

     A non-zero exit status is only logged, because the command may
     fail simply because the job has already completed.  Exit status
     127 is escalated to an exception.

     :param app: object whose `execution.lrms_jobid` identifies the job.
     :return: the `app.execution` object.
     :raise gc3libs.exceptions.LRMSError: if the cancel command exits
         with status 127.  Any other error is logged and re-raised.
     """
     job = app.execution
     try:
         self.transport.connect()
         cmd = self._cancel_command(job.lrms_jobid)
         exit_code, stdout, stderr = self.transport.execute_command(cmd)
         if exit_code != 0:
             # XXX: It is possible that 'qdel' fails because job
             # has been already completed thus the cancel_job
             # behaviour should be tolerant to these errors.
             log.error("Failed executing remote command '%s'; exit status %d", cmd, exit_code)
             log.debug("  remote command returned STDOUT '%s'", stdout)
             log.debug("  remote command returned STDERR '%s'", stderr)
             if exit_code == 127:
                 # command was not executed, time to signal an exception
                 raise gc3libs.exceptions.LRMSError(
                     "Cannot execute remote command '%s'" " -- See DEBUG level log for details" % (cmd,)
                 )
         return job
     except Exception as ex:
         # say what actually failed: this is a cancellation, not a
         # status check
         log.critical("Failure cancelling job: %s: %s",
                      ex.__class__.__name__, str(ex))
         raise
Example #18
0
 def cancel_job(self, app):
     """
     Run the batch system's cancel command for the job attached to `app`.

     A non-zero exit status is only logged, because the command may
     fail simply because the job has already completed.  Exit status
     127 is escalated to an exception.

     :param app: object whose `execution.lrms_jobid` identifies the job.
     :return: the `app.execution` object.
     :raise gc3libs.exceptions.LRMSError: if the cancel command exits
         with status 127.  Any other error is logged and re-raised.
     """
     job = app.execution
     try:
         self.transport.connect()
         cmd = self._cancel_command(job.lrms_jobid)
         exit_code, stdout, stderr = self.transport.execute_command(cmd)
         if exit_code != 0:
             # XXX: It is possible that 'qdel' fails because job
             # has been already completed thus the cancel_job
             # behaviour should be tolerant to these errors.
             log.error(
                 "Failed executing remote command '%s'; exit status %d",
                 cmd, exit_code)
             log.debug("  remote command returned STDOUT '%s'", stdout)
             log.debug("  remote command returned STDERR '%s'", stderr)
             if exit_code == 127:
                 # command was not executed, time to signal an exception
                 raise gc3libs.exceptions.LRMSError(
                     "Cannot execute remote command '%s'"
                     " -- See DEBUG level log for details" % (cmd, ))
         return job
     except Exception as ex:
         # say what actually failed: this is a cancellation, not a
         # status check
         log.critical('Failure cancelling job: %s: %s',
                      ex.__class__.__name__, str(ex))
         raise
Example #19
0
    def cancel_job(self, app):
        """
        Kill every process in the session of `app`'s process, then
        delete the job resource file.

        The process ID is read from `app.execution.lrms_jobid`; its
        session ID is looked up with `ps` and all processes listed for
        that session are passed to `kill`.  Failures to find or kill
        the process are only logged.  The job resource file is deleted
        in every case.

        :param app: object whose `execution.lrms_jobid` holds the PID.
        :raise gc3libs.exceptions.InvalidArgument: if `lrms_jobid`
            cannot be converted to an integer.
        """
        try:
            pid = int(app.execution.lrms_jobid)
        except ValueError:
            raise gc3libs.exceptions.InvalidArgument(
                "Invalid field `lrms_jobid` in Job '%s':"
                " expected a number, got '%s' (%s) instead" %
                (app, app.execution.lrms_jobid, type(
                    app.execution.lrms_jobid)))

        self.transport.connect()
        # Kill all the processes belonging to the same session as the
        # pid we actually started.

        # On linux, kill '$(ps -o pid= -g $(ps -o sess= -p %d))' would
        # be enough, but on MacOSX it doesn't work.
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps -p %d  -o sess=" % pid)
        if exit_code != 0 or not stdout.strip():
            # No PID found. We cannot recover the session group of the
            # process, so we cannot kill any remaining orphan process.
            log.error("Unable to find job '%s': no pid found.", pid)
        else:
            exit_code, stdout, kill_stderr = self.transport.execute_command(
                'kill $(ps -ax -o sess=,pid= | egrep "^[ \t]*%s[ \t]")' %
                stdout.strip())
            # XXX: should we check that the process actually died?
            if exit_code != 0:
                # Error killing the process. It may not exist or we don't
                # have permission to kill it.
                probe_exit_code, _probe_out, _probe_err = \
                    self.transport.execute_command(
                        "ps ax | grep -E '^ *%d '" % pid)
                if probe_exit_code == 0:
                    # The PID refers to an existing process, but we
                    # couldn't kill it.  Report the diagnostics of
                    # `kill` itself, not the output of the `ps` probe.
                    log.error("Could not kill job '%s': %s",
                              pid, kill_stderr)
                else:
                    # The PID refers to a non-existing process.
                    log.error(
                        "Could not kill job '%s'. It refers to non-existent"
                        " local process %s.", app, app.execution.lrms_jobid)
        self._delete_job_resource_file(pid)
Example #20
0
    def cancel_job(self, app):
        """
        Kill every process in the session of `app`'s process, then
        delete the job resource file.

        The process ID is read from `app.execution.lrms_jobid`; its
        session ID is looked up with `ps` and all processes listed for
        that session are passed to `kill`.  Failures to find or kill
        the process are only logged.  The job resource file is deleted
        in every case.

        :param app: object whose `execution.lrms_jobid` holds the PID.
        :raise gc3libs.exceptions.InvalidArgument: if `lrms_jobid`
            cannot be converted to an integer.
        """
        try:
            pid = int(app.execution.lrms_jobid)
        except ValueError:
            raise gc3libs.exceptions.InvalidArgument(
                "Invalid field `lrms_jobid` in Job '%s':"
                " expected a number, got '%s' (%s) instead"
                % (app, app.execution.lrms_jobid,
                   type(app.execution.lrms_jobid)))

        self.transport.connect()
        # Kill all the processes belonging to the same session as the
        # pid we actually started.

        # On linux, kill '$(ps -o pid= -g $(ps -o sess= -p %d))' would
        # be enough, but on MacOSX it doesn't work.
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps -p %d  -o sess=" % pid)
        if exit_code != 0 or not stdout.strip():
            # No PID found. We cannot recover the session group of the
            # process, so we cannot kill any remaining orphan process.
            log.error("Unable to find job '%s': no pid found.", pid)
        else:
            exit_code, stdout, kill_stderr = self.transport.execute_command(
                'kill $(ps -ax -o sess=,pid= | egrep "^[ \t]*%s[ \t]")' % stdout.strip())
            # XXX: should we check that the process actually died?
            if exit_code != 0:
                # Error killing the process. It may not exist or we don't
                # have permission to kill it.
                probe_exit_code, _probe_out, _probe_err = \
                    self.transport.execute_command(
                        "ps ax | grep -E '^ *%d '" % pid)
                if probe_exit_code == 0:
                    # The PID refers to an existing process, but we
                    # couldn't kill it.  Report the diagnostics of
                    # `kill` itself, not the output of the `ps` probe.
                    log.error("Could not kill job '%s': %s",
                              pid, kill_stderr)
                else:
                    # The PID refers to a non-existing process.
                    log.error(
                        "Could not kill job '%s'. It refers to non-existent"
                        " local process %s.", app, app.execution.lrms_jobid)
        self._delete_job_resource_file(pid)
Example #21
0
                if 'exit_status' in jobstatus:
                    job.exitcode = int(jobstatus['exit_status'])
                    # XXX: we should set the `signal` part accordingly
                    job.signal = 0

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error(
                    "Failed while running the `qstat`/`bjobs` command."
                     " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug(
                    "The `qstat`/`bjobs` command returned no job information;"
                    " trying with '%s' instead ..." % cmd)
                exit_code, stdout, stderr = self.transport.execute_command(cmd)
                if exit_code == 0:
                    jobstatus = self._parse_acct_output(stdout)
                    job.update(jobstatus)
Example #22
0
    def update_job_state(self, app):
        """
        Query the batch system about the job attached to `app` and
        update `app.execution` with what was found.

        The status command is tried first; if it does not settle the
        state, the accounting command (and then the secondary accounting
        command) is tried.  When none of them gives an answer, the time
        of the first failure is recorded in `job.stat_failed_at` and the
        job is reported as `Run.State.UNKNOWN`; once more than
        `self.accounting_delay` seconds have passed since then, an
        error is raised instead.

        :param app: object whose `execution` attribute is the job.
        :return: the job state.
        :raise gc3libs.exceptions.InvalidArgument: if the job has no
            `lrms_jobid` attribute.
        :raise gc3libs.exceptions.LRMSError: if no command returned
            information within the accounting delay.
        """
        try:
            job = app.execution
            # attribute access only: used to check that the ID exists
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ..." % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning(
                        "Unknown batch job status,"
                        " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error(
                    "Failed while running the `qstat`/`bjobs` command."
                    " exit code: %d, stderr: '%s'" % (exit_code, stderr))

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug(
                    "Retrieving accounting information using command"
                    " '%s' ..." % cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug("The primary job accounting command"
                                      " returned no information; trying"
                                      " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            # NOTE(review): at this point `cmd` has been rebound to an
            # accounting command, while `exit_code`, `stdout` and
            # `stderr` still hold the results of the status command --
            # confirm this pairing in the messages below is intended.
            try:
                if (time.time() - job.stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    log.critical(
                        "Failed executing remote command: '%s';"
                        "exit status %d", cmd, exit_code)
                    log.debug(
                        "  remote command returned stdout: '%s'", stdout)
                    log.debug(
                        "  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                else:
                    # do nothing, let's try later...
                    return job.state
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()

        except Exception as ex:
            # log any failure from the block above, then let it propagate
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state
Example #23
0
    def _gather_machine_specs(self):
        """
        Gather information about this machine and, if `self.override`
        is true, also update the value of the `max_cores` and
        `max_memory_per_core` attributes.

        Regardless of `self.override`, this method sets
        `self.resource_dir` (the configured resource directory with
        environment variables expanded by the remote shell, created if
        missing), `self.running_kernel` (output of `uname -s`) and
        `self.time_cmd`.

        Auto-detection of cores and memory is implemented for Linux
        and MacOSX (`Darwin`) only.

        :raise gc3libs.exceptions.ConfigurationError:
          if the architecture reported by `uname -m` differs from
          `self.architecture`, if `self._locate_gnu_time()` returns no
          usable command, or if `self.override` is true and the
          running kernel is neither `Linux` nor `Darwin`.
        """
        self.transport.connect()

        # expand env variables in the `resource_dir` setting
        exit_code, stdout, stderr = self.transport.execute_command(
            'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
        self.resource_dir = stdout.strip()

        # XXX: it is actually necessary to create the folder
        # as a separate step
        if not self.transport.exists(self.resource_dir):
            try:
                log.info("Creating resource file directory: '%s' ...",
                         self.resource_dir)
                self.transport.makedirs(self.resource_dir)
            except Exception as ex:
                log.error("Failed creating resource directory '%s':"
                          " %s: %s", self.resource_dir, type(ex), str(ex))
                # cannot continue
                raise

        exit_code, stdout, stderr = self.transport.execute_command('uname -m')
        arch = gc3libs.config._parse_architecture(stdout)
        if arch != self.architecture:
            raise gc3libs.exceptions.ConfigurationError(
                "Invalid architecture: configuration file says `%s` but "
                "it actually is `%s`" %
                (str.join(', ', self.architecture), str.join(', ', arch)))

        exit_code, stdout, stderr = self.transport.execute_command('uname -s')
        self.running_kernel = stdout.strip()

        # ensure `time_cmd` points to a valid value
        self.time_cmd = self._locate_gnu_time()
        if not self.time_cmd:
            raise gc3libs.exceptions.ConfigurationError(
                "Unable to find GNU `time` installed on your system."
                " Please, install GNU time and set the `time_cmd`"
                " configuration option in gc3pie.conf.")

        if not self.override:
            # Ignore other values.
            return

        if self.running_kernel == 'Linux':
            exit_code, stdout, stderr = self.transport.execute_command('nproc')
            max_cores = int(stdout)

            # get the amount of total memory from /proc/meminfo
            with self.transport.open('/proc/meminfo', 'r') as fd:
                for line in fd:
                    if line.startswith('MemTotal'):
                        self.total_memory = int(line.split()[1]) * Memory.KiB
                        break

        elif self.running_kernel == 'Darwin':
            # `sysctl` prints `name: value`; keep only the value part
            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.ncpu')
            max_cores = int(stdout.split(':')[-1])

            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.memsize')
            self.total_memory = int(stdout.split(':')[1]) * Memory.B

        else:
            # Without this branch `max_cores` would be unbound below
            # and the method would die with an `UnboundLocalError`.
            raise gc3libs.exceptions.ConfigurationError(
                "Cannot auto-detect cores and memory on resource '%s':"
                " unsupported kernel `%s` (only `Linux` and `Darwin`"
                " are handled)." % (self.name, self.running_kernel))

        if max_cores != self.max_cores:
            log.info(
                "Mismatch of value `max_cores` on resource '%s':"
                " configuration file says `max_cores=%d` while it's actually `%d`."
                " Updating current value.", self.name, self.max_cores,
                max_cores)
            self.max_cores = max_cores

        if self.total_memory != self.max_memory_per_core:
            log.info(
                "Mismatch of value `max_memory_per_core` on resource %s:"
                " configuration file says `max_memory_per_core=%s` while it's"
                " actually `%s`. Updating current value.", self.name,
                self.max_memory_per_core,
                self.total_memory.to_str('%g%s', unit=Memory.MB))
            self.max_memory_per_core = self.total_memory

        self.available_memory = self.total_memory
Example #24
0
    def submit_job(self, app):
        """
        Run an `Application` instance as a local process.

        The method checks that free cores and memory are available,
        creates an execution directory with `mktemp -d` under
        `self.spooldir`, copies the `file`-scheme inputs into it,
        writes a wrapper shell script that runs the application under
        `self.time_cmd`, starts that script detached, and finally reads
        the process ID the script wrote to its pidfile.

        On success the following state is updated:
        `app.execution.lrms_execdir`, `app.execution.lrms_jobid`,
        `self.available_memory`, `self.job_infos[pid]`, and the
        per-job resource file (via `_update_job_resource_file`).

        :param app: the application to run.
        :return: the same `app` object that was passed in.
        :raise gc3libs.exceptions.LRMSSubmitError:
          if the transport cannot connect, no free slot or not enough
          memory is available, the execution directory cannot be
          created, or the pidfile is missing or does not hold an
          integer.

        :see: `LRMS.submit_job`
        """
        # Update current resource usage to check how many jobs are
        # running in there.  Please note that for consistency with
        # other backends, these updated information are not kept!
        try:
            self.transport.connect()
        except gc3libs.exceptions.TransportError as ex:
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to access shellcmd resource at %s: %s" %
                (self.frontend, str(ex)))

        job_infos = self._get_persisted_resource_state()
        free_slots = self.max_cores - self._compute_used_cores(job_infos)
        available_memory = self.total_memory - \
            self._compute_used_memory(job_infos)

        # `free_slots` goes negative when the resource is over-committed,
        # so test `<= 0`: a plain `== 0` would let such submissions through.
        if self.free_slots == 0 or free_slots <= 0:
            # XXX: We shouldn't check for self.free_slots !
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s already running maximum allowed number of jobs"
                " (%s). Increase 'max_cores' to raise." %
                (self.name, self.max_cores))

        if app.requested_memory and \
                (available_memory < app.requested_memory or
                 self.available_memory < app.requested_memory):
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s does not have enough available memory:"
                " %s requested, but only %s available." % (
                    self.name,
                    app.requested_memory.to_str('%g%s', unit=Memory.MB),
                    available_memory.to_str('%g%s', unit=Memory.MB),
                ))

        log.debug("Executing local command '%s' ...",
                  str.join(" ", app.arguments))

        # Check if spooldir is a valid directory
        if not self.spooldir:
            ex, stdout, stderr = self.transport.execute_command(
                'cd "$TMPDIR" && pwd')
            if ex != 0 or stdout.strip() == '' or not stdout[0] == '/':
                log.debug(
                    "Unable to recover a valid absolute path for spooldir."
                    " Using `/var/tmp`.")
                self.spooldir = '/var/tmp'
            else:
                self.spooldir = stdout.strip()

        # determine execution directory
        exit_code, stdout, stderr = self.transport.execute_command(
            "mktemp -d %s " % posixpath.join(self.spooldir, 'gc3libs.XXXXXX'))
        if exit_code != 0:
            log.error("Error creating temporary directory on host %s: %s",
                      self.frontend, stderr)
            log.debug('Freeing resources used by failed application')
            self.free(app)
            # exceptions do not interpolate logging-style arguments:
            # the message has to be formatted explicitly
            raise gc3libs.exceptions.LRMSSubmitError(
                "Error creating temporary directory on host %s: %s"
                % (self.frontend, stderr))

        execdir = stdout.strip()
        app.execution.lrms_execdir = execdir

        # Copy input files to remote dir
        for local_path, remote_path in app.inputs.items():
            if local_path.scheme != 'file':
                continue
            remote_path = posixpath.join(execdir, remote_path)
            # remote paths are built with `posixpath`, so split them
            # with `posixpath` as well
            remote_parent = posixpath.dirname(remote_path)
            try:
                if (remote_parent not in ['', '.']
                        and not self.transport.exists(remote_parent)):
                    log.debug("Making remote directory '%s'", remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'", local_path.path,
                          remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:  # noqa -- catch-all is fine here: clean up, then re-raise
                log.critical(
                    "Copying input file '%s' to remote host '%s' failed",
                    local_path.path, self.frontend)
                log.debug('Cleaning up failed application')
                self.free(app)
                raise

        # try to ensure that a local executable really has
        # execute permissions, but ignore failures (might be a
        # link to a file we do not own)
        if app.arguments[0].startswith('./'):
            try:
                self.transport.chmod(
                    posixpath.join(execdir, app.arguments[0][2:]), 0o755)
            except Exception:
                # best effort only; `Exception` (rather than a bare
                # `except`) lets KeyboardInterrupt/SystemExit through
                log.error("Failed setting execution flag on remote file '%s'",
                          posixpath.join(execdir, app.arguments[0]))

        # set up redirection
        redirection_arguments = ''
        if app.stdin is not None:
            redirection_arguments += " <%s" % app.stdin

        if app.stdout is not None:
            redirection_arguments += " >%s" % app.stdout
            stdout_dir = posixpath.dirname(app.stdout)
            if stdout_dir:
                self.transport.makedirs(posixpath.join(execdir, stdout_dir))

        if app.join:
            redirection_arguments += " 2>&1"
        else:
            if app.stderr is not None:
                redirection_arguments += " 2>%s" % app.stderr
                stderr_dir = posixpath.dirname(app.stderr)
                if stderr_dir:
                    self.transport.makedirs(posixpath.join(
                        execdir, stderr_dir))

        # set up environment
        # (`.items()` works on both Python 2 and 3; `.iteritems()`
        # exists on Python 2 only)
        env_commands = []
        for k, v in app.environment.items():
            env_commands.append("export {k}={v};".format(k=sh_quote_safe(k),
                                                         v=sh_quote_unsafe(v)))

        # Create the directory in which pid, output and wrapper script
        # files will be stored
        wrapper_dir = posixpath.join(execdir, ShellcmdLrms.WRAPPER_DIR)

        if not self.transport.isdir(wrapper_dir):
            try:
                self.transport.makedirs(wrapper_dir)
            except:  # noqa -- catch-all is fine here: clean up, then re-raise
                log.error("Failed creating remote folder '%s'" % wrapper_dir)
                self.free(app)
                raise

        # Set up scripts to download/upload the swift/http files
        downloadfiles = []
        uploadfiles = []
        wrapper_downloader_filename = posixpath.join(
            wrapper_dir, ShellcmdLrms.WRAPPER_DOWNLOADER)

        for url, outfile in app.inputs.items():
            if url.scheme in [
                    'swift', 'swifts', 'swt', 'swts', 'http', 'https'
            ]:
                downloadfiles.append(
                    "python '%s' download '%s' '%s'" %
                    (wrapper_downloader_filename, str(url), outfile))

        for infile, url in app.outputs.items():
            if url.scheme in ['swift', 'swt', 'swifts', 'swts']:
                uploadfiles.append(
                    "python '%s' upload '%s' '%s'" %
                    (wrapper_downloader_filename, str(url), infile))
        if downloadfiles or uploadfiles:
            # Also copy the downloader.
            with open(
                    resource_filename(Requirement.parse("gc3pie"),
                                      "gc3libs/etc/downloader.py")) as fd:
                wrapper_downloader = self.transport.open(
                    wrapper_downloader_filename, 'w')
                try:
                    wrapper_downloader.write(fd.read())
                finally:
                    # do not leak the remote handle if `write` fails
                    wrapper_downloader.close()

        # Build
        pidfilename = posixpath.join(wrapper_dir, ShellcmdLrms.WRAPPER_PID)
        wrapper_output_filename = posixpath.join(
            wrapper_dir, ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        wrapper_script_fname = posixpath.join(wrapper_dir,
                                              ShellcmdLrms.WRAPPER_SCRIPT)

        # Text of the wrapper script.  The script records its own PID,
        # runs the application under `time_cmd`, and exits with the
        # application's exit code unless that is 0, in which case the
        # status left by the upload step is used instead.
        commands = (r"""#!/bin/sh
                echo $$ >{pidfilename}
                cd {execdir}
                exec {redirections}
                {environment}
                {downloadfiles}
                '{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
                rc=$?
                {uploadfiles}
                rc2=$?
                if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi
                """.format(
            pidfilename=pidfilename,
            execdir=execdir,
            time_cmd=self.time_cmd,
            wrapper_out=wrapper_output_filename,
            fmt=ShellcmdLrms.TIMEFMT,
            redirections=redirection_arguments,
            environment=str.join('\n', env_commands),
            downloadfiles=str.join('\n', downloadfiles),
            uploadfiles=str.join('\n', uploadfiles),
            command=(str.join(' ', (sh_quote_unsafe(arg)
                                    for arg in app.arguments))),
        ))

        try:
            # Create the wrapper script
            wrapper_script = self.transport.open(wrapper_script_fname, 'w')
            try:
                wrapper_script.write(commands)
            finally:
                # do not leak the remote handle if `write` fails
                wrapper_script.close()
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        try:
            self.transport.chmod(wrapper_script_fname, 0o755)

            # Execute the script in background
            self.transport.execute_command(wrapper_script_fname, detach=True)
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        # Just after the script has been started the pidfile should be
        # filled in with the correct pid.
        #
        # However, the script can have not been able to write the
        # pidfile yet, so we have to wait a little bit for it...
        pidfile = None
        for retry in gc3libs.utils.ExponentialBackoff():
            try:
                pidfile = self.transport.open(pidfilename, 'r')
                break
            except gc3libs.exceptions.TransportError as ex:
                if '[Errno 2]' in str(ex):  # no such file or directory
                    time.sleep(retry)
                    continue
                else:
                    raise
        if pidfile is None:
            # XXX: probably self.free(app) should go here as well
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to get PID file of submitted process from"
                " execution directory `%s`: %s" % (execdir, pidfilename))
        try:
            pid = pidfile.read().strip()
        finally:
            # close the handle on every path, including a failed `read`
            pidfile.close()
        try:
            pid = int(pid)
        except ValueError:
            # XXX: probably self.free(app) should go here as well
            raise gc3libs.exceptions.LRMSSubmitError(
                "Invalid pid `%s` in pidfile %s." % (pid, pidfilename))

        # Update application and current resources
        app.execution.lrms_jobid = pid
        # We don't need to update free_slots since its value is
        # checked at runtime.
        if app.requested_memory:
            self.available_memory -= app.requested_memory
        self.job_infos[pid] = {
            'requested_cores': app.requested_cores,
            'requested_memory': app.requested_memory,
            'execution_dir': execdir,
            'terminated': False,
        }
        self._update_job_resource_file(pid, self.job_infos[pid])
        return app
Example #25
0
    def get_resource_status(self):
        """
        Get dynamic information out of the LSF subsystem.

        Runs `lshosts -w` and `bjobs -u all -w` through the transport
        and updates the following attributes:

        * `max_cores`: sum of the `ncpus` column of `lshosts`;
        * `queued`: number of jobs in a "queued" state;
        * `user_queued`: jobs of `self._username` in a "queued" state;
        * `user_run`: jobs of `self._username` in any other state;
        * `free_slots`: `max_cores` minus the cores taken up by all
          listed jobs, as derived from the `EXEC_HOST` column.

        :return: `self`
        :raise gc3libs.exceptions.LRMSError:
          if either command exits with a non-zero code.  Any other
          exception is logged and re-raised unchanged.
        """

        try:
            self.transport.connect()

            # Run lhosts to get the list of available nodes and their
            # related number of cores
            # used to compute self.total_slots
            # lhost output format:
            # ($nodeid,$OStype,$model,$cpuf,$ncpus,$maxmem,$maxswp)
            _command = ('%s -w' % self._lshosts)
            exit_code, stdout, stderr = self.transport.execute_command(
                _command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    " exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                lhosts_output = stdout.strip().split('\n')
                # Remove Header
                lhosts_output.pop(0)
            else:
                lhosts_output = []

            # compute self.total_slots
            self.max_cores = 0
            for line in lhosts_output:
                # HOST_NAME      type    model  cpuf ncpus maxmem maxswp server RESOURCES  # noqa
                fields = line.split()
                if len(fields) < 5:
                    # blank or truncated line: there is no `ncpus`
                    # column to read
                    continue
                try:
                    self.max_cores += int(fields[4])
                except ValueError:
                    # h_ncpus == '-'
                    pass

            # Run `bjobs -u all -w` to get information about the jobs
            # for a given user used to compute `running_jobs`,
            # `self.queued`, `self.user_run` and `self.user_queued`.
            #
            # bjobs output format:
            # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
            _command = ('%s -u all -w' % self._bjobs)
            log.debug("Runing `%s`... ", _command)
            exit_code, stdout, stderr = \
                self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    " exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                bjobs_output = stdout.strip().split('\n')
                # Remove Header
                bjobs_output.pop(0)
            else:
                bjobs_output = []

            # user running/queued
            used_cores = 0
            self.queued = 0
            self.user_queued = 0
            self.user_run = 0

            queued_statuses = [
                'PEND', 'PSUSP', 'USUSP', 'SSUSP', 'WAIT', 'ZOMBI'
            ]
            for line in bjobs_output:
                # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
                fields = line.split()
                if len(fields) < 6:
                    # blank or truncated line: nothing to account for
                    continue
                user, stat, exec_h = fields[1], fields[2], fields[5]

                # to compute the number of cores allocated per each job
                # we use the output format of EXEC_HOST field
                # e.g.: 1*cpt178:2*cpt151
                #
                # Every node entry has to be added up: accumulating
                # outside of this loop would only count the last node
                # of a multi-node job.
                for node in exec_h.split(':'):
                    parts = node.split('*')
                    if len(parts) != 2:
                        # no `N*` prefix: single core
                        used_cores += 1
                        continue
                    try:
                        # multi core
                        used_cores += int(parts[0])
                    except ValueError:
                        # core count is not a number (e.g. '-'):
                        # nothing to add
                        pass

                if stat in queued_statuses:
                    self.queued += 1
                if user == self._username:
                    if stat in queued_statuses:
                        self.user_queued += 1
                    else:
                        self.user_run += 1

            self.free_slots = self.max_cores - used_cores

            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s", ex.__class__.__name__,
                      str(ex))
            raise
Example #26
0
    def _gather_machine_specs(self):
        """
        Gather information about this machine and, if `self.override`
        is true, also update the value of the `max_cores` and
        `max_memory_per_core` attributes.

        Independently of `self.override`, the following attributes are
        always set: `resource_dir` (the configured resource directory,
        with environment variables expanded by the remote shell; it is
        created if it does not exist), `running_kernel` (what
        `uname -s` prints) and `time_cmd`.

        Detection of core count and total memory is only implemented
        for Linux and MacOSX (`Darwin`).

        :raise gc3libs.exceptions.ConfigurationError:
          when `uname -m` reports an architecture different from
          `self.architecture`; when `self._locate_gnu_time()` finds no
          usable command; or when `self.override` is true but the
          kernel is neither `Linux` nor `Darwin`.
        """
        self.transport.connect()

        # expand env variables in the `resource_dir` setting
        exit_code, stdout, stderr = self.transport.execute_command(
            'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
        self.resource_dir = stdout.strip()

        # XXX: it is actually necessary to create the folder
        # as a separate step
        if not self.transport.exists(self.resource_dir):
            try:
                log.info("Creating resource file directory: '%s' ...",
                         self.resource_dir)
                self.transport.makedirs(self.resource_dir)
            except Exception as ex:
                log.error("Failed creating resource directory '%s':"
                          " %s: %s", self.resource_dir, type(ex), str(ex))
                # cannot continue
                raise

        exit_code, stdout, stderr = self.transport.execute_command('uname -m')
        arch = gc3libs.config._parse_architecture(stdout)
        if arch != self.architecture:
            raise gc3libs.exceptions.ConfigurationError(
                "Invalid architecture: configuration file says `%s` but "
                "it actually is `%s`" % (str.join(', ', self.architecture),
                                         str.join(', ', arch)))

        exit_code, stdout, stderr = self.transport.execute_command('uname -s')
        self.running_kernel = stdout.strip()

        # ensure `time_cmd` points to a valid value
        self.time_cmd = self._locate_gnu_time()
        if not self.time_cmd:
            raise gc3libs.exceptions.ConfigurationError(
                "Unable to find GNU `time` installed on your system."
                " Please, install GNU time and set the `time_cmd`"
                " configuration option in gc3pie.conf.")

        if not self.override:
            # Ignore other values.
            return

        if self.running_kernel == 'Linux':
            exit_code, stdout, stderr = self.transport.execute_command('nproc')
            max_cores = int(stdout)

            # get the amount of total memory from /proc/meminfo
            with self.transport.open('/proc/meminfo', 'r') as fd:
                for line in fd:
                    if line.startswith('MemTotal'):
                        self.total_memory = int(line.split()[1]) * Memory.KiB
                        break

        elif self.running_kernel == 'Darwin':
            # `sysctl` output looks like `name: value`; only the value
            # after the colon is of interest
            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.ncpu')
            max_cores = int(stdout.split(':')[-1])

            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.memsize')
            self.total_memory = int(stdout.split(':')[1]) * Memory.B

        else:
            # Fail with a clear message: falling through would leave
            # `max_cores` unbound and trigger an `UnboundLocalError`
            # in the comparison below.
            raise gc3libs.exceptions.ConfigurationError(
                "Cannot auto-detect cores and memory on resource '%s':"
                " unsupported kernel `%s` (only `Linux` and `Darwin`"
                " are handled)." % (self.name, self.running_kernel))

        if max_cores != self.max_cores:
            log.info(
                "Mismatch of value `max_cores` on resource '%s':"
                " configuration file says `max_cores=%d` while it's actually `%d`."
                " Updating current value.",
                self.name, self.max_cores, max_cores)
            self.max_cores = max_cores

        if self.total_memory != self.max_memory_per_core:
            log.info(
                "Mismatch of value `max_memory_per_core` on resource %s:"
                " configuration file says `max_memory_per_core=%s` while it's"
                " actually `%s`. Updating current value.",
                self.name,
                self.max_memory_per_core,
                self.total_memory.to_str('%g%s', unit=Memory.MB))
            self.max_memory_per_core = self.total_memory

        self.available_memory = self.total_memory
Example #27
0
    def update_job_state(self, app):
        """
        Query the batch system for the state of the job attached to
        `app` and store the result in `app.execution`.

        The "stat" command (see `_stat_command`) is tried first.  If it
        fails, or if it reports the job as `TERMINATING`, the
        accounting command (`_acct_command`) is tried, followed by the
        secondary accounting command (`_secondary_acct_command`) when
        the primary one raises `AuxiliaryCommandError`.

        When none of them yields information, the time of the first
        failure is recorded in `job.stat_failed_at`; once more than
        `self.accounting_delay` seconds have passed since then, the
        job is considered lost.

        :param app: object whose `execution` attribute holds the job;
          the job must have an `lrms_jobid` attribute.
        :return: the (possibly updated) value of `job.state`.
        :raise gc3libs.exceptions.InvalidArgument:
          if `app.execution` or its `lrms_jobid` attribute is missing.
        :raise gc3libs.exceptions.LRMSError:
          if no command returned information and `accounting_delay`
          seconds have elapsed since the first failure.
        """
        try:
            job = app.execution
            # attribute access only: raises if `lrms_jobid` is missing
            job.lrms_jobid
        except AttributeError as ex:
            # `job` has no `lrms_jobid`: object is invalid
            raise gc3libs.exceptions.InvalidArgument(
                "Job object is invalid: %s" % str(ex))

        try:
            self.transport.connect()
            cmd = self._stat_command(job)
            log.debug("Checking remote job status with '%s' ...", cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                jobstatus = self._parse_stat_output(stdout)
                job.update(jobstatus)

                job.state = jobstatus.get('state', Run.State.UNKNOWN)
                if job.state == Run.State.UNKNOWN:
                    log.warning("Unknown batch job status,"
                                " setting GC3Pie job state to `UNKNOWN`")

                if 'exit_status' in jobstatus:
                    job.returncode = Run.shellexit_to_returncode(
                        int(jobstatus['exit_status']))

                # SLURM's `squeue` command exits with code 0 if the
                # job ID exists in the database (i.e., a job with that
                # ID has been run) but prints no output.  In this
                # case, we need to continue and examine the accounting
                # command output to get the termination status etc.
                if job.state != Run.State.TERMINATING:
                    return job.state
            else:
                log.error("Failed while running the `qstat`/`bjobs` command."
                          " exit code: %d, stderr: '%s'", exit_code, stderr)

            # In some batch systems, jobs disappear from qstat
            # output as soon as they are finished. In these cases,
            # we have to check some *accounting* command to check
            # the exit status.
            cmd = self._acct_command(job)
            if cmd:
                log.debug("Retrieving accounting information using command"
                          " '%s' ...", cmd)
                try:
                    return self.__do_acct(job, cmd, self._parse_acct_output)
                except gc3libs.exceptions.AuxiliaryCommandError:
                    # This is used to distinguish between a standard
                    # Torque installation and a PBSPro where `tracejob`
                    # does not work but if `job_history_enable=True`,
                    # then we can actually access information about
                    # finished jobs with `qstat -x -f`.
                    try:
                        cmd = self._secondary_acct_command(job)
                        if cmd:
                            log.debug(
                                "The primary job accounting command"
                                " returned no information; trying"
                                " with '%s' instead...", cmd)
                            return self.__do_acct(
                                job, cmd, self._parse_secondary_acct_output)
                    except (gc3libs.exceptions.AuxiliaryCommandError,
                            NotImplementedError):
                        # ignore error -- there is nothing we can do
                        pass

            # No *stat command and no *acct command returned
            # correctly.
            #
            # Only the attribute read is guarded: an `AttributeError`
            # raised by anything else in this branch must not be
            # mistaken for "this is the first failure".
            try:
                stat_failed_at = job.stat_failed_at
            except AttributeError:
                # this is the first time `qstat` fails, record a
                # timestamp and retry later
                job.stat_failed_at = time.time()
            else:
                if (time.time() - stat_failed_at) > self.accounting_delay:
                    # accounting info should be there, if it's not
                    # then job is definitely lost
                    log.critical(
                        "Failed executing remote command: '%s';"
                        " exit status %d", cmd, exit_code)
                    log.debug("  remote command returned stdout: '%s'", stdout)
                    log.debug("  remote command returned stderr: '%s'", stderr)
                    raise gc3libs.exceptions.LRMSError(
                        "Failed executing remote command: '%s'; exit status %d"
                        % (cmd, exit_code))
                # do nothing, let's try later...
                return job.state

        except Exception as ex:
            log.error("Error in querying Batch resource '%s': %s: %s",
                      self.name, ex.__class__.__name__, str(ex))
            raise
        # If we reach this point it means that we don't actually know
        # the current state of the job.
        job.state = Run.State.UNKNOWN
        return job.state
Example #28
0
    def submit_job(self, app):
        """
        Run an `Application` instance as a local process.

        The steps performed, in order, are:

        1. check that a free slot and enough memory are available;
        2. create a scratch execution directory with ``mktemp -d``
           under `self.spooldir`;
        3. copy all ``file``-scheme inputs into that directory;
        4. write a ``/bin/sh`` wrapper script (redirections,
           environment, optional download/upload commands and the
           timed command itself) and start it detached;
        5. read the PID that the wrapper wrote into its pidfile and
           record the job's resource usage under that PID.

        :param app: the application to run.  Attributes
          `lrms_execdir` and `lrms_jobid` of `app.execution` are set
          by this method.

        :return: the `app` object that was passed in.

        :raise gc3libs.exceptions.LRMSSubmitError: if the transport
          cannot connect, no slot or not enough memory is available,
          the execution directory cannot be created, or a valid PID
          cannot be read back from the pidfile.  Transport errors
          raised while copying files or starting the wrapper script
          are re-raised after calling `self.free(app)`.

        :see: `LRMS.submit_job`
        """
        # Update current resource usage to check how many jobs are
        # running in there.  Please note that for consistency with
        # other backends, these updated information are not kept!
        try:
            self.transport.connect()
        except gc3libs.exceptions.TransportError as ex:
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to access shellcmd resource at %s: %s" %
                (self.frontend, str(ex)))

        job_infos = self._get_persisted_resource_state()
        free_slots = self.max_cores - self._compute_used_cores(job_infos)
        available_memory = self.total_memory - \
            self._compute_used_memory(job_infos)

        if self.free_slots == 0 or free_slots == 0:
            # XXX: We shouldn't check for self.free_slots !
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s already running maximum allowed number of jobs"
                " (%s). Increase 'max_cores' to raise." %
                (self.name, self.max_cores))

        if app.requested_memory and \
                (available_memory < app.requested_memory or
                 self.available_memory < app.requested_memory):
            raise gc3libs.exceptions.LRMSSubmitError(
                "Resource %s does not have enough available memory:"
                " %s requested, but only %s available."
                % (self.name,
                   app.requested_memory.to_str('%g%s', unit=Memory.MB),
                   available_memory.to_str('%g%s', unit=Memory.MB),)
            )

        log.debug("Executing local command '%s' ...",
                  str.join(" ", app.arguments))

        # Check if spooldir is a valid directory
        if not self.spooldir:
            ex, stdout, stderr = self.transport.execute_command(
                'cd "$TMPDIR" && pwd')
            if ex != 0 or stdout.strip() == '' or not stdout[0] == '/':
                log.debug(
                    "Unable to recover a valid absolute path for spooldir."
                    " Using `/var/tmp`.")
                self.spooldir = '/var/tmp'
            else:
                self.spooldir = stdout.strip()

        # determine execution directory
        exit_code, stdout, stderr = self.transport.execute_command(
            "mktemp -d %s " % posixpath.join(
                self.spooldir, 'gc3libs.XXXXXX'))
        if exit_code != 0:
            log.error(
                "Error creating temporary directory on host %s: %s",
                self.frontend, stderr)
            log.debug('Freeing resources used by failed application')
            self.free(app)
            # Exceptions do not interpolate `%s` placeholders the way
            # logging calls do: format the message explicitly.
            raise gc3libs.exceptions.LRMSSubmitError(
                "Error creating temporary directory on host %s: %s"
                % (self.frontend, stderr))

        execdir = stdout.strip()
        app.execution.lrms_execdir = execdir

        # Copy input files to remote dir
        for local_path, remote_path in app.inputs.items():
            if local_path.scheme != 'file':
                continue
            remote_path = posixpath.join(execdir, remote_path)
            # remote paths are always POSIX paths, whatever the
            # conventions of the OS this code runs on
            remote_parent = posixpath.dirname(remote_path)
            try:
                if (remote_parent not in ['', '.']
                        and not self.transport.exists(remote_parent)):
                    log.debug("Making remote directory '%s'", remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'",
                          local_path.path, remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:
                # bare `except` is fine here: we always re-raise
                log.critical(
                    "Copying input file '%s' to remote host '%s' failed",
                    local_path.path, self.frontend)
                log.debug('Cleaning up failed application')
                self.free(app)
                raise

        # try to ensure that a local executable really has
        # execute permissions, but ignore failures (might be a
        # link to a file we do not own)
        if app.arguments[0].startswith('./'):
            try:
                self.transport.chmod(
                    posixpath.join(execdir, app.arguments[0][2:]),
                    0o755)
            except Exception:
                # best-effort only; but do not swallow
                # `KeyboardInterrupt` / `SystemExit`
                log.error(
                    "Failed setting execution flag on remote file '%s'",
                    posixpath.join(execdir, app.arguments[0]))

        # set up redirection
        redirection_arguments = ''
        if app.stdin is not None:
            redirection_arguments += " <%s" % app.stdin

        if app.stdout is not None:
            redirection_arguments += " >%s" % app.stdout
            stdout_dir = posixpath.dirname(app.stdout)
            if stdout_dir:
                self.transport.makedirs(posixpath.join(execdir, stdout_dir))

        if app.join:
            redirection_arguments += " 2>&1"
        else:
            if app.stderr is not None:
                redirection_arguments += " 2>%s" % app.stderr
                stderr_dir = posixpath.dirname(app.stderr)
                if stderr_dir:
                    self.transport.makedirs(
                        posixpath.join(execdir, stderr_dir))

        # set up environment
        env_commands = []
        # `.items()` works on both Python 2 and 3 (`.iteritems()` is
        # Python 2 only)
        for k, v in app.environment.items():
            env_commands.append(
                "export {k}={v};"
                .format(k=sh_quote_safe(k), v=sh_quote_unsafe(v)))

        # Create the directory in which pid, output and wrapper script
        # files will be stored
        wrapper_dir = posixpath.join(
            execdir,
            ShellcmdLrms.WRAPPER_DIR)

        if not self.transport.isdir(wrapper_dir):
            try:
                self.transport.makedirs(wrapper_dir)
            except:
                # bare `except` is fine here: we always re-raise
                log.error("Failed creating remote folder '%s'"
                          % wrapper_dir)
                self.free(app)
                raise

        # Set up scripts to download/upload the swift/http files
        downloadfiles = []
        uploadfiles = []
        wrapper_downloader_filename = posixpath.join(
            wrapper_dir,
            ShellcmdLrms.WRAPPER_DOWNLOADER)

        for url, outfile in app.inputs.items():
            if url.scheme in ['swift', 'swifts', 'swt', 'swts',
                              'http', 'https']:
                downloadfiles.append(
                    "python '%s' download '%s' '%s'"
                    % (wrapper_downloader_filename, str(url), outfile))

        for infile, url in app.outputs.items():
            if url.scheme in ['swift', 'swt', 'swifts', 'swts']:
                uploadfiles.append(
                    "python '%s' upload '%s' '%s'"
                    % (wrapper_downloader_filename, str(url), infile))
        if downloadfiles or uploadfiles:
            # Also copy the downloader.
            with open(resource_filename(Requirement.parse("gc3pie"),
                                        "gc3libs/etc/downloader.py")) as fd:
                wrapper_downloader = self.transport.open(
                    wrapper_downloader_filename, 'w')
                try:
                    wrapper_downloader.write(fd.read())
                finally:
                    # do not leak the remote handle if `write` fails
                    wrapper_downloader.close()

        # Build
        pidfilename = posixpath.join(wrapper_dir,
                                     ShellcmdLrms.WRAPPER_PID)
        wrapper_output_filename = posixpath.join(
            wrapper_dir,
            ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        wrapper_script_fname = posixpath.join(
            wrapper_dir,
            ShellcmdLrms.WRAPPER_SCRIPT)

        try:
            # Create the wrapper script: it records its own PID,
            # applies the redirections, and exits with the command's
            # exit code (or the upload step's, if the command
            # succeeded).
            commands = (
                r"""#!/bin/sh
                echo $$ >{pidfilename}
                cd {execdir}
                exec {redirections}
                {environment}
                {downloadfiles}
                '{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
                rc=$?
                {uploadfiles}
                rc2=$?
                if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi
                """.format(
                    pidfilename=pidfilename,
                    execdir=execdir,
                    time_cmd=self.time_cmd,
                    wrapper_out=wrapper_output_filename,
                    fmt=ShellcmdLrms.TIMEFMT,
                    redirections=redirection_arguments,
                    environment=str.join('\n', env_commands),
                    downloadfiles=str.join('\n', downloadfiles),
                    uploadfiles=str.join('\n', uploadfiles),
                    command=(str.join(' ',
                                      (sh_quote_unsafe(arg)
                                       for arg in app.arguments))),
            ))
            wrapper_script = self.transport.open(
                wrapper_script_fname, 'w')
            try:
                wrapper_script.write(commands)
            finally:
                wrapper_script.close()
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        try:
            self.transport.chmod(wrapper_script_fname, 0o755)

            # Execute the script in background
            self.transport.execute_command(wrapper_script_fname, detach=True)
        except gc3libs.exceptions.TransportError:
            log.error("Freeing resources used by failed application")
            self.free(app)
            raise

        # Just after the script has been started the pidfile should be
        # filled in with the correct pid.
        #
        # However, the script can have not been able to write the
        # pidfile yet, so we have to wait a little bit for it...
        pidfile = None
        for retry in gc3libs.utils.ExponentialBackoff():
            try:
                pidfile = self.transport.open(pidfilename, 'r')
                break
            except gc3libs.exceptions.TransportError as ex:
                if '[Errno 2]' in str(ex):  # no such file or directory
                    time.sleep(retry)
                    continue
                else:
                    raise
        if pidfile is None:
            # XXX: probably self.free(app) should go here as well
            raise gc3libs.exceptions.LRMSSubmitError(
                "Unable to get PID file of submitted process from"
                " execution directory `%s`: %s"
                % (execdir, pidfilename))
        try:
            pid = pidfile.read().strip()
        finally:
            # close the handle on every path, including a failed read
            pidfile.close()
        try:
            pid = int(pid)
        except ValueError:
            # XXX: probably self.free(app) should go here as well
            raise gc3libs.exceptions.LRMSSubmitError(
                "Invalid pid `%s` in pidfile %s." % (pid, pidfilename))

        # Update application and current resources
        app.execution.lrms_jobid = pid
        # We don't need to update free_slots since its value is
        # checked at runtime.
        if app.requested_memory:
            self.available_memory -= app.requested_memory
        self.job_infos[pid] = {
            'requested_cores': app.requested_cores,
            'requested_memory': app.requested_memory,
            'execution_dir': execdir,
            'terminated': False,
        }
        self._update_job_resource_file(pid, self.job_infos[pid])
        return app
Example #29
0
    def get_resource_status(self):
        """
        Refresh the dynamic resource information by querying LSF.

        Runs ``<lshosts> -w`` and ``<bjobs> -u all -w`` through
        `self.transport`, parses their tabular output (first line is
        taken to be a header and discarded) and updates these
        attributes on `self`:

        * `max_cores`: sum of the numeric ``ncpus`` column over all
          hosts listed by ``lshosts``;
        * `queued`: number of jobs, of any user, whose status is one
          of ``PEND``, ``PSUSP``, ``USUSP``, ``SSUSP``, ``WAIT``,
          ``ZOMBI``;
        * `user_queued` / `user_run`: number of jobs owned by
          `self._username` whose status respectively is / is not in
          that list;
        * `free_slots`: `max_cores` minus the cores counted from the
          ``EXEC_HOST`` column of every job.

        :return: `self`

        :raise gc3libs.exceptions.LRMSError: if either command exits
          with a non-zero status.  Any exception raised while
          querying or parsing is logged and then re-raised.
        """

        try:
            self.transport.connect()

            # Run lhosts to get the list of available nodes and their
            # related number of cores
            # used to compute self.total_slots
            # lhost output format:
            # ($nodeid,$OStype,$model,$cpuf,$ncpus,$maxmem,$maxswp)
            _command = ('%s -w' % self._lshosts)
            exit_code, stdout, stderr = self.transport.execute_command(
                _command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                lhosts_output = stdout.strip().split('\n')
                # Remove Header
                lhosts_output.pop(0)
            else:
                lhosts_output = []

            # compute self.total_slots
            self.max_cores = 0
            for line in lhosts_output:
                # HOST_NAME      type    model  cpuf ncpus maxmem maxswp server RESOURCES  # noqa
                (hostname, h_type, h_model, h_cpuf, h_ncpus) = \
                    line.strip().split()[0:5]
                try:
                    self.max_cores += int(h_ncpus)
                except ValueError:
                    # h_ncpus == '-'
                    pass

            # Run `bjobs -u all -w` to get information about the jobs
            # for a given user used to compute `running_jobs`,
            # `self.queued`, `self.user_run` and `self.user_queued`.
            #
            # bjobs output format:
            # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
            _command = ('%s -u all -w' % self._bjobs)
            log.debug("Runing `%s`... ", _command)
            exit_code, stdout, stderr = \
                self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "LSF backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, stdout, stderr))

            if stdout:
                bjobs_output = stdout.strip().split('\n')
                # Remove Header
                bjobs_output.pop(0)
            else:
                bjobs_output = []

            # user running/queued
            used_cores = 0
            self.queued = 0
            self.user_queued = 0
            self.user_run = 0

            queued_statuses = ['PEND', 'PSUSP', 'USUSP',
                               'SSUSP', 'WAIT', 'ZOMBI']
            for line in bjobs_output:
                # JOBID   USER    STAT  QUEUE      FROM_HOST   EXEC_HOST   JOB_NAME   SUBMIT_TIME  # noqa
                (jobid, user, stat, queue, from_h, exec_h) = \
                    line.strip().split()[0:6]
                # to compute the number of cores allocated per each job
                # we use the output format of EXEC_HOST field
                # e.g.: 1*cpt178:2*cpt151
                #
                # The per-node counts must be *summed*: the example
                # above accounts for 3 cores, not just the last 2.
                for node in exec_h.split(':'):
                    parts = node.split('*')
                    if len(parts) != 2:
                        # no `N*` prefix: counted as a single core
                        used_cores += 1
                        continue
                    try:
                        # multi core
                        used_cores += int(parts[0])
                    except ValueError:
                        # count is not a number (e.g. '-'):
                        # nothing to add for this node
                        pass

                if stat in queued_statuses:
                    self.queued += 1
                if user == self._username:
                    if stat in queued_statuses:
                        self.user_queued += 1
                    else:
                        self.user_run += 1

            self.free_slots = self.max_cores - used_cores

            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__, str(ex))
            raise