Example #1
def count_jobs(qstat_output, whoami):
    """
    Parse PBS/Torque's ``qstat`` output (as contained in string `qstat_output`)
    and return a quadruple `(R, Q, r, q)` where:

      * `R` is the total number of running jobs in the PBS/Torque cell
        (from any user);

      * `Q` is the total number of queued jobs in the PBS/Torque cell
        (from any user);

      * `r` is the number of running jobs submitted by user `whoami`;

      * `q` is the number of queued jobs submitted by user `whoami`
    """
    total_running = 0
    total_queued = 0
    own_running = 0
    own_queued = 0
    for line in qstat_output.split('\n'):
        log.info("Output line: %s" % line)
        m = _qstat_line_re.match(line)
        if not m:
            continue
        if m.group('state') in ['R']:
            total_running += 1
            if m.group('username') == whoami:
                own_running += 1
        elif m.group('state') in ['Q']:
            total_queued += 1
            if m.group('username') == whoami:
                own_queued += 1
        log.info("running: %d, queued: %d" % (total_running, total_queued))

    return (total_running, total_queued, own_running, own_queued)
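The regular expression `_qstat_line_re` that the function relies on is not shown in the snippet above. The following is a minimal, hypothetical sketch of what it might look like for the default PBS/Torque `qstat` listing, together with a small usage example; the pattern actually used by the project may differ.

import logging
import re

# Hypothetical pattern for the default PBS/Torque `qstat` listing, i.e. lines like
#   1234.server    sim_a    alice    00:01:02 R batch
# Header and separator lines do not match and are therefore skipped by `count_jobs`.
_qstat_line_re = re.compile(
    r'^(?P<jobid>\d+\S*)\s+'
    r'(?P<jobname>\S+)\s+'
    r'(?P<username>\S+)\s+'
    r'(?P<time_use>\S+)\s+'
    r'(?P<state>[A-Z])\s+'
    r'(?P<queue>\S+)'
)

log = logging.getLogger(__name__)

sample_qstat_output = """\
Job ID                    Name             User            Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
1234.server               sim_a            alice           00:01:02 R batch
1235.server               sim_b            bob             00:00:00 Q batch
1236.server               sim_c            alice           00:00:00 Q batch
"""

# With `count_jobs` from the example above in scope, this prints (1, 2, 1, 1):
# one running and two queued jobs overall, one running and one queued for "alice".
print(count_jobs(sample_qstat_output, 'alice'))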
Example #2
File: sge.py Project: imcf/gc3pie
    def get_resource_status(self):
        try:
            self.transport.connect()

            _command = ("%s -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_stdout, stderr))

            _command = ("%s -F -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_F_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_F_stdout, stderr))

            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            slots = compute_nr_of_slots(qstat_F_stdout)
            self.free_slots = int(slots['global']['available'])
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name,
                self.free_slots,
                self.user_run,
                self.user_queued,
                self.queued,
            )
            return self

        except Exception as ex:
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s", ex.__class__.__name__,
                      str(ex))
            raise
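Neither `compute_nr_of_slots` nor the dictionary it returns is shown; the code above only uses `slots['global']['available']`. Below is a rough, hypothetical sketch of such a helper, assuming the slot counts come from the `resv/used/tot.` column of the queue-instance lines printed by `qstat -F` (the real gc3pie implementation is more elaborate).

import re

# Hypothetical: queue-instance lines in `qstat -F` output look roughly like
#   all.q@node01    BIP    0/2/8    0.50    lx26-amd64
# where the third column holds "reserved/used/total" slot counts.
_slots_re = re.compile(
    r'^(?P<queue>\S+@\S+)\s+\S+\s+'
    r'(?P<resv>\d+)/(?P<used>\d+)/(?P<total>\d+)')

def compute_nr_of_slots(qstat_F_output):
    """
    Return a dictionary mapping queue-instance names (plus the special
    key 'global') to dictionaries of slot counts.
    """
    result = {'global': {'total': 0, 'used': 0, 'reserved': 0, 'available': 0}}
    for line in qstat_F_output.split('\n'):
        match = _slots_re.match(line)
        if not match:
            continue
        total = int(match.group('total'))
        used = int(match.group('used'))
        reserved = int(match.group('resv'))
        counts = {
            'total': total,
            'used': used,
            'reserved': reserved,
            'available': total - used - reserved,
        }
        result[match.group('queue')] = counts
        for key, value in counts.items():
            result['global'][key] += value
    return result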
Example #3
    def get_resource_status(self):
        try:
            self.transport.connect()

            _command = ("%s -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_stdout, stderr))

            _command = ("%s -F -U %s" % (self._qstat, self._username))
            log.debug("Running `%s`...", _command)
            exit_code, qstat_F_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SGE backend failed executing '%s':"
                    "exit code: %d; stdout: '%s'; stderr: '%s'." %
                    (_command, exit_code, qstat_F_stdout, stderr))

            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            slots = compute_nr_of_slots(qstat_F_stdout)
            self.free_slots = int(slots['global']['available'])
            self.used_quota = -1

            log.info("Updated resource '%s' status:"
                     " free slots: %d,"
                     " own running jobs: %d,"
                     " own queued jobs: %d,"
                     " total queued jobs: %d",
                     self.name,
                     self.free_slots,
                     self.user_run,
                     self.user_queued,
                     self.queued,
                     )
            return self

        except Exception as ex:
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__, str(ex))
            raise
Example #4
    def get_resource_status(self):
        self.updated = False
        try:
            self.transport.connect()

            _command = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'" %
                        self._squeue)
            log.debug("Running `%s`...", _command)
            exitcode, stdout, stderr = self.transport.execute_command(_command)
            if exitcode != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "SLURM backend failed executing '%s':"
                    " exit code: %d; stdout: '%s', stderr: '%s'" %
                    (_command, exitcode, stdout, stderr))

            log.debug("Computing updated values for total/available slots ...")
            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(stdout, self._username)
            self.total_run = total_running
            self.free_slots = -1
            self.used_quota = -1

            log.info(
                "Updated resource '%s' status:"
                " free slots: %d,"
                " total running: %d,"
                " own running jobs: %d,"
                " own queued jobs: %d,"
                " total queued jobs: %d",
                self.name,
                self.free_slots,
                self.total_run,
                self.user_run,
                self.user_queued,
                self.queued,
            )
            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__,
                      str(ex),
                      exc_info=True)
            raise
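Note that the `count_jobs` called here cannot be the PBS/Torque parser shown earlier: the SLURM backend feeds it the caret-separated records produced by `squeue --noheader -o '%i^%T^%u^%U^%r^%R'`. A minimal sketch of what such a parser could look like, assuming `%T` expands to full state names such as `RUNNING` and `PENDING` (the actual gc3pie code may differ):

def count_jobs(squeue_output, whoami):
    """
    Parse the output of ``squeue --noheader -o '%i^%T^%u^%U^%r^%R'``
    and return a quadruple `(R, Q, r, q)` of total and per-user
    running/pending job counts, analogous to the PBS/Torque version.
    """
    total_running = total_queued = own_running = own_queued = 0
    for line in squeue_output.split('\n'):
        if not line.strip():
            continue
        # fields: job id, state, user name, user id, reason, node list
        jobid, state, username, uid, reason, nodelist = line.split('^', 5)
        if state == 'RUNNING':
            total_running += 1
            if username == whoami:
                own_running += 1
        elif state == 'PENDING':
            total_queued += 1
            if username == whoami:
                own_queued += 1
    return (total_running, total_queued, own_running, own_queued)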
Example #5
    def get_results(self, app, download_dir, overwrite=False):
        if app.output_base_url is not None:
            raise gc3libs.exceptions.UnrecoverableDataStagingError(
                "Retrieval of output files to non-local destinations"
                " is not supported (yet).")

        job = app.execution
        try:
            self.transport.connect()
            # Make list of files to copy, in the form of (remote_path,
            # local_path) pairs.  This entails walking the
            # `Application.outputs` list to expand wildcards and
            # directory references.
            stageout = list()
            for remote_relpath, local_url in app.outputs.items():
                local_relpath = local_url.path
                if remote_relpath == gc3libs.ANY_OUTPUT:
                    remote_relpath = ''
                    local_relpath = ''
                stageout += _make_remote_and_local_path_pair(
                    self.transport, job, remote_relpath, download_dir,
                    local_relpath)

            # copy back all files, renaming them to adhere to the
            # ArcLRMS convention
            log.debug("Downloading job output into '%s' ...", download_dir)
            for remote_path, local_path in stageout:
                log.debug("Downloading remote file '%s' to local file '%s'",
                          remote_path, local_path)
                if (overwrite
                        or not os.path.exists(local_path)
                        or os.path.isdir(local_path)):
                    log.debug("Copying remote '%s' to local '%s'"
                              % (remote_path, local_path))
                    # ignore missing files (this is what ARC does too)
                    self.transport.get(remote_path, local_path,
                                       ignore_nonexisting=True)
                else:
                    log.info("Local file '%s' already exists;"
                             " will not be overwritten!",
                             local_path)

            return  # XXX: should we return list of downloaded files?

        except:
            raise
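The helper `_make_remote_and_local_path_pair` is not shown above. A simplified, hypothetical sketch of how it might expand a directory reference into `(remote_path, local_path)` pairs, assuming the transport object offers `isdir` and `listdir` methods (wildcard expansion is left out):

import os
import posixpath

def _make_remote_and_local_path_pair(transport, job, remote_relpath,
                                     local_root_dir, local_relpath):
    """Return a list of `(remote_path, local_path)` pairs (sketch only)."""
    remote_path = posixpath.join(job.ssh_remote_folder, remote_relpath)
    local_path = os.path.join(local_root_dir, local_relpath)
    if transport.isdir(remote_path):
        # recurse into directories, pairing every remote entry with a
        # corresponding path under the local download directory
        result = []
        for entry in transport.listdir(remote_path):
            result += _make_remote_and_local_path_pair(
                transport, job,
                posixpath.join(remote_relpath, entry),
                local_root_dir,
                os.path.join(local_relpath, entry))
        return result
    else:
        return [(remote_path, local_path)]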
Example #6
    def get_resource_status(self):
        self.updated = False
        try:
            self.transport.connect()

            _command = ('%s -a' % self._qstat)
            log.debug("Running `%s`...", _command)
            exit_code, qstat_stdout, stderr \
                = self.transport.execute_command(_command)
            if exit_code != 0:
                # cannot continue
                raise gc3libs.exceptions.LRMSError(
                    "PBS backend failed executing '%s':"
                    " exit code: %d; stdout: '%s', stderr: '%s'"
                    % (_command, exit_code, qstat_stdout, stderr))

            log.debug("Computing updated values for total/available slots ...")
            (total_running, self.queued, self.user_run, self.user_queued) \
                = count_jobs(qstat_stdout, self._username)
            self.total_run = total_running
            self.free_slots = -1
            self.used_quota = -1

            log.info("Updated resource '%s' status:"
                     " free slots: %d,"
                     " total running: %d,"
                     " own running jobs: %d,"
                     " own queued jobs: %d,"
                     " total queued jobs: %d",
                     self.name,
                     self.free_slots,
                     self.total_run,
                     self.user_run,
                     self.user_queued,
                     self.queued,
                     )
            return self

        except Exception as ex:
            # self.transport.close()
            log.error("Error querying remote LRMS, see debug log for details.")
            log.debug("Error querying LRMS: %s: %s",
                      ex.__class__.__name__, str(ex), exc_info=True)
            raise
Example #7
    def submit_job(self, app):
        """This method will create a remote directory to store job's
        sandbox, and will copy the sandbox in there.
        """
        job = app.execution
        # Create the remote directory.
        try:
            self.transport.connect()

            cmd = "mkdir -p $HOME/.gc3pie_jobs;" \
                " mktemp -d $HOME/.gc3pie_jobs/lrms_job.XXXXXXXXXX"
            log.info("Creating remote temporary folder: command '%s' " % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                ssh_remote_folder = stdout.split('\n')[0]
            else:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command '%s' on resource '%s';"
                    " exit code: %d, stderr: '%s'."
                    % (cmd, self.name, exit_code, stderr))
        except gc3libs.exceptions.TransportError:
            raise
Example #8
    def terminated(self):
        if self.execution.signal != 0:
            log.info("Task %s killed by signal %d", self, self.execution.signal)
        else:
            # self.execution.signal == 0, hence normal termination
            if self.execution.exitcode == 0:
                log.info("Task %s exited successfully!", self)
            else:
                log.info("Task %s exited with error code %d", self, self.execution.exitcode)
Example #9
    def terminated(self):
        if self.execution.signal != 0:
            log.info("Task %s killed by signal %d", self,
                     self.execution.signal)
        else:
            # self.execution.signal == 0, hence normal termination
            if self.execution.exitcode == 0:
                log.info("Task %s exited successfully!", self)
            else:
                log.info("Task %s exited with error code %d", self,
                         self.execution.exitcode)
Example #10
    def terminated(self):
        err_file_path = os.path.join(self.output_dir, self.stderr)
        with open(err_file_path, 'r') as err_file:
            errors = err_file.read()
            if 'Out of memory' in errors or 'exceeds maximum array size' in errors:
                self.execution.exitcode = 11
        # verbosely notify user
        if self.execution.signal != 0:
            log.info("Task %s killed by signal %d", self, self.execution.signal)
        else:
            # self.execution.signal == 0, hence normal termination
            if self.execution.exitcode == 0:
                log.info("Task %s exited successfully!", self)
            else:
                log.info("Task %s exited with error code %d", self, self.execution.exitcode)
Example #11
    def _gather_machine_specs(self):
        """
        Gather information about this machine and, if `self.override`
        is true, also update the values of the `max_cores` and
        `max_memory_per_core` attributes.

        This method works with both Linux and MacOSX.
        """
        self.transport.connect()

        # expand env variables in the `resource_dir` setting
        exit_code, stdout, stderr = self.transport.execute_command(
            'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
        self.resource_dir = stdout.strip()

        # XXX: it is actually necessary to create the folder
        # as a separate step
        if not self.transport.exists(self.resource_dir):
            try:
                log.info("Creating resource file directory: '%s' ...",
                         self.resource_dir)
                self.transport.makedirs(self.resource_dir)
            except Exception as ex:
                log.error("Failed creating resource directory '%s':"
                          " %s: %s", self.resource_dir, type(ex), str(ex))
                # cannot continue
                raise

        exit_code, stdout, stderr = self.transport.execute_command('uname -m')
        arch = gc3libs.config._parse_architecture(stdout)
        if arch != self.architecture:
            raise gc3libs.exceptions.ConfigurationError(
                "Invalid architecture: configuration file says `%s` but "
                "it actually is `%s`" % (str.join(', ', self.architecture),
                                         str.join(', ', arch)))

        exit_code, stdout, stderr = self.transport.execute_command('uname -s')
        self.running_kernel = stdout.strip()

        # ensure `time_cmd` points to a valid value
        self.time_cmd = self._locate_gnu_time()
        if not self.time_cmd:
            raise gc3libs.exceptions.ConfigurationError(
                "Unable to find GNU `time` installed on your system."
                " Please, install GNU time and set the `time_cmd`"
                " configuration option in gc3pie.conf.")

        if not self.override:
            # Ignore other values.
            return

        if self.running_kernel == 'Linux':
            exit_code, stdout, stderr = self.transport.execute_command('nproc')
            max_cores = int(stdout)

            # get the amount of total memory from /proc/meminfo
            with self.transport.open('/proc/meminfo', 'r') as fd:
                for line in fd:
                    if line.startswith('MemTotal'):
                        self.total_memory = int(line.split()[1]) * Memory.KiB
                        break

        elif self.running_kernel == 'Darwin':
            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.ncpu')
            max_cores = int(stdout.split(':')[-1])

            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.memsize')
            self.total_memory = int(stdout.split(':')[1]) * Memory.B

        if max_cores != self.max_cores:
            log.info(
                "Mismatch of value `max_cores` on resource '%s':"
                " configuration file says `max_cores=%d` while it's actually `%d`."
                " Updating current value.",
                self.name, self.max_cores, max_cores)
            self.max_cores = max_cores

        if self.total_memory != self.max_memory_per_core:
            log.info(
                "Mismatch of value `max_memory_per_core` on resource %s:"
                " configuration file says `max_memory_per_core=%s` while it's"
                " actually `%s`. Updating current value.",
                self.name,
                self.max_memory_per_core,
                self.total_memory.to_str('%g%s', unit=Memory.MB))
            self.max_memory_per_core = self.total_memory

        self.available_memory = self.total_memory
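As a concrete illustration of the memory arithmetic above, here is how a `/proc/meminfo` line would be turned into a `Memory` quantity (a small sketch using `gc3libs.quantity.Memory`, the same class used in the code above; the sample value is made up):

from gc3libs.quantity import Memory

# a sample line from /proc/meminfo
line = "MemTotal:        8000000 kB"
total_memory = int(line.split()[1]) * Memory.KiB
# 8000000 KiB == 8192000000 bytes, so this is expected to print "8192MB"
print(total_memory.to_str('%g%s', unit=Memory.MB))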
Example #12
    def submit_job(self, app):
        """This method will create a remote directory to store job's
        sandbox, and will copy the sandbox in there.
        """
        job = app.execution

        # Create the remote directory.
        try:
            self.transport.connect()

            cmd = "mkdir -p $HOME/.gc3pie_jobs;" \
                " mktemp -d $HOME/.gc3pie_jobs/lrms_job.XXXXXXXXXX"
            log.info("Creating remote temporary folder: command '%s' " % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                ssh_remote_folder = stdout.split('\n')[0]
            else:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command '%s' on resource '%s';"
                    " exit code: %d, stderr: '%s'."
                    % (cmd, self.name, exit_code, stderr))
        except gc3libs.exceptions.TransportError:
            raise
        except:
            raise

        # Copy the input file(s) to remote directory.
        for local_path, remote_path in app.inputs.items():
            remote_path = os.path.join(ssh_remote_folder, remote_path)
            remote_parent = os.path.dirname(remote_path)
            try:
                if remote_parent not in ['', '.']:
                    log.debug("Making remote directory '%s'",
                              remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'",
                          local_path.path, remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:
                log.critical(
                    "Copying input file '%s' to remote cluster '%s' failed",
                    local_path.path, self.frontend)
                raise

        if app.arguments[0].startswith('./'):
            gc3libs.log.debug("Making remote path '%s' executable.",
                              app.arguments[0])
            self.transport.chmod(os.path.join(ssh_remote_folder,
                                              app.arguments[0]), 0o755)

        # if STDOUT/STDERR should be saved in a directory, ensure it
        # exists (see Issue 495 for details)
        for dest in (app.stdout, app.stderr):
            if dest:
                destdir = os.path.dirname(dest)
                if destdir:
                    self.transport.makedirs(
                        posixpath.join(ssh_remote_folder, destdir))

        try:
            sub_cmd, aux_script = self._submit_command(app)
            if aux_script != '':
                # create temporary script name
                script_filename = ('./script.%s.sh' % uuid.uuid4())
                # save script to a temporary file and submit that one instead
                local_script_file = tempfile.NamedTemporaryFile()
                local_script_file.write('#!/bin/sh\n')
                # Add preamble file
                prologue = self.get_prologue_script(app)
                if prologue:
                    local_script_file.write(prologue)

                local_script_file.write(aux_script)

                # Add epilogue files
                epilogue = self.get_epilogue_script(app)
                if epilogue:
                    local_script_file.write(epilogue)

                local_script_file.flush()
                # upload script to remote location
                self.transport.put(
                    local_script_file.name,
                    os.path.join(ssh_remote_folder, script_filename))
                # set execution mode on remote script
                self.transport.chmod(
                    os.path.join(ssh_remote_folder, script_filename), 0o755)
                # cleanup
                local_script_file.close()
                if os.path.exists(local_script_file.name):
                    os.unlink(local_script_file.name)
            else:
                # we still need a script name even if there is no
                # script to submit
                script_filename = ''

            # Submit it
            exit_code, stdout, stderr = self.transport.execute_command(
                "/bin/sh -c %s" % sh_quote_safe('cd %s && %s %s' % (
                    ssh_remote_folder, sub_cmd, script_filename)))

            if exit_code != 0:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command 'cd %s && %s %s' on resource"
                    " '%s'; exit code: %d, stderr: '%s'."
                    % (ssh_remote_folder, sub_cmd, script_filename,
                       self.name, exit_code, stderr))

            jobid = self._parse_submit_output(stdout)
            log.debug('Job submitted with jobid: %s', jobid)

            job.execution_target = self.frontend

            job.lrms_jobid = jobid
            job.lrms_jobname = jobid
            try:
                if app.jobname:
                    job.lrms_jobname = app.jobname
            except:
                pass

            if 'stdout' in app:
                job.stdout_filename = app.stdout
            else:
                job.stdout_filename = '%s.o%s' % (job.lrms_jobname, jobid)
            if app.join:
                job.stderr_filename = job.stdout_filename
            else:
                if 'stderr' in app:
                    job.stderr_filename = app.stderr
                else:
                    job.stderr_filename = '%s.e%s' % (job.lrms_jobname, jobid)
            job.history.append('Submitted to %s @ %s, got jobid %s'
                               % (self._batchsys_name, self.name, jobid))
            job.history.append("Submission command output:\n"
                               "  === stdout ===\n%s"
                               "  === stderr ===\n%s"
                               "  === end ===\n"
                               % (stdout, stderr), 'pbs', 'qsub')
            job.ssh_remote_folder = ssh_remote_folder

            return job

        except:
            log.critical(
                "Failure submitting job to resource '%s' - "
                "see log file for errors", self.name)
            raise
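Two helpers used above are not shown: `_submit_command` (which returns the submission command plus an optional auxiliary script body) and `_parse_submit_output`. For a PBS/Torque backend the latter could be as simple as the hypothetical sketch below, since `qsub` prints the new job id (e.g. `1234.frontend.example.org`) on its standard output; it is written here as a stand-alone function, whereas above it is a method of the backend class, and the real implementation may be stricter.

import re

import gc3libs.exceptions

# Hypothetical: accept either a bare numeric id or "<number>.<server>".
_qsub_jobid_re = re.compile(r'(?P<jobid>\d+(\.\S+)?)')

def _parse_submit_output(stdout):
    match = _qsub_jobid_re.search(stdout)
    if match is None:
        raise gc3libs.exceptions.LRMSError(
            "Could not extract job id from `qsub` output %r" % stdout)
    return match.group('jobid')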
Example #13
    def submit_job(self, app):
        """This method will create a remote directory to store job's
        sandbox, and will copy the sandbox in there.
        """
        job = app.execution

        # Create the remote directory.
        try:
            self.transport.connect()

            cmd = "mkdir -p $HOME/.gc3pie_jobs;" \
                " mktemp -d $HOME/.gc3pie_jobs/lrms_job.XXXXXXXXXX"
            log.info("Creating remote temporary folder: command '%s' " % cmd)
            exit_code, stdout, stderr = self.transport.execute_command(cmd)
            if exit_code == 0:
                ssh_remote_folder = stdout.split('\n')[0]
            else:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command '%s' on resource '%s';"
                    " exit code: %d, stderr: '%s'." %
                    (cmd, self.name, exit_code, stderr))
        except gc3libs.exceptions.TransportError:
            raise
        except:
            raise

        # Copy the input file(s) to remote directory.
        for local_path, remote_path in app.inputs.items():
            remote_path = os.path.join(ssh_remote_folder, remote_path)
            remote_parent = os.path.dirname(remote_path)
            try:
                if remote_parent not in ['', '.']:
                    log.debug("Making remote directory '%s'", remote_parent)
                    self.transport.makedirs(remote_parent)
                log.debug("Transferring file '%s' to '%s'", local_path.path,
                          remote_path)
                self.transport.put(local_path.path, remote_path)
                # preserve execute permission on input files
                if os.access(local_path.path, os.X_OK):
                    self.transport.chmod(remote_path, 0o755)
            except:
                log.critical(
                    "Copying input file '%s' to remote cluster '%s' failed",
                    local_path.path, self.frontend)
                raise

        if app.arguments[0].startswith('./'):
            gc3libs.log.debug("Making remote path '%s' executable.",
                              app.arguments[0])
            self.transport.chmod(
                os.path.join(ssh_remote_folder, app.arguments[0]), 0o755)

        # if STDOUT/STDERR should be saved in a directory, ensure it
        # exists (see Issue 495 for details)
        for dest in (app.stdout, app.stderr):
            if dest:
                destdir = os.path.dirname(dest)
                if destdir:
                    self.transport.makedirs(
                        posixpath.join(ssh_remote_folder, destdir))

        try:
            sub_cmd, aux_script = self._submit_command(app)
            if aux_script != '':
                # create temporary script name
                script_filename = ('./script.%s.sh' % uuid.uuid4())
                # save script to a temporary file and submit that one instead
                local_script_file = tempfile.NamedTemporaryFile()
                local_script_file.write('#!/bin/sh\n')
                # Add preamble file
                prologue = self.get_prologue_script(app)
                if prologue:
                    local_script_file.write(prologue)

                local_script_file.write(aux_script)

                # Add epilogue files
                epilogue = self.get_epilogue_script(app)
                if epilogue:
                    local_script_file.write(epilogue)

                local_script_file.flush()
                # upload script to remote location
                self.transport.put(
                    local_script_file.name,
                    os.path.join(ssh_remote_folder, script_filename))
                # set execution mode on remote script
                self.transport.chmod(
                    os.path.join(ssh_remote_folder, script_filename), 0o755)
                # cleanup
                local_script_file.close()
                if os.path.exists(local_script_file.name):
                    os.unlink(local_script_file.name)
            else:
                # we still need a script name even if there is no
                # script to submit
                script_filename = ''

            # Submit it
            exit_code, stdout, stderr = self.transport.execute_command(
                "/bin/sh -c %s" %
                sh_quote_safe('cd %s && %s %s' %
                              (ssh_remote_folder, sub_cmd, script_filename)))

            if exit_code != 0:
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing command 'cd %s && %s %s' on resource"
                    " '%s'; exit code: %d, stderr: '%s'." %
                    (ssh_remote_folder, sub_cmd, script_filename, self.name,
                     exit_code, stderr))

            jobid = self._parse_submit_output(stdout)
            log.debug('Job submitted with jobid: %s', jobid)

            job.execution_target = self.frontend

            job.lrms_jobid = jobid
            job.lrms_jobname = jobid
            try:
                if app.jobname:
                    job.lrms_jobname = app.jobname
            except:
                pass

            if 'stdout' in app:
                job.stdout_filename = app.stdout
            else:
                job.stdout_filename = '%s.o%s' % (job.lrms_jobname, jobid)
            if app.join:
                job.stderr_filename = job.stdout_filename
            else:
                if 'stderr' in app:
                    job.stderr_filename = app.stderr
                else:
                    job.stderr_filename = '%s.e%s' % (job.lrms_jobname, jobid)
            job.history.append('Submitted to %s @ %s, got jobid %s' %
                               (self._batchsys_name, self.name, jobid))
            job.history.append(
                "Submission command output:\n"
                "  === stdout ===\n%s"
                "  === stderr ===\n%s"
                "  === end ===\n" % (stdout, stderr), 'pbs', 'qsub')
            job.ssh_remote_folder = ssh_remote_folder

            return job

        except:
            log.critical(
                "Failure submitting job to resource '%s' - "
                "see log file for errors", self.name)
            raise
Example #14
    def _gather_machine_specs(self):
        """
        Gather information about this machine and, if `self.override`
        is true, also update the values of the `max_cores` and
        `max_memory_per_core` attributes.

        This method works with both Linux and MacOSX.
        """
        self.transport.connect()

        # expand env variables in the `resource_dir` setting
        exit_code, stdout, stderr = self.transport.execute_command(
            'echo %s' % sh_quote_unsafe(self.cfg_resourcedir))
        self.resource_dir = stdout.strip()

        # XXX: it is actually necessary to create the folder
        # as a separate step
        if not self.transport.exists(self.resource_dir):
            try:
                log.info("Creating resource file directory: '%s' ...",
                         self.resource_dir)
                self.transport.makedirs(self.resource_dir)
            except Exception as ex:
                log.error("Failed creating resource directory '%s':"
                          " %s: %s", self.resource_dir, type(ex), str(ex))
                # cannot continue
                raise

        exit_code, stdout, stderr = self.transport.execute_command('uname -m')
        arch = gc3libs.config._parse_architecture(stdout)
        if arch != self.architecture:
            raise gc3libs.exceptions.ConfigurationError(
                "Invalid architecture: configuration file says `%s` but "
                "it actually is `%s`" %
                (str.join(', ', self.architecture), str.join(', ', arch)))

        exit_code, stdout, stderr = self.transport.execute_command('uname -s')
        self.running_kernel = stdout.strip()

        # ensure `time_cmd` points to a valid value
        self.time_cmd = self._locate_gnu_time()
        if not self.time_cmd:
            raise gc3libs.exceptions.ConfigurationError(
                "Unable to find GNU `time` installed on your system."
                " Please, install GNU time and set the `time_cmd`"
                " configuration option in gc3pie.conf.")

        if not self.override:
            # Ignore other values.
            return

        if self.running_kernel == 'Linux':
            exit_code, stdout, stderr = self.transport.execute_command('nproc')
            max_cores = int(stdout)

            # get the amount of total memory from /proc/meminfo
            with self.transport.open('/proc/meminfo', 'r') as fd:
                for line in fd:
                    if line.startswith('MemTotal'):
                        self.total_memory = int(line.split()[1]) * Memory.KiB
                        break

        elif self.running_kernel == 'Darwin':
            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.ncpu')
            max_cores = int(stdout.split(':')[-1])

            exit_code, stdout, stderr = self.transport.execute_command(
                'sysctl hw.memsize')
            self.total_memory = int(stdout.split(':')[1]) * Memory.B

        if max_cores != self.max_cores:
            log.info(
                "Mismatch of value `max_cores` on resource '%s':"
                " configuration file says `max_cores=%d` while it's actually `%d`."
                " Updating current value.", self.name, self.max_cores,
                max_cores)
            self.max_cores = max_cores

        if self.total_memory != self.max_memory_per_core:
            log.info(
                "Mismatch of value `max_memory_per_core` on resource %s:"
                " configuration file says `max_memory_per_core=%s` while it's"
                " actually `%s`. Updating current value.", self.name,
                self.max_memory_per_core,
                self.total_memory.to_str('%g%s', unit=Memory.MB))
            self.max_memory_per_core = self.total_memory

        self.available_memory = self.total_memory