Esempio n. 1
0
    def _start(self):
        """Fork a child that execs ``self.cmd`` inside ``self.run_dir``.

        The parent stores the child's pid in ``self.__pid`` and returns
        right away. The child changes into ``self.run_dir``, closes every
        file descriptor it inherited, reopens stdin/stdout/stderr on the run
        log, writes the 'running' status and finally execs ``self.cmd`` with
        the environment returned by ``_safe_env()`` plus ``SIREPO_MPI_CORES``.

        We don't use pksubprocess. This method is not called from the MainThread
        so can't set signals.

        Raises:
            OSError: in the parent, when ``os.fork`` fails.
            BaseException: in the child, whatever stopped the start-up
                (normally the ``SystemExit`` from ``sys.exit(1)``), after it
                has been handed to ``_error_during_start``.
        """
        env = _safe_env()
        env['SIREPO_MPI_CORES'] = str(mpi.cfg.cores)
        try:
            pid = os.fork()
        except OSError as e:
            pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
            # Propagate the original OSError to the caller; there is no child.
            raise
        if pid != 0:
            # Parent: remember the child and get out of the way.
            pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
            self.__pid = pid
            return
        # Everything below runs only in the forked child.
        try:
            os.chdir(str(self.run_dir))
            #Don't os.setsid() so signals propagate properly
            maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
            if maxfd == resource.RLIM_INFINITY:
                # No hard limit to iterate up to, so fall back to the
                # configured ceiling.
                maxfd = runner.MAX_OPEN_FILES
            # Close every descriptor inherited from the parent (stdio
            # included); descriptors that are not open raise OSError, which
            # is ignored.
            for fd in range(0, maxfd):
                try:
                    os.close(fd)
                except OSError:
                    pass
            # The path is relative to run_dir (see chdir above). With all
            # descriptors closed the open is expected to land on fd 0, which
            # is then duplicated onto fds 1 and 2.
            sys.stdin = open(template_common.RUN_LOG, 'a+')
            assert sys.stdin.fileno() == 0
            os.dup2(0, 1)
            sys.stdout = os.fdopen(1, 'a+')
            os.dup2(0, 2)
            sys.stderr = os.fdopen(2, 'a+')
            pkdlog('{}: child will exec: {}', self.jid, self.cmd)
            sys.stderr.flush()
            try:
                simulation_db.write_status('running', self.run_dir)
                # Only returns (by raising) when the exec itself fails.
                os.execvpe(self.cmd[0], self.cmd, env=env)
            except BaseException as e:
                pkdlog(
                    '{}: execvp error: {} errno={}',
                    self.jid,
                    getattr(e, 'strerror', ''),
                    getattr(e, 'errno', ''),
                )
            finally:
                # The child must never fall back into the parent's code path.
                sys.exit(1)
        except BaseException as e:
            # NOTE: there's no lock here so just append to the log. This
            # really shouldn't happen, but it might (out of memory) so just
            # log to the run log and hope somebody notices
            self._error_during_start(e, pkdexc())
            raise
Esempio n. 2
0
    def _start(self):
        """Fork a child that execs ``self.cmd`` inside ``self.run_dir``.

        The parent stores the child's pid in ``self.__pid`` and returns
        right away. The child changes into ``self.run_dir``, closes every
        file descriptor it inherited, reopens stdin/stdout/stderr on the run
        log, writes the 'running' status and finally execs ``self.cmd`` with
        the environment returned by ``_safe_env()`` plus ``SIREPO_MPI_CORES``.

        We don't use pksubprocess. This method is not called from the MainThread
        so can't set signals.

        Raises:
            OSError: in the parent, when ``os.fork`` fails.
            BaseException: in the child, whatever stopped the start-up
                (normally the ``SystemExit`` from ``sys.exit(1)``), after it
                has been handed to ``_error_during_start``.
        """
        env = _safe_env()
        env['SIREPO_MPI_CORES'] = str(mpi.cfg.cores)
        try:
            pid = os.fork()
        except OSError as e:
            pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
            # Propagate the original OSError to the caller; there is no child.
            raise
        if pid != 0:
            # Parent: remember the child and get out of the way.
            pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
            self.__pid = pid
            return
        # Everything below runs only in the forked child.
        try:
            os.chdir(str(self.run_dir))
            #Don't os.setsid() so signals propagate properly
            maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
            if maxfd == resource.RLIM_INFINITY:
                # No hard limit to iterate up to, so fall back to the
                # configured ceiling.
                maxfd = runner.MAX_OPEN_FILES
            # Close every descriptor inherited from the parent (stdio
            # included); descriptors that are not open raise OSError, which
            # is ignored.
            for fd in range(0, maxfd):
                try:
                    os.close(fd)
                except OSError:
                    pass
            # The path is relative to run_dir (see chdir above). With all
            # descriptors closed the open is expected to land on fd 0, which
            # is then duplicated onto fds 1 and 2.
            sys.stdin = open(template_common.RUN_LOG, 'a+')
            assert sys.stdin.fileno() == 0
            os.dup2(0, 1)
            sys.stdout = os.fdopen(1, 'a+')
            os.dup2(0, 2)
            sys.stderr = os.fdopen(2, 'a+')
            pkdlog('{}: child will exec: {}', self.jid, self.cmd)
            sys.stderr.flush()
            try:
                simulation_db.write_status('running', self.run_dir)
                # Only returns (by raising) when the exec itself fails.
                os.execvpe(self.cmd[0], self.cmd, env=env)
            except BaseException as e:
                pkdlog(
                    '{}: execvp error: {} errno={}',
                    self.jid,
                    getattr(e, 'strerror', ''),
                    getattr(e, 'errno', ''),
                )
            finally:
                # The child must never fall back into the parent's code path.
                sys.exit(1)
        except BaseException as e:
            # NOTE: there's no lock here so just append to the log. This
            # really shouldn't happen, but it might (out of memory) so just
            # log to the run log and hope somebody notices
            self._error_during_start(e, pkdexc())
            raise
Esempio n. 3
0
    def _start(self):
        """Fork a child that execs ``self.cmd`` inside ``self.run_dir``.

        The parent stores the child's pid in ``self.pid`` and returns right
        away. The child changes into ``self.run_dir``, closes every file
        descriptor it inherited, reopens stdin/stdout/stderr on the run log,
        writes the 'running' status and finally execs ``self.cmd``.

        We don't use pksubprocess. This method is not called from the MainThread
        so can't set signals.

        Raises:
            OSError: in the parent, when ``os.fork`` fails.
            BaseException: in the child, whatever stopped the start-up
                (normally the ``SystemExit`` from ``sys.exit(1)``), after a
                line describing it has been appended to the run log.
        """
        try:
            pid = os.fork()
        except OSError as e:
            pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror,
                   e.errno)
            # Propagate the original OSError to the caller; there is no child.
            raise
        if pid != 0:
            # Parent: remember the child and get out of the way.
            pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
            self.pid = pid
            return
        # Everything below runs only in the forked child.
        try:
            os.chdir(str(self.run_dir))
            #Don't os.setsid() so signals propagate properly
            import resource
            maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
            if maxfd == resource.RLIM_INFINITY:
                # No hard limit to iterate up to, so fall back to the
                # module's ceiling.
                maxfd = _MAX_OPEN_FILES
            # Close every descriptor inherited from the parent (stdio
            # included); descriptors that are not open raise OSError, which
            # is ignored.
            for fd in range(0, maxfd):
                try:
                    os.close(fd)
                except OSError:
                    pass
            # The path is relative to run_dir (see chdir above). With all
            # descriptors closed the open is expected to land on fd 0, which
            # is then duplicated onto fds 1 and 2.
            sys.stdin = open(template_common.RUN_LOG, 'a+')
            assert sys.stdin.fileno() == 0
            os.dup2(0, 1)
            sys.stdout = os.fdopen(1, 'a+')
            os.dup2(0, 2)
            sys.stderr = os.fdopen(2, 'a+')
            pkdlog('{}: child will exec: {}', self.jid, self.cmd)
            sys.stderr.flush()
            try:
                simulation_db.write_status('running', self.run_dir)
                # Only returns (by raising) when the exec itself fails.
                os.execvp(self.cmd[0], self.cmd)
            except BaseException as e:
                # Bind the failure here: no exception variable exists in the
                # child otherwise, so logging from ``finally`` would itself
                # raise and hide the real error.
                pkdlog(
                    '{}: execvp error: {} errno={}',
                    self.jid,
                    getattr(e, 'strerror', ''),
                    getattr(e, 'errno', ''),
                )
            finally:
                # The child must never fall back into the parent's code path.
                sys.exit(1)
        except BaseException as e:
            # Leave a trace in the run log, then let the exception continue.
            with open(str(self.run_dir.join(template_common.RUN_LOG)),
                      'a') as f:
                f.write('{}: error starting simulation: {}'.format(
                    self.jid, e))
            raise
Esempio n. 4
0
    def _slot_start(self, slot):
        """Run this job in a docker container on the slot that was granted.

        Renders ``runner/docker.sh`` into ``runner-docker.sh`` in the run
        directory, starts a detached container on ``slot.host`` through
        ``_cmd``, stores what ``_cmd`` returns in ``self.__cid`` and then
        writes the 'running' status.

        POSIT: Job locked by caller

        Args:
            slot: the granted slot; ``host``, ``cores`` and ``gigabytes``
                are read from it
        """
        # __host is sentinel of the start attempt
        self.__host = slot.host
        # Values substituted into the runner script template.
        render_args = pkcollections.Dict(
            kill_secs=runner.KILL_TIMEOUT_SECS,
            run_dir=self.run_dir,
            run_log=self.run_dir.join(template_common.RUN_LOG),
            run_secs=self.run_secs(),
            sh_cmd=self.__sh_cmd(),
        )
        self.__image = _image()
        script = str(self.run_dir.join('runner-docker.sh'))
        rendered = pkjinja.render_resource('runner/docker.sh', render_args)
        with open(script, 'wb') as script_file:
            script_file.write(rendered)
        docker_opts = (
            '--cpus={}'.format(slot.cores),
            '--detach',
            #TODO(robnagler) other environ vars required?
            '--env=SIREPO_MPI_CORES={}'.format(slot.cores),
            '--init',
            '--memory={}g'.format(slot.gigabytes),
            '--name={}'.format(self.__cname),
            '--network=none',
            #TODO(robnagler) this doesn't do anything
            #            '--ulimit=cpu=1',
            # do not use a user name, because that may not map inside the
            # container properly. /etc/passwd on the host and guest are
            # different.
            '--user={}'.format(os.getuid()),
        )
        #TODO(robnagler) make this configurable per code (would be structured)
        image_and_entry = (self.__image, 'bash', script)
        cmd = _RUN_PREFIX + docker_opts + self.__volumes() + image_and_entry
        self.__cid = _cmd(slot.host, cmd)
        simulation_db.write_status('running', self.run_dir)
        pkdlog(
            '{}: started slot={} cid={} dir={} cmd={}',
            self.__cname, slot, self.__cid, self.run_dir, cmd,
        )
Esempio n. 5
0
    def _start_job(self):
        """Fork a child that execs ``self.cmd`` inside ``self.run_dir``.

        The child changes into ``self.run_dir``, closes every file
        descriptor it inherited, reopens stdin/stdout/stderr on the run log,
        writes the 'running' status and finally execs ``self.cmd``.

        We don't use pksubprocess. This method is not called from the MainThread
        so can't set signals.

        Returns:
            int: pid of the forked child (parent process only; the child
            never returns from this method).

        Raises:
            OSError: in the parent, when ``os.fork`` fails.
            BaseException: in the child, whatever stopped the start-up
                (normally the ``SystemExit`` from ``sys.exit(1)``), after a
                line describing it has been appended to the run log.
        """
        try:
            pid = os.fork()
        except OSError as e:
            pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
            # Propagate the original OSError to the caller; there is no child.
            raise
        if pid != 0:
            # Parent: hand the child's pid back to the caller.
            pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
            return pid
        # Everything below runs only in the forked child.
        try:
            os.chdir(str(self.run_dir))
            #Don't os.setsid() so signals propagate properly
            import resource
            maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
            if maxfd == resource.RLIM_INFINITY:
                # No hard limit to iterate up to, so use a fixed ceiling.
                maxfd = 1024
            # Close every descriptor inherited from the parent (stdio
            # included); descriptors that are not open raise OSError, which
            # is ignored.
            for fd in range(0, maxfd):
                try:
                    os.close(fd)
                except OSError:
                    pass
            # The path is relative to run_dir (see chdir above). With all
            # descriptors closed the open is expected to land on fd 0, which
            # is then duplicated onto fds 1 and 2.
            sys.stdin = open(template_common.RUN_LOG, 'a+')
            assert sys.stdin.fileno() == 0
            os.dup2(0, 1)
            sys.stdout = os.fdopen(1, 'a+')
            os.dup2(0, 2)
            sys.stderr = os.fdopen(2, 'a+')
            pkdlog('{}: child will exec: {}', self.jid, self.cmd)
            sys.stderr.flush()
            try:
                simulation_db.write_status('running', self.run_dir)
                # Only returns (by raising) when the exec itself fails.
                os.execvp(self.cmd[0], self.cmd)
            except BaseException as e:
                # Bind the failure here: no exception variable exists in the
                # child otherwise, so logging from ``finally`` would itself
                # raise and hide the real error.
                pkdlog(
                    '{}: execvp error: {} errno={}',
                    self.jid,
                    getattr(e, 'strerror', ''),
                    getattr(e, 'errno', ''),
                )
            finally:
                # The child must never fall back into the parent's code path.
                sys.exit(1)
        except BaseException as e:
            # Leave a trace in the run log, then let the exception continue.
            with open(str(self.run_dir.join(template_common.RUN_LOG)), 'a') as f:
                f.write('{}: error starting simulation: {}'.format(self.jid, e))
            raise
Esempio n. 6
0
    def _error_during_start(self, exception, stack):
        """An exception happened, log what you can.

        Callback from implementations

        Three reports are attempted, in this order: append a message to the
        run log, write the 'error' status with ``simulation_db.write_status``
        and emit a ``pkdlog`` line. Each is tried on its own, so one failing
        (e.g. the run log cannot be opened) does not prevent the others.
        No exception derived from ``Exception`` escapes this method.

        POSIT: job already locked or in subprocess (see Background._start)

        Args:
            exception: the exception that prevented the job from starting
            stack: text written after the message on its own line;
                presumably a formatted stack trace -- confirm with callers
        """
        try:
            with open(str(self.run_dir.join(template_common.RUN_LOG)), 'a') as f:
                f.write(
                    '{}: error starting simulation: {}\n{}'.format(
                        self.jid,
                        exception,
                        stack,
                    ),
                )
        except Exception:
            # Best effort: we are already handling a failure, keep going.
            pass
        try:
            simulation_db.write_status('error', self.run_dir)
        except Exception:
            # Best effort: still try to emit the pkdlog line below.
            pass
        try:
            pkdlog('{}: unable to start job: {} {}', self.jid, exception, stack)
        except Exception:
            # Nothing left to report to.
            pass