def _start(self):
    """Fork a child that execs ``self.cmd`` with stdio sent to the run log.

    We don't use pksubprocess. This method is not called from the
    MainThread so can't set signals.

    Parent: records the child's pid in ``self.__pid`` and returns ``None``.

    Child: changes to ``self.run_dir``, closes every inherited file
    descriptor, reopens fds 0, 1 and 2 on the run log, writes the
    ``running`` status and execs ``self.cmd`` with the environment from
    ``_safe_env()`` plus ``SIREPO_MPI_CORES``. The child never returns
    normally: either the exec replaces the process or ``sys.exit(1)`` is
    raised. Anything raised in the child (including that ``SystemExit``)
    is passed to ``self._error_during_start`` and then re-raised.

    Raises:
        OSError: if ``os.fork`` fails (logged, then re-raised unchanged).
    """
    env = _safe_env()
    env['SIREPO_MPI_CORES'] = str(mpi.cfg.cores)
    try:
        pid = os.fork()
    except OSError as e:
        pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
        # A bare ``raise`` re-raises the OSError for the caller.
        raise
    if pid != 0:
        # Parent: remember the child and get out of the way.
        pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
        self.__pid = pid
        return
    # Child from here on.
    try:
        os.chdir(str(self.run_dir))
        #Don't os.setsid() so signals propagate properly
        maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        if maxfd == resource.RLIM_INFINITY:
            maxfd = runner.MAX_OPEN_FILES
        # Drop everything inherited from the parent. closerange ignores
        # errors on descriptors that are not open.
        os.closerange(0, maxfd)
        # All descriptors are closed, so this open is expected to land on
        # fd 0 (checked by the assert). It happens after the chdir, so a
        # relative RUN_LOG name resolves inside run_dir.
        sys.stdin = open(template_common.RUN_LOG, 'a+')
        assert sys.stdin.fileno() == 0
        os.dup2(0, 1)
        sys.stdout = os.fdopen(1, 'a+')
        os.dup2(0, 2)
        sys.stderr = os.fdopen(2, 'a+')
        pkdlog('{}: child will exec: {}', self.jid, self.cmd)
        sys.stderr.flush()
        try:
            simulation_db.write_status('running', self.run_dir)
            os.execvpe(self.cmd[0], self.cmd, env=env)
        except BaseException as e:
            # Not every exception carries strerror/errno (e.g. a failure
            # inside write_status), so fall back to empty strings.
            pkdlog(
                '{}: execvp error: {} errno={}',
                self.jid,
                getattr(e, 'strerror', ''),
                getattr(e, 'errno', ''),
            )
        finally:
            # Only reached when the exec did not replace this process.
            sys.exit(1)
    except BaseException as e:
        # NOTE: there's no lock here so just append to the log. This
        # really shouldn't happen, but it might (out of memory) so just
        # log to the run log and hope somebody notices
        self._error_during_start(e, pkdexc())
        raise
def _start(self):
    """Fork a child that execs ``self.cmd`` with stdio sent to the run log.

    We don't use pksubprocess. This method is not called from the
    MainThread so can't set signals.

    Parent: records the child's pid in ``self.pid`` and returns ``None``.

    Child: changes to ``self.run_dir``, closes every inherited file
    descriptor, reopens fds 0, 1 and 2 on the run log, writes the
    ``running`` status and execs ``self.cmd``. The child never returns
    normally: either the exec replaces the process or ``sys.exit(1)`` is
    raised. Anything raised in the child (including that ``SystemExit``)
    is appended to the run log and then re-raised.

    Raises:
        OSError: if ``os.fork`` fails (logged, then re-raised unchanged).
    """
    try:
        pid = os.fork()
    except OSError as e:
        pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
        # A bare ``raise`` re-raises the OSError for the caller.
        raise
    if pid != 0:
        # Parent: remember the child and get out of the way.
        pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
        self.pid = pid
        return
    # Child from here on.
    try:
        os.chdir(str(self.run_dir))
        #Don't os.setsid() so signals propagate properly
        import resource
        maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        if maxfd == resource.RLIM_INFINITY:
            maxfd = _MAX_OPEN_FILES
        # Drop everything inherited from the parent. closerange ignores
        # errors on descriptors that are not open.
        os.closerange(0, maxfd)
        # All descriptors are closed, so this open is expected to land on
        # fd 0 (checked by the assert). It happens after the chdir, so a
        # relative RUN_LOG name resolves inside run_dir.
        sys.stdin = open(template_common.RUN_LOG, 'a+')
        assert sys.stdin.fileno() == 0
        os.dup2(0, 1)
        sys.stdout = os.fdopen(1, 'a+')
        os.dup2(0, 2)
        sys.stderr = os.fdopen(2, 'a+')
        pkdlog('{}: child will exec: {}', self.jid, self.cmd)
        sys.stderr.flush()
        try:
            simulation_db.write_status('running', self.run_dir)
            os.execvp(self.cmd[0], self.cmd)
        except BaseException as e:
            # Bind the failure here: in a ``finally`` clause there is no
            # ``e`` to report. Not every exception carries
            # strerror/errno, so fall back to empty strings.
            pkdlog(
                '{}: execvp error: {} errno={}',
                self.jid,
                getattr(e, 'strerror', ''),
                getattr(e, 'errno', ''),
            )
        finally:
            # Only reached when the exec did not replace this process.
            sys.exit(1)
    except BaseException as e:
        # No lock is taken here; just append to the run log.
        with open(str(self.run_dir.join(template_common.RUN_LOG)), 'a') as f:
            f.write('{}: error starting simulation: {}'.format(
                self.jid, e))
        raise
def _slot_start(self, slot):
    """Have a slot so now ask docker to run the job

    Renders ``runner-docker.sh`` into the run directory, builds the run
    command from the slot's cores/gigabytes and this job's volumes and
    image, executes it on ``slot.host`` via ``_cmd`` and then writes the
    ``running`` status.

    POSIT: Job locked by caller

    Args:
        slot: supplies ``host``, ``cores`` and ``gigabytes`` for the run
    """
    # __host is sentinel of the start attempt
    self.__host = slot.host
    render_args = pkcollections.Dict(
        kill_secs=runner.KILL_TIMEOUT_SECS,
        run_dir=self.run_dir,
        run_log=self.run_dir.join(template_common.RUN_LOG),
        run_secs=self.run_secs(),
        sh_cmd=self.__sh_cmd(),
    )
    self.__image = _image()
    script_path = str(self.run_dir.join('runner-docker.sh'))
    with open(script_path, 'wb') as out:
        out.write(pkjinja.render_resource('runner/docker.sh', render_args))
    run_opts = (
        '--cpus={}'.format(slot.cores),
        '--detach',
        #TODO(robnagler) other environ vars required?
        '--env=SIREPO_MPI_CORES={}'.format(slot.cores),
        '--init',
        '--memory={}g'.format(slot.gigabytes),
        '--name={}'.format(self.__cname),
        '--network=none',
        #TODO(robnagler) this doesn't do anything
        # '--ulimit=cpu=1',
        # do not use a user name, because that may not map inside the
        # container properly. /etc/passwd on the host and guest are
        # different.
        '--user={}'.format(os.getuid()),
    )
    #TODO(robnagler) make this configurable per code (would be structured)
    image_and_script = (self.__image, 'bash', script_path)
    docker_cmd = _RUN_PREFIX + run_opts + self.__volumes() + image_and_script
    # The log line below labels this value "cid"; presumably the container
    # id returned by the run command -- confirm against _cmd.
    self.__cid = _cmd(slot.host, docker_cmd)
    simulation_db.write_status('running', self.run_dir)
    pkdlog(
        '{}: started slot={} cid={} dir={} cmd={}',
        self.__cname,
        slot,
        self.__cid,
        self.run_dir,
        docker_cmd,
    )
def _start_job(self):
    """Fork a child that execs ``self.cmd`` with stdio sent to the run log.

    We don't use pksubprocess. This method is not called from the
    MainThread so can't set signals.

    Child: changes to ``self.run_dir``, closes every inherited file
    descriptor, reopens fds 0, 1 and 2 on the run log, writes the
    ``running`` status and execs ``self.cmd``. The child never returns
    normally: either the exec replaces the process or ``sys.exit(1)`` is
    raised. Anything raised in the child (including that ``SystemExit``)
    is appended to the run log and then re-raised.

    Returns:
        int: pid of the forked child (parent process only)

    Raises:
        OSError: if ``os.fork`` fails (logged, then re-raised unchanged).
    """
    try:
        pid = os.fork()
    except OSError as e:
        pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
        # A bare ``raise`` re-raises the OSError for the caller.
        raise
    if pid != 0:
        # Parent: hand the child's pid back to the caller.
        pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
        return pid
    # Child from here on.
    try:
        os.chdir(str(self.run_dir))
        #Don't os.setsid() so signals propagate properly
        import resource
        maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        if maxfd == resource.RLIM_INFINITY:
            maxfd = 1024
        # Drop everything inherited from the parent. closerange ignores
        # errors on descriptors that are not open.
        os.closerange(0, maxfd)
        # All descriptors are closed, so this open is expected to land on
        # fd 0 (checked by the assert). It happens after the chdir, so a
        # relative RUN_LOG name resolves inside run_dir.
        sys.stdin = open(template_common.RUN_LOG, 'a+')
        assert sys.stdin.fileno() == 0
        os.dup2(0, 1)
        sys.stdout = os.fdopen(1, 'a+')
        os.dup2(0, 2)
        sys.stderr = os.fdopen(2, 'a+')
        pkdlog('{}: child will exec: {}', self.jid, self.cmd)
        sys.stderr.flush()
        try:
            simulation_db.write_status('running', self.run_dir)
            os.execvp(self.cmd[0], self.cmd)
        except BaseException as e:
            # Bind the failure here: in a ``finally`` clause there is no
            # ``e`` to report. Not every exception carries
            # strerror/errno, so fall back to empty strings.
            pkdlog(
                '{}: execvp error: {} errno={}',
                self.jid,
                getattr(e, 'strerror', ''),
                getattr(e, 'errno', ''),
            )
        finally:
            # Only reached when the exec did not replace this process.
            sys.exit(1)
    except BaseException as e:
        # No lock is taken here; just append to the run log.
        with open(str(self.run_dir.join(template_common.RUN_LOG)), 'a') as f:
            f.write('{}: error starting simulation: {}'.format(self.jid, e))
        raise
def _error_during_start(self, exception, stack):
    """An exception happened, log what you can.

    Callback from implementations

    POSIT: job already locked or in subprocess (see Background._start)

    Three best-effort steps are attempted independently, so a failure in
    one (e.g. the run log cannot be opened) does not prevent the others:
    append the error to the run log, write the ``error`` status, and emit
    a ``pkdlog`` line. No ``Exception`` escapes this method.

    Args:
        exception: the exception that prevented the job from starting
        stack: stack text written after the exception message
    """
    try:
        with open(str(self.run_dir.join(template_common.RUN_LOG)), 'a') as f:
            f.write(
                '{}: error starting simulation: {}\n{}'.format(
                    self.jid,
                    exception,
                    stack,
                ),
            )
    except Exception:
        # Best effort: still try to record the status and server log.
        pass
    try:
        simulation_db.write_status('error', self.run_dir)
    except Exception:
        # Best effort: still try to emit the server log line.
        pass
    try:
        pkdlog('{}: unable to start job: {} {}', self.jid, exception, stack)
    except Exception:
        # Nothing left to report to; we are already handling an error.
        pass