def mesos_directory():
    """Change into ``$MESOS_DIRECTORY`` when it is set and differs from cwd.

    Nothing happens when the variable is absent from the environment, or when
    its absolute form already equals the absolute current working directory.
    """
    if "MESOS_DIRECTORY" not in os.environ:
        return
    current = os.path.abspath(os.getcwd())
    target = os.path.abspath(os.environ["MESOS_DIRECTORY"])
    if target == current:
        return
    log.info("Changing directory to MESOS_DIRECTORY=%s", target)
    os.chdir(target)
def remove(self, *args, **kwargs):
    """Remove the state directories selected by ``self.dirs(*args, **kwargs)``.

    An exclusive, non-blocking lock on ``self.lock`` is taken for the whole
    sweep and always released afterwards.  For every directory whose state
    can be loaded, a single ``rm -rf`` is run over the directory itself,
    ``state._mesos()`` and, when a container ID is recorded,
    ``state._docker()``.

    Returns:
        0 when the lock is unavailable and ``self.optimistic`` is set;
        4 when at least one ``rm`` invocation failed; ``None`` otherwise.

    Raises:
        deimos.flock.Err: the lock is unavailable and ``self.optimistic`` is
            not set.
    """
    errors = 0
    lk = deimos.flock.LK(self.lock, LOCK_EX | LOCK_NB)
    try:
        lk.lock()
    except deimos.flock.Err:
        msg = "Lock unavailable -- is cleanup already running?"
        if self.optimistic:
            log.info(msg)
            return 0
        log.error(msg)
        # Re-raise the active lock error.  The previous ``raise e`` named a
        # variable that was never bound and so raised NameError instead.
        raise
    try:
        for d in self.dirs(*args, **kwargs):
            state = deimos.state.state(d)
            if state is None:
                log.warning("Not able to load state from: %s", d)
                continue
            try:
                cmd = ["rm", "-rf", d + "/"]
                cmd += [state._mesos()]
                if state.cid() is not None:
                    cmd += [state._docker()]
                Run()(cmd)
            except subprocess.CalledProcessError:
                # Keep sweeping; failures are reported in aggregate below.
                errors += 1
    finally:
        lk.unlock()
    if errors != 0:
        log.error("There were failures on %d directories", errors)
        return 4
def load_configuration(f=None, interactive=sys.stdout.isatty()):
    """Assemble the effective configuration and initialize logging from it.

    Built-in defaults are created first.  A configuration file (``f`` when
    truthy, otherwise whatever ``path()`` returns) is parsed and merged over
    the defaults when one is available.  Logging is initialized before a
    load failure is reported, so the report goes through the logger.

    Args:
        f: Configuration file to parse; ``path()`` is consulted when falsy.
        interactive: Picks the default log sinks: DEBUG on the console when
            true, INFO to syslog otherwise.  The default value is computed
            once, when the function is defined.

    Returns:
        The defaults, merged with the parsed settings when a file was parsed.

    Exits the process with status 16 when locating or parsing the file raised.
    """
    console_level = logging.DEBUG if interactive else None
    syslog_level = None if interactive else logging.INFO
    defaults = _Struct(
        docker=Docker(),
        index=DockerIndex(),
        containers=Containers(),
        uris=URIs(),
        state=State(),
        log=Log(console=console_level, syslog=syslog_level),
    )
    failure = None
    parsed = None
    try:
        f = f or path()
        if f:
            parsed = parse(f)
    except Exception as e:
        failure = e
    finally:
        # Runs on every exit path so the logger is configured before any
        # outcome (success or failure) is reported.
        confs = defaults.merge(parsed) if parsed else defaults
        deimos.logger.initialize(**dict(confs.log.items()))
        if failure:
            prefix = ("Error loading %s: " % f) if f else ""
            log.exception(prefix + str(failure))
            sys.exit(16)
        if parsed:
            log.info("Loaded configuration from %s" % f)
            for _, conf in parsed.items():
                log.debug("Found: %r", conf)
    return confs
def load_configuration(f=None, interactive=sys.stdout.isatty()):
    """Assemble the effective configuration and initialize logging from it.

    Built-in defaults (including the ``hooks`` section) are created first.
    A configuration file (``f`` when truthy, otherwise whatever ``path()``
    returns) is parsed and merged over the defaults when one is available.
    Logging is initialized before a load failure is reported, so the report
    goes through the logger.

    Args:
        f: Configuration file to parse; ``path()`` is consulted when falsy.
        interactive: Picks the default log sinks: DEBUG on the console when
            true, INFO to syslog otherwise.  The default value is computed
            once, when the function is defined.

    Returns:
        The defaults, merged with the parsed settings when a file was parsed.

    Exits the process with status 16 when locating or parsing the file raised.
    """
    console_level = logging.DEBUG if interactive else None
    syslog_level = None if interactive else logging.INFO
    defaults = _Struct(
        docker=Docker(),
        index=DockerIndex(),
        containers=Containers(),
        uris=URIs(),
        state=State(),
        hooks=Hooks(),
        log=Log(console=console_level, syslog=syslog_level),
    )
    failure = None
    parsed = None
    try:
        f = f or path()
        if f:
            parsed = parse(f)
    except Exception as e:
        failure = e
    finally:
        # Runs on every exit path so the logger is configured before any
        # outcome (success or failure) is reported.
        confs = defaults.merge(parsed) if parsed else defaults
        deimos.logger.initialize(**dict(confs.log.items()))
        if failure:
            prefix = ("Error loading %s: " % f) if f else ""
            log.exception(prefix + str(failure))
            sys.exit(16)
        if parsed:
            log.info("Loaded configuration from %s" % f)
            for _, conf in parsed.items():
                log.debug("Found: %r", conf)
    return confs
def remove(self, *args, **kwargs):
    """Remove the state directories selected by ``self.dirs(*args, **kwargs)``.

    An exclusive, non-blocking lock on ``self.lock`` is taken for the whole
    sweep and always released afterwards.  For every directory whose state
    can be loaded, a single ``rm -rf`` is run over the directory itself,
    ``state._mesos()`` and, when a container ID is recorded,
    ``state._docker()``.

    Returns:
        0 when the lock is unavailable and ``self.optimistic`` is set;
        4 when at least one ``rm`` invocation failed; ``None`` otherwise.

    Raises:
        deimos.flock.Err: the lock is unavailable and ``self.optimistic`` is
            not set.
    """
    errors = 0
    lk = deimos.flock.LK(self.lock, LOCK_EX | LOCK_NB)
    try:
        lk.lock()
    except deimos.flock.Err:
        msg = "Lock unavailable -- is cleanup already running?"
        if self.optimistic:
            log.info(msg)
            return 0
        log.error(msg)
        # Re-raise the active lock error.  The previous ``raise e`` named a
        # variable that was never bound and so raised NameError instead.
        raise
    try:
        for d in self.dirs(*args, **kwargs):
            state = deimos.state.state(d)
            if state is None:
                log.warning("Not able to load state from: %s", d)
                continue
            try:
                cmd = ["rm", "-rf", d + "/"]
                cmd += [state._mesos()]
                if state.cid() is not None:
                    cmd += [state._docker()]
                Run()(cmd)
            except subprocess.CalledProcessError:
                # Keep sweeping; failures are reported in aggregate below.
                errors += 1
    finally:
        lk.unlock()
    if errors != 0:
        log.error("There were failures on %d directories", errors)
        return 4
def stop_docker_and_resume(self, signum):
    """Signal callback: try to stop the tracked container, then resume.

    When ``self.state`` is set and has a container ID, ``docker stop`` is
    attempted for it; a failing stop command is ignored.  ``signum`` is not
    used.

    Returns:
        A fresh ``deimos.sig.Resume()`` in every case.
    """
    tracked = self.state
    if tracked is None or tracked.cid() is None:
        return deimos.sig.Resume()
    container = tracked.cid()
    log.info("Trying to stop Docker container: %s", container)
    try:
        Run()(deimos.docker.stop(container))
    except subprocess.CalledProcessError:
        # Best effort only: a failed stop must not break signal handling.
        pass
    return deimos.sig.Resume()
def destroy(self, destroy_pb, *args):
    """Stop the Docker container behind a Mesos container, if still running.

    The Mesos container ID is read from ``destroy_pb.container_id.value``.
    After the launch has been awaited and the "destroy" lock taken,
    ``docker stop`` is run unless an exit status is already recorded.

    Returns:
        Always 0.
    """
    log.info(" ".join(args))
    mesos_id = destroy_pb.container_id.value
    state = deimos.state.State(self.state_root, mesos_id=mesos_id)
    state.await_launch()
    # The lock object stays bound to a local for the rest of the call.
    destroy_lock = state.lock("destroy", LOCK_EX)
    already_exited = state.exit() is not None
    if already_exited:
        log.info("Container is stopped")
    else:
        Run()(deimos.docker.stop(state.cid()))
    return 0
def await_cid(self, seconds=60):
    """Block until the container ID is available, or time out.

    ``self.cid(refresh=True)`` is polled with growing pauses: 1.0, 1.25,
    1.6, ... 8.0 seconds, then the same steps multiplied by 10, by 100, and
    so on.  Each pause is capped at the time left in the budget, so the
    timeout fires at ``seconds`` rather than after the next full pause, and
    the CID is checked one final time before giving up.

    Args:
        seconds: Overall time budget, in seconds.

    Raises:
        CIDTimeout: no CID was available within ``seconds``.
    """
    start = time.time()
    steps = [1.0, 1.25, 1.6, 2.0, 2.5, 3.2, 4.0, 5.0, 6.4, 8.0]
    scales = (10.0 ** n for n in itertools.count())
    scaled = ([scale * step for step in steps] for scale in scales)
    sleeps = itertools.chain.from_iterable(scaled)
    log.info("Awaiting CID file: %s", self.resolve("cid"))
    deadline = start + seconds
    while self.cid(refresh=True) in [None, ""]:
        remaining = deadline - time.time()
        if remaining <= 0:
            raise CIDTimeout("No CID file after %ds" % seconds)
        # Never sleep past the deadline.
        time.sleep(min(next(sleeps), remaining))
def await_cid(self, seconds=60):
    """Block until the container ID is available, or time out.

    ``self.cid(refresh=True)`` is polled with growing pauses: 1.0, 1.25,
    1.6, ... 8.0 seconds, then the same steps multiplied by 10, by 100, and
    so on.  Each pause is capped at the time left in the budget, so the
    timeout fires at ``seconds`` rather than after the next full pause, and
    the CID is checked one final time before giving up.

    Args:
        seconds: Overall time budget, in seconds.

    Raises:
        CIDTimeout: no CID was available within ``seconds``.
    """
    start = time.time()
    steps = [1.0, 1.25, 1.6, 2.0, 2.5, 3.2, 4.0, 5.0, 6.4, 8.0]
    scales = (10.0**n for n in itertools.count())
    scaled = ([scale * step for step in steps] for scale in scales)
    sleeps = itertools.chain.from_iterable(scaled)
    log.info("Awaiting CID file: %s", self.resolve("cid"))
    deadline = start + seconds
    while self.cid(refresh=True) in [None, ""]:
        remaining = deadline - time.time()
        if remaining <= 0:
            raise CIDTimeout("No CID file after %ds" % seconds)
        # Never sleep past the deadline.
        time.sleep(min(next(sleeps), remaining))
def usage(self, usage_pb, *args):
    """Write resource statistics for a running container.

    The Mesos container ID is read from ``usage_pb.container_id.value``.
    Nothing is written when the container has no Docker ID yet, has an exit
    status recorded, or exposes no cgroups.  Otherwise a
    ``ResourceStatistics`` record with the memory limit, CPU limit and RSS
    is written through ``recordio``.

    Returns:
        Always 0.

    Raises:
        AttributeError: re-raised after logging when gathering or writing the
            statistics hits a missing attribute.
    """
    log.info(" ".join(args))
    mesos_id = usage_pb.container_id.value
    state = deimos.state.State(self.state_root, mesos_id=mesos_id)
    state.await_launch()
    state.ids()
    if state.cid() is None:
        log.info("Container not started?")
        return 0
    if state.exit() is not None:
        log.info("Container is stopped")
        return 0
    cgroup_paths = deimos.docker.cgroups(state.cid())
    cg = deimos.cgroups.CGroups(**cgroup_paths)
    if len(cg.keys()) == 0:
        log.info("Container has no CGroups...already stopped?")
        return 0
    try:
        stats = dict(
            timestamp=time.time(),
            mem_limit_bytes=cg.memory.limit(),
            cpus_limit=cg.cpu.limit(),
            # cpus_user_time_secs = cg.cpuacct.user_time(),
            # cpus_system_time_secs = cg.cpuacct.system_time(),
            mem_rss_bytes=cg.memory.rss(),
        )
        recordio.write(ResourceStatistics, **stats)
    except AttributeError as e:
        log.error("Missing CGroup!")
        raise e
    return 0
def usage(self, usage_pb, *args):
    """Write resource statistics for a running container.

    The Mesos container ID is read from ``usage_pb.container_id.value``.
    Nothing is written when the container has no Docker ID yet, has an exit
    status recorded, or exposes no cgroups.  Otherwise a
    ``ResourceStatistics`` record with the memory limit, CPU limit and RSS
    is written through ``recordio``.

    Returns:
        Always 0.

    Raises:
        AttributeError: re-raised after logging when gathering or writing the
            statistics hits a missing attribute.
    """
    log.info(" ".join(args))
    mesos_id = usage_pb.container_id.value
    state = deimos.state.State(self.state_root, mesos_id=mesos_id)
    state.await_launch()
    state.ids()
    if state.cid() is None:
        log.info("Container not started?")
        return 0
    if state.exit() is not None:
        log.info("Container is stopped")
        return 0
    cgroup_paths = deimos.docker.cgroups(state.cid())
    cg = deimos.cgroups.CGroups(**cgroup_paths)
    if len(cg.keys()) == 0:
        log.info("Container has no CGroups...already stopped?")
        return 0
    try:
        stats = dict(
            timestamp=time.time(),
            mem_limit_bytes=cg.memory.limit(),
            cpus_limit=cg.cpu.limit(),
            # cpus_user_time_secs = cg.cpuacct.user_time(),
            # cpus_system_time_secs = cg.cpuacct.system_time(),
            mem_rss_bytes=cg.memory.rss(),
        )
        recordio.write(ResourceStatistics, **stats)
    except AttributeError as e:
        log.error("Missing CGroup!")
        raise e
    return 0
def observe(self, *args):
    """Wait for the container named by ``args[0]`` and return its exit value.

    The state is stored on ``self.state`` and the stop-and-resume signal
    handler installed before blocking.  A shared "wait" lock is requested
    with no timeout; if that request is interrupted (``EINTR``) it is
    retried once with a one second timeout.

    Returns:
        ``state.exit()`` once it is not ``None``.

    Raises:
        IOError: the lock request failed for a reason other than ``EINTR``.
        Err: the lock was obtained but no exit value is recorded.
    """
    log.info(" ".join(args))
    state = deimos.state.State(self.state_root, mesos_id=args[0])
    self.state = state
    deimos.sig.install(self.stop_docker_and_resume)
    state.await_launch()
    try:
        # Take the wait lock to block calls to wait()
        state.lock("wait", LOCK_SH, seconds=None)
    except IOError as e:
        # Allows for signal recovery
        interrupted = e.errno == errno.EINTR
        if not interrupted:
            raise e
        state.lock("wait", LOCK_SH, seconds=1)
    if state.exit() is None:
        raise Err("Wait lock is not held nor is exit file present")
    return state.exit()
def destroy(self, container_id, *args):
    """Stop the Docker container behind ``container_id`` if still running.

    After the launch has been awaited and the "destroy" lock taken,
    ``docker stop`` is run only when no exit status has been recorded yet.
    A recorded exit status means the container has already stopped.  An
    ``ExternalStatus`` acknowledgement is emitted unless STDOUT is closed.

    Returns:
        Always 0.
    """
    log.info(" ".join([container_id] + list(args)))
    state = deimos.state.State(self.state_root, mesos_id=container_id)
    state.await_launch()
    # Bound to a local so the lock object lives until the call returns.
    lk_d = state.lock("destroy", LOCK_EX)
    # The test used to be inverted (``is not None``): it tried to stop
    # containers that had already exited and reported running ones as
    # stopped.
    if state.exit() is None:
        Run()(deimos.docker.stop(state.cid()))
    else:
        log.info("Container is stopped")
    if not sys.stdout.closed:
        # If we're called as part of the signal handler set up by launch,
        # STDOUT is probably closed already. Writing the Protobuf would
        # only result in a bevy of error messages.
        proto_out(protos.ExternalStatus, message="destroy: ok")
    return 0
def lock(self, name, flags, seconds=60):
    """Acquire the named lock file under this state's ``lock`` directory.

    The request, and then its success or failure, are logged with the lock
    name, the formatted flags and the requested timeout.  When the effective
    flags include ``LOCK_EX``, a line made of ``iso()`` plus a newline is
    written to the lock's handle.

    Args:
        name: Lock name; resolved as ``lock/<name>`` (created on demand).
        flags: Lock flags, normalized through ``deimos.flock.nb_seconds``.
        seconds: Timeout in seconds; ``None`` is logged as "indefinite".

    Returns:
        The acquired ``deimos.flock.LK`` object.

    Raises:
        deimos.flock.Err: re-raised after logging when locking fails.
    """
    if seconds is None:
        fmt_time = "indefinite"
    else:
        fmt_time = "%ds" % seconds
    fmt_flags = deimos.flock.format_lock_flags(flags)
    flags, seconds = deimos.flock.nb_seconds(flags, seconds)
    described = (name, fmt_flags, fmt_time)
    log.info("request // %s %s (%s)", *described)
    lock_path = self.resolve(os.path.join("lock", name), mkdir=True)
    held = deimos.flock.LK(lock_path, flags, seconds)
    try:
        held.lock()
    except deimos.flock.Err:
        log.error("failure // %s %s (%s)", *described)
        raise
    if flags & LOCK_EX:
        held.handle.write(iso() + "\n")
    log.info("success // %s %s (%s)", *described)
    return held
def run(options, image, command=None, env=None, cpus=None, mems=None,
        ports=None):
    """Build the ``docker run`` invocation for ``image``.

    Args:
        options: List of extra ``docker run`` options.  It is copied, so the
            caller's list is no longer modified when port mappings are added.
        image: Image to run.
        command: List appended after the image (defaults to empty).
        env: Dict, or iterable of ``(name, value)`` pairs, turned into
            ``-e NAME=VALUE`` options (defaults to empty).
        cpus: When truthy, passed as ``-c``.
        mems: When truthy, passed as ``-m``.
        ports: Allocated ports, paired in order with ``inner_ports(image)``
            to produce ``-p allocated:target`` options (defaults to empty).

    Returns:
        Whatever ``docker(*argv)`` returns for the assembled arguments.
    """
    command = [] if command is None else command
    env = {} if env is None else env
    ports = [] if ports is None else ports
    # Work on a copy: ``options += [...]`` on the argument itself extended
    # the caller's list in place.
    options = list(options)
    envs = env.items() if isinstance(env, dict) else env
    pairs = [("-e", "%s=%s" % (k, v)) for k, v in envs]
    if ports != []:
        # NB: Forces external call to pre-fetch image
        zip_longest = (getattr(itertools, "izip_longest", None) or
                       itertools.zip_longest)
        port_pairings = list(zip_longest(ports, inner_ports(image)))
        log.info("Port pairings (Mesos, Docker) // %r", port_pairings)
        for allocated, target in port_pairings:
            if allocated is None:
                log.warning("Container exposes more ports than were allocated")
                break
            # With no target left to pair with, map the port to itself.
            options += ["-p", "%d:%d" % (allocated, target or allocated)]
    argv = ["run"] + options
    if cpus:
        argv += ["-c", str(cpus)]
    if mems:
        argv += ["-m", str(mems)]
    for pair in pairs:
        argv.extend(pair)
    argv += [image] + command
    return docker(*argv)
def stop_docker_and_resume(self, signum):
    """Signal callback: unwire and stop the tracked container, then resume.

    When ``self.state`` is set and has a container ID, the container
    (identified by ``self.state.docker_id``) is first unwired from MidoNet;
    any exception there is logged with its traceback and otherwise ignored.
    ``docker stop`` is then attempted, and a failing stop command is
    ignored.  ``signum`` is not used.

    Returns:
        A fresh ``deimos.sig.Resume()`` in every case.
    """
    if self.state is None or self.state.cid() is None:
        return deimos.sig.Resume()
    cid = self.state.cid()
    log.info("Trying to stop Docker container: %s", cid)
    container_id = self.state.docker_id
    log.debug("Unwiring the container %s from MidoNet", container_id)
    try:
        log.info("state = %s", dir(self.state))
        midonet.unwire_container_from_midonet(container_id)
        log.debug(
            "Successfully unwired the container %s from MidoNet bridge",
            container_id)
    except Exception:
        # Unwiring is best effort; the container is stopped regardless.
        log.error(traceback.format_exc())
    try:
        Run()(deimos.docker.stop(cid))
    except subprocess.CalledProcessError:
        pass
    return deimos.sig.Resume()
def containers(self, *args):
    """Write the Mesos IDs of containers whose "wait" lock cannot be shared.

    Every ID printed by ``docker ps --no-trunc -q`` that has state on disk
    is probed with a non-blocking shared "wait" lock.  IDs for which the
    probe fails are collected into a ``Containers`` message, which is
    written with ``recordio.writeProto``.

    Returns:
        Always 0.
    """
    log.info(" ".join(args))
    listing = Run(data=True)(deimos.docker.docker("ps", "--no-trunc", "-q"))
    mesos_ids = []
    for raw in listing.splitlines():
        state = deimos.state.State(self.state_root, docker_id=raw.strip())
        if state.exists():
            try:
                state.lock("wait", LOCK_SH | LOCK_NB)
            except deimos.flock.Err:
                # LOCK_EX held, so launch() is running
                mesos_ids.append(state.mesos_container_id())
    message = Containers()
    for mesos_id in mesos_ids:
        message.containers.add().value = mesos_id
    recordio.writeProto(message)
    return 0
def wait(self, *args):
    """In ``--docker`` mode, wait for a container and return its exit value.

    Unless the first argument is ``--docker`` this returns ``None``
    immediately.  Otherwise ``args[1]`` is taken as a Docker CID: the state
    is stored on ``self.state``, the stop-and-resume signal handler is
    installed, and a shared "wait" lock is requested with no timeout
    (retried once with a one second timeout after ``EINTR``).

    Returns:
        ``None`` outside Docker mode, else ``state.exit()``.

    Raises:
        IOError: the lock request failed for a reason other than ``EINTR``.
        Err: the lock was obtained but no exit value is recorded.
    """
    log.info(" ".join(args))
    if args[:1] != ("--docker",):
        # We rely on the Mesos default wait strategy in general
        return
    # In Docker mode, we use Docker wait to wait for the container and
    # then exit with the returned exit code. The passed in ID should be a
    # Docker CID, not a Mesos container ID.
    state = deimos.state.State(self.state_root, docker_id=args[1])
    self.state = state
    deimos.sig.install(self.stop_docker_and_resume)
    state.await_launch()
    try:
        state.lock("wait", LOCK_SH, seconds=None)
    except IOError as e:
        # Allows for signal recovery
        interrupted = e.errno == errno.EINTR
        if not interrupted:
            raise e
        state.lock("wait", LOCK_SH, 1)
    if state.exit() is None:
        raise Err("Wait lock is not held nor is exit file present")
    return state.exit()
def wait(self, *args):
    """In ``--docker`` mode, wait for a container and return its exit value.

    Unless the first argument is ``--docker`` this returns ``None``
    immediately.  Otherwise ``args[1]`` is taken as a Docker CID: the state
    is stored on ``self.state``, ``self.signal_docker_and_resume`` is
    installed as the signal handler, and a shared "wait" lock is requested
    with no timeout (retried once with a one second timeout after
    ``EINTR``).

    Returns:
        ``None`` outside Docker mode, else ``state.exit()``.

    Raises:
        IOError: the lock request failed for a reason other than ``EINTR``.
        Err: the lock was obtained but no exit value is recorded.
    """
    log.info(" ".join(args))
    if args[:1] != ("--docker",):
        # We rely on the Mesos default wait strategy in general
        return
    # In Docker mode, we use Docker wait to wait for the container and
    # then exit with the returned exit code. The passed in ID should be a
    # Docker CID, not a Mesos container ID.
    state = deimos.state.State(self.state_root, docker_id=args[1])
    self.state = state
    deimos.sig.install(self.signal_docker_and_resume)
    state.await_launch()
    try:
        state.lock("wait", LOCK_SH, seconds=None)
    except IOError as e:
        # Allows for signal recovery
        interrupted = e.errno == errno.EINTR
        if not interrupted:
            raise e
        state.lock("wait", LOCK_SH, 1)
    if state.exit() is None:
        raise Err("Wait lock is not held nor is exit file present")
    return state.exit()
def place_uris(launchy, directory, optimistic_unpack=False):
    """Download every URI of ``launchy`` into ``directory``.

    ``directory`` is created first.  Each URI is fetched with ``curl`` into
    a file named after the URI's last path segment.  URIs without such a
    segment, and URIs whose download fails, are logged and skipped.  Files
    flagged executable are given mode 0755.  When ``optimistic_unpack`` is
    set and ``unpacker(uri)`` yields a command builder, the file is unpacked
    into ``directory`` and then deleted.

    Args:
        launchy: Object exposing ``uris``; each item has ``value`` and
            ``executable``.
        directory: Destination directory.
        optimistic_unpack: Whether to try unpacking downloaded files.
    """
    cmd = deimos.cmd.Run()
    cmd(["mkdir", "-p", directory])
    for item in launchy.uris:
        uri = item.value
        gen_unpack_cmd = unpacker(uri) if optimistic_unpack else None
        log.info("Retrieving URI: %s", deimos.cmd.escape([uri]))
        # ``split`` always yields at least one element, so the only failure
        # mode is an empty last segment (e.g. a URI ending in "/").
        basename = uri.split("/")[-1]
        if basename == "":
            log.info("Not able to determine basename: %r", uri)
            continue
        f = os.path.join(directory, basename)
        try:
            cmd(["curl", "-sSfL", uri, "--output", f])
        except subprocess.CalledProcessError:
            # Pass a one-element list, matching the "Retrieving" log above;
            # a bare string was passed here before.
            log.warning("Failed while processing URI: %s",
                        deimos.cmd.escape([uri]))
            continue
        if item.executable:
            os.chmod(f, 0o755)
        if gen_unpack_cmd is not None:
            log.info("Unpacking %s", f)
            cmd(gen_unpack_cmd(f, directory))
            cmd(["rm", "-f", f])
def place_uris(task, directory, optimistic_unpack=False):
    """Download every URI of ``task`` into ``directory``.

    ``directory`` is created first.  Each item of ``uris(task)`` is fetched
    with ``curl`` into a file named after the URI's last path segment.  URIs
    without such a segment, and URIs whose download fails, are logged and
    skipped.  Files flagged executable are given mode 0755.  When
    ``optimistic_unpack`` is set and ``unpacker(uri)`` yields a command
    builder, the file is unpacked into ``directory`` and then deleted.

    Args:
        task: Passed to ``uris()``; each resulting item has ``value`` and
            ``executable``.
        directory: Destination directory.
        optimistic_unpack: Whether to try unpacking downloaded files.
    """
    cmd = deimos.cmd.Run()
    cmd(["mkdir", "-p", directory])
    for item in uris(task):
        uri = item.value
        gen_unpack_cmd = unpacker(uri) if optimistic_unpack else None
        log.info("Retrieving URI: %s", deimos.cmd.escape([uri]))
        # ``split`` always yields at least one element, so the only failure
        # mode is an empty last segment (e.g. a URI ending in "/").
        basename = uri.split("/")[-1]
        if basename == "":
            log.info("Not able to determine basename: %r", uri)
            continue
        f = os.path.join(directory, basename)
        try:
            cmd(["curl", "-sSfL", uri, "--output", f])
        except subprocess.CalledProcessError:
            # Pass a one-element list, matching the "Retrieving" log above;
            # a bare string was passed here before.
            log.warning("Failed while processing URI: %s",
                        deimos.cmd.escape([uri]))
            continue
        if item.executable:
            os.chmod(f, 0o755)
        if gen_unpack_cmd is not None:
            log.info("Unpacking %s", f)
            cmd(gen_unpack_cmd(f, directory))
            cmd(["rm", "-f", f])
def ids(self, height=2):
    """Log whichever of the executor, Mesos and Docker IDs are known.

    Args:
        height: Passed to ``deimos.logger.logger`` to obtain the logger the
            IDs are reported through.
    """
    log = deimos.logger.logger(height)
    if self.eid() is not None:
        log.info("eid = %s", self.eid())
    mesos_known = self.mesos_container_id() is not None
    if mesos_known:
        log.info("mesos = %s", self.mesos_container_id())
    docker_known = self.cid() is not None
    if docker_known:
        log.info("docker = %s", self.cid())
def wait(self, wait_pb, *args):
    """Wait for a container to finish and report its termination.

    The Mesos container ID is read from ``wait_pb.container_id.value``.
    Shared "observe" and "wait" locks are requested, in that order, with no
    timeout; if interrupted (``EINTR``) both are requested again with a one
    second timeout.  A ``Termination`` record is then written whose status
    is the exit value (64 when none is recorded) shifted left by 8 bits.

    Returns:
        ``state.exit()`` when it is not ``None``.

    Raises:
        IOError: a lock request failed for a reason other than ``EINTR``.
        Err: no exit value is recorded after the locks were obtained.
    """
    log.info(" ".join(args))
    mesos_id = wait_pb.container_id.value
    state = deimos.state.State(self.state_root, mesos_id=mesos_id)
    self.state = state
    deimos.sig.install(self.stop_docker_and_resume)
    state.await_launch()

    def take_locks(timeout):
        # Wait for the observe lock so observe completes first
        state.lock("observe", LOCK_SH, seconds=timeout)
        state.lock("wait", LOCK_SH, seconds=timeout)

    try:
        take_locks(None)
    except IOError as e:
        # Allows for signal recovery
        if e.errno != errno.EINTR:
            raise e
        take_locks(1)
    if state.exit() is not None:
        code = state.exit()
    else:
        code = 64
    termination = code << 8
    recordio.write(Termination,
                   killed=False,
                   message="",
                   status=termination)
    if state.exit() is None:
        raise Err("Wait lock is not held nor is exit file present")
    return state.exit()
def wait(self, *args):
    """Wait for a container to finish and report its termination.

    With ``--observe-docker`` (or ``@@observe-docker@@``) as first argument,
    ``args[1]`` is used as the Docker CID and only the "wait" lock is taken.
    Otherwise a ``Wait`` message is read through ``recordio`` to obtain the
    Mesos container ID, and the "observe" lock is taken before the "wait"
    lock.  Lock requests have no timeout; after ``EINTR`` they are repeated
    with a one second timeout.  A ``Termination`` record is then written
    whose status is the exit value (64 when none is recorded) shifted left
    by 8 bits.

    Returns:
        ``state.exit()`` when it is not ``None``.

    Raises:
        IOError: a lock request failed for a reason other than ``EINTR``.
        Err: no exit value is recorded after the locks were obtained.
    """
    log.info(" ".join(args))
    # NB: The "@@observe-docker@@" variant is a work around for Mesos's
    #     option parser. There is a fix in the pipeline.
    observe = args[:1] in (("--observe-docker",), ("@@observe-docker@@",))
    if observe:
        # In Docker mode, we use Docker wait to wait for the container
        # and then exit with the returned exit code. The Docker CID is
        # passed on the command line.
        state = deimos.state.State(self.state_root, docker_id=args[1])
    else:
        message = recordio.read(Wait)
        mesos_id = message.container_id.value
        state = deimos.state.State(self.state_root, mesos_id=mesos_id)
    self.state = state
    deimos.sig.install(self.stop_docker_and_resume)
    state.await_launch()

    def take_locks(timeout):
        if not observe:
            state.lock("observe", LOCK_SH, seconds=timeout)
        state.lock("wait", LOCK_SH, seconds=timeout)

    try:
        take_locks(None)
    except IOError as e:
        # Allows for signal recovery
        if e.errno != errno.EINTR:
            raise e
        take_locks(1)
    if state.exit() is not None:
        code = state.exit()
    else:
        code = 64
    termination = code << 8
    recordio.write(Termination,
                   killed=False,
                   message="",
                   status=termination)
    if state.exit() is None:
        raise Err("Wait lock is not held nor is exit file present")
    return state.exit()
def destroy(self, destroy_pb, *args):
    """Unwire and stop the Docker container behind a Mesos container.

    The Mesos container ID is read from ``destroy_pb.container_id.value``.
    After the launch has been awaited and the "destroy" lock taken, nothing
    more is done when an exit status is already recorded.  Otherwise the
    container (``state.docker_id``) is unwired from MidoNet, with any
    exception logged and ignored, and ``docker stop`` is run.

    Returns:
        Always 0.
    """
    log.info(" ".join(args))
    container_id = destroy_pb.container_id.value
    state = deimos.state.State(self.state_root, mesos_id=container_id)
    state.await_launch()
    # The lock object stays bound to a local for the rest of the call.
    lk_d = state.lock("destroy", LOCK_EX)
    if state.exit() is not None:
        log.info("Container is stopped")
        return 0
    container_id = state.docker_id
    log.debug("Unwiring the container %s from MidoNet", container_id)
    try:
        log.info("state_root = %s", dir(state))
        midonet.unwire_container_from_midonet(container_id)
        log.debug(
            "Successfully unwired the container %s from MidoNet bridge",
            container_id)
    except Exception:
        # Unwiring is best effort; the container is stopped regardless.
        log.error(traceback.format_exc())
    Run()(deimos.docker.stop(state.cid()))
    return 0
def launch(self, container_id, *args):
    """Launch a task in Docker and supervise it until the container exits.

    A serialized ``TaskInfo`` is read from STDIN.  The sequence is: take the
    "launch" lock, fetch the task's URIs, assemble the ``docker run``
    command, start it with its output sent to the files ``stdout`` and
    ``stderr`` in the working directory, wait for the CID file, swap the
    "launch" lock for the "wait" lock, acknowledge the launch on STDOUT and
    close STDOUT, optionally start an observer process, then block on
    ``docker wait`` and record the result.

    Args:
        container_id: Mesos container ID keying the on-disk state.
        *args: Extra command-line arguments.  When the task needs the
            executor wrapper, ``args[0]`` must be ``--mesos-executor`` or
            ``--observer`` and ``args[1:]`` prefixes the observer command.

    Returns:
        ``state.exit()`` after the ``docker wait`` output has been stored.

    Raises:
        Err: the container URL has text before ``docker:///``, or the task
            needs the wrapper but the required option is missing.
    """
    log.info(" ".join([container_id] + list(args)))
    deimos.sig.install(self.sig_proxy)
    run_options = []
    state = deimos.state.State(self.state_root, mesos_id=container_id)
    state.push()
    lk_l = state.lock("launch", LOCK_EX)
    mesos_directory()
    task = protos.TaskInfo()
    task.ParseFromString(sys.stdin.read())
    for line in proto_lines(task):
        log.debug(line)
    state.executor_id = executor_id(task)
    state.push()
    state.ids()
    url, options = self.container_settings.override(*container(task))
    # Unpacking requires exactly one "docker:///" separator in the URL;
    # anything else raises ValueError here.
    pre, image = url.split("docker:///")
    if pre != "":
        raise Err("URL '%s' is not a valid docker:// URL!" % url)
    if image == "":
        image = self.default_image(task)
    log.info("image = %s", image)
    run_options += ["--sig-proxy"]
    run_options += ["--rm"]  # This is how we ensure container cleanup
    run_options += ["--cidfile", state.resolve("cid")]
    place_uris(task, self.shared_dir, self.optimistic_unpack)
    run_options += ["-w", self.workdir]
    # Docker requires an absolute path to a source filesystem, separated
    # from the bind path in the container with a colon, but the absolute
    # path to the Mesos sandbox might have colons in it (TaskIDs with
    # timestamps can cause this situation). So we create a soft link to it
    # and mount that.
    shared_full = os.path.abspath(self.shared_dir)
    sandbox_symlink = state.sandbox_symlink(shared_full)
    run_options += ["-v", "%s:%s" % (sandbox_symlink, self.workdir)]
    cpus, mems = cpu_and_mem(task)
    env = [(_.name, _.value) for _ in task.command.environment.variables]
    run_options += options
    # We need to wrap the call to Docker in a call to the Mesos executor
    # if no executor is passed as part of the task. We need to pass the
    # MESOS_* environment variables in to the container if we're going to
    # start an executor.
    observer_argv = None
    if needs_executor_wrapper(task):
        # ``options`` is reused here for the accepted flag names; the
        # container options were already appended to run_options above.
        options = ["--mesos-executor", "--observer"]
        if not (len(args) > 1 and args[0] in options):
            raise Err("Task %s needs --observer to be set!"
                      % state.eid())
        observer_argv = list(
            args[1:]) + [deimos.path.me(), "wait", "--docker"]
    else:
        env += mesos_env() + [("MESOS_DIRECTORY", self.workdir)]
    runner_argv = deimos.docker.run(run_options, image, argv(task),
                                    env=env, ports=ports(task),
                                    cpus=cpus, mems=mems)
    log_mesos_env(logging.DEBUG)
    observer = None
    with open("stdout", "w") as o:  # This awkward multi 'with' is a
        with open("stderr", "w") as e:  # concession to 2.6 compatibility
            with open(os.devnull) as devnull:
                log.info(deimos.cmd.present(runner_argv))
                self.runner = subprocess.Popen(runner_argv, stdin=devnull,
                                               stdout=o,
                                               stderr=e)
                state.pid(self.runner.pid)
                state.await_cid()
                state.push()
                # Take "wait" before releasing "launch".
                lk_w = state.lock("wait", LOCK_EX)
                lk_l.unlock()
                state.ids()
                proto_out(protos.ExternalStatus, message="launch: ok")
                sys.stdout.close()  # Mark STDOUT as closed for Python code
                os.close(
                    1)  # Use low-level call to close OS side of STDOUT
                if observer_argv is not None:
                    observer_argv += [state.cid()]
                    log.info(deimos.cmd.present(observer_argv))
                    call = deimos.cmd.in_sh(observer_argv)
                    # TODO: Collect these leaking file handles.
                    obs_out = open(state.resolve("observer.out"), "w+")
                    obs_err = open(state.resolve("observer.err"), "w+")
                    # If the Mesos executor sees LIBPROCESS_PORT=0 (which
                    # is passed by the slave) there are problems when it
                    # attempts to bind. ("Address already in use").
                    # Purging both LIBPROCESS_* net variables, to be safe.
                    for v in ["LIBPROCESS_PORT", "LIBPROCESS_IP"]:
                        if v in os.environ:
                            del os.environ[v]
                    observer = subprocess.Popen(call, stdin=devnull,
                                                stdout=obs_out,
                                                stderr=obs_err,
                                                close_fds=True)
    # Block until Docker reports the container finished, store the output
    # as the exit value, then release the "wait" lock.
    data = Run(data=True)(deimos.docker.wait(state.cid()))
    state.exit(data)
    lk_w.unlock()
    for p, arr in [(self.runner, runner_argv), (observer, observer_argv)]:
        if p is None or p.wait() == 0:
            continue
        # Only non-zero exits of the helper processes are reported.
        log.warning(deimos.cmd.present(arr, p.wait()))
    return state.exit()
def cli(argv=None):
    """Command-line entry point; returns the process exit status.

    Subcommands handled directly:

    * ``-h`` / ``--help`` / ``help``: print the help text.
    * ``config``: print the repr of every loaded configuration section.
    * ``locks``: run the lock browser over ``<state root>/mesos``.
    * ``state [--rm] [TIMESTAMP]``: list, or with ``--rm`` remove, the
      state directories selected by the timestamp (``%Y-%m-%dT%H:%M:%SZ``;
      defaults to the current time).

    Any other subcommand must be one of ``deimos.containerizer.methods()``
    and is dispatched to the Docker containerizer.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success; 1 for an unknown or missing subcommand, or a false
        boolean result; the containerizer's own integer result; 4 for
        ``Err`` and failed subprocesses; 8 for any other exception.  With
        ``state --rm``, whatever ``cleanup.remove`` returns.
    """
    deimos.sig.install(lambda _: None)
    if argv is None:
        argv = sys.argv
    sub = argv[1] if len(argv) > 1 else None
    if sub in ["-h", "--help", "help"]:
        sys.stdout.write("%s\n" % format_help())
        return 0
    conf = deimos.config.load_configuration()
    if sub == "config":
        log.info("Final configuration:")
        # A distinct loop name keeps ``conf`` bound to the configuration.
        for _, section in conf.items():
            sys.stdout.write("%r\n" % section)
        return 0
    if sub == "locks":
        deimos.flock.lock_browser(os.path.join(conf.state.root, "mesos"))
        return 0
    if sub == "state":
        cleanup = deimos.cleanup.Cleanup(conf.state.root)
        t, rm = time.time(), False
        for arg in argv[2:]:
            if arg == "--rm":
                rm = True
                continue
            # The last timestamp given wins.
            t = calendar.timegm(time.strptime(arg, "%Y-%m-%dT%H:%M:%SZ"))
        if rm:
            return cleanup.remove(t)
        for d in cleanup.dirs(t):
            sys.stdout.write(d + "\n")
        return 0
    if sub not in deimos.containerizer.methods():
        sys.stderr.write("%s\n" % format_help())
        sys.stderr.write(
            "** Please specify a subcommand **".center(79) + "\n")
        log.error("Bad ARGV: %r" % argv[1:])
        return 1
    deimos.docker.options = conf.docker.argv()
    containerizer = deimos.containerizer.docker.Docker(
        container_settings=conf.containers,
        index_settings=conf.index,
        optimistic_unpack=conf.uris.unpack,
        state_root=conf.state.root
    )
    deimos.usage.report()
    try:
        result = deimos.containerizer.stdio(containerizer, *argv[1:])
        deimos.usage.report()
        if result is not None:
            # bool is tested before int because bool is a subclass of int.
            if isinstance(result, bool):
                return 0 if result else 1
            if isinstance(result, int):
                return result
            if isinstance(result, str):
                sys.stdout.write(result)
            else:
                for item in result:
                    sys.stdout.write(str(item) + "\n")
    except Err as e:
        log.error("%s.%s: %s", type(e).__module__, type(e).__name__, str(e))
        return 4
    except subprocess.CalledProcessError as e:
        log.error(str(e))
        return 4
    except Exception:
        log.exception("Unhandled failure in %s", sub)
        return 8
    return 0
def launch(self, launch_pb, *args):
    """Launch a task in Docker and supervise it until the container exits.

    The sequence is: take the "launch" lock, fetch the task's URIs, assemble
    the ``docker run`` command, start it with its output sent to the files
    ``stdout`` and ``stderr`` in the working directory, wait for the CID
    file, and swap the "launch" lock for the "wait" lock.  Unless
    ``--no-fork`` is among ``args`` the process then forks: the parent
    returns immediately and the child carries on.  The continuing process
    optionally starts an observer, blocks on ``docker wait``, records the
    result and reaps the helper processes (SIGTERM after 10s, SIGKILL one
    second later).

    Args:
        launch_pb: Launch message, wrapped in ``deimos.mesos.Launch``.
        *args: Extra command-line arguments; only ``--no-fork`` is examined.

    Returns:
        ``None`` in the parent after forking; otherwise ``state.exit()``.
    """
    log.info(" ".join(args))
    fork = "--no-fork" not in args
    deimos.sig.install(self.log_signal)
    run_options = []
    launchy = deimos.mesos.Launch(launch_pb)
    state = deimos.state.State(self.state_root,
                               mesos_id=launchy.container_id)
    state.push()
    lk_l = state.lock("launch", LOCK_EX)
    state.executor_id = launchy.executor_id
    state.push()
    state.ids()
    mesos_directory()  # Redundant?
    if launchy.directory:
        os.chdir(launchy.directory)
    # TODO: if launchy.user:
    #           os.seteuid(launchy.user)
    url, options = launchy.container
    # Anything after "//" in the options replaces the task's own argv.
    options, trailing_argv = split_on(options, "//")
    url, options = self.container_settings.override(url, options)
    true_argv = launchy.argv if trailing_argv is None else trailing_argv
    image = self.determine_image(url, launchy)
    log.info("image = %s", image)
    run_options += ["--sig-proxy"]
    run_options += ["--rm"]  # This is how we ensure container cleanup
    run_options += ["--cidfile", state.resolve("cid")]
    place_uris(launchy, self.shared_dir, self.optimistic_unpack)
    run_options += ["-w", self.workdir]
    # Docker requires an absolute path to a source filesystem, separated
    # from the bind path in the container with a colon, but the absolute
    # path to the Mesos sandbox might have colons in it (TaskIDs with
    # timestamps can cause this situation). So we create a soft link to it
    # and mount that.
    shared_full = os.path.abspath(self.shared_dir)
    sandbox_symlink = state.sandbox_symlink(shared_full)
    run_options += ["-v", "%s:%s" % (sandbox_symlink, self.workdir)]
    cpus, mems = launchy.cpu_and_mem
    env = launchy.env
    run_options += options
    # We need to wrap the call to Docker in a call to the Mesos executor
    # if no executor is passed as part of the task. We need to pass the
    # MESOS_* environment variables in to the container if we're going to
    # start an executor.
    observer_argv = None
    if launchy.needs_observer:
        # NB: The "@@docker@@" variant is a work around for Mesos's option
        # parser. There is a fix in the pipeline.
        observer_argv = [mesos_executor(), "--override",
                         deimos.path.me(), "observe", state.mesos_id]
        state.lock("observe", LOCK_EX | LOCK_NB)  # Explanation of Locks
        # When the observer is running, we would like its call to
        # observe() to finish before all the wait(); and we'd like the
        # observer to have a chance to report TASK_FINISHED before the
        # calls to wait() report their results (which would result in a
        # TASK_FAILED).
        #
        # For this reason, we take the "observe" lock in launch(), before
        # we call the observer and before releasing the "launch" or "wait"
        # locks.
        #
        # Calls to observe() actually skip locking "observe"; but wait()
        # calls must take this lock. The "observe" lock is held by
        # launch() until the observer executor completes, at which point
        # we can be reasonably sure its status was propagated to the Mesos
        # slave.
    else:
        env += mesos_env() + [("MESOS_DIRECTORY", self.workdir)]
    self.place_dockercfg()
    runner_argv = deimos.docker.run(run_options, image, true_argv,
                                    env=env, ports=launchy.ports,
                                    cpus=cpus, mems=mems)
    log_mesos_env(logging.DEBUG)
    observer = None
    with open("stdout", "w") as o:  # This awkward multi 'with' is a
        with open("stderr", "w") as e:  # concession to 2.6 compatibility
            with open(os.devnull) as devnull:
                log.info(deimos.cmd.present(runner_argv))
                self.runner = subprocess.Popen(runner_argv, stdin=devnull,
                                               stdout=o,
                                               stderr=e)
                state.pid(self.runner.pid)
                state.await_cid()
                state.push()
                # Take "wait" before releasing "launch".
                lk_w = state.lock("wait", LOCK_EX)
                lk_l.unlock()
                if fork:
                    pid = os.fork()
                    # Compare by value: ``is not 0`` tested object identity
                    # against an int literal.
                    if pid != 0:
                        state.ids()
                        log.info("Forking watcher into child...")
                        return
                state.ids()
                if observer_argv is not None:
                    log.info(deimos.cmd.present(observer_argv))
                    call = deimos.cmd.in_sh(observer_argv, allstderr=False)
                    # TODO: Collect these leaking file handles.
                    obs_out = open(state.resolve("observer.out"), "w+")
                    obs_err = open(state.resolve("observer.err"), "w+")
                    # If the Mesos executor sees LIBPROCESS_PORT=0 (which
                    # is passed by the slave) there are problems when it
                    # attempts to bind. ("Address already in use").
                    # Purging both LIBPROCESS_* net variables, to be safe.
                    for v in ["LIBPROCESS_PORT", "LIBPROCESS_IP"]:
                        if v in os.environ:
                            del os.environ[v]
                    observer = subprocess.Popen(call, stdin=devnull,
                                                stdout=obs_out,
                                                stderr=obs_err,
                                                close_fds=True)
    # Block until Docker reports the container finished, store the output
    # as the exit value, then release the "wait" lock.
    data = Run(data=True)(deimos.docker.wait(state.cid()))
    state.exit(data)
    lk_w.unlock()
    for p, arr in [(self.runner, runner_argv), (observer, observer_argv)]:
        if p is None:
            continue
        # Wait in a helper thread so the wait can be bounded.
        thread = threading.Thread(target=p.wait)
        thread.start()
        thread.join(10)
        if thread.is_alive():
            log.warning(deimos.cmd.present(arr, "SIGTERM after 10s"))
            p.terminate()
        thread.join(1)
        if thread.is_alive():
            log.warning(deimos.cmd.present(arr, "SIGKILL after 1s"))
            p.kill()
        msg = deimos.cmd.present(arr, p.wait())
        if p.wait() == 0:
            log.info(msg)
        else:
            log.warning(msg)
    return state.exit()
def update(self, update_pb, *args):
    """Log the request and do nothing else; ``update_pb`` is not used."""
    joined_args = " ".join(args)
    log.info(joined_args)
    log.info("Update is a no-op for Docker...")
def place_dockercfg(self):
    """Copy the configured dockercfg file to ``.dockercfg`` in the cwd.

    Does nothing when ``self.index_settings.dockercfg`` is ``None``.
    """
    source = self.index_settings.dockercfg
    if source is None:
        return
    log.info("Copying to .dockercfg: %s" % source)
    copy_command = ["cp", source, ".dockercfg"]
    Run()(copy_command)
def launch(self, launch_pb, *args):
    """Launch a task in Docker, wire it to MidoNet, and supervise it.

    The sequence is: take the "launch" lock, fetch the task's URIs, assemble
    the ``docker run`` command (with ``--net=none`` when the task
    environment sets ``MIDONET_BRIDGE_ID``), start it with its output sent
    to the files ``stdout`` and ``stderr`` in the working directory, wait
    for the CID file, attempt to wire the container to MidoNet, and swap the
    "launch" lock for the "wait" lock.  Unless ``--no-fork`` is among
    ``args`` the process then forks: the parent returns immediately and the
    child carries on.  The continuing process optionally starts an observer,
    blocks on ``docker wait``, records the result and reaps the helper
    processes (SIGTERM after 10s, SIGKILL one second later).

    Args:
        launch_pb: Launch message, wrapped in ``deimos.mesos.Launch``.
        *args: Extra command-line arguments; only ``--no-fork`` is examined.

    Returns:
        ``None`` in the parent after forking; otherwise ``state.exit()``.
    """
    log.info(" ".join(args))
    fork = "--no-fork" not in args
    deimos.sig.install(self.log_signal)
    run_options = []
    launchy = deimos.mesos.Launch(launch_pb)
    state = deimos.state.State(self.state_root,
                               mesos_id=launchy.container_id)
    state.push()
    lk_l = state.lock("launch", LOCK_EX)
    state.executor_id = launchy.executor_id
    state.push()
    state.ids()
    mesos_directory()  # Redundant?
    if launchy.directory:
        os.chdir(launchy.directory)
    # TODO: if launchy.user:
    #           os.seteuid(launchy.user)
    url, options = launchy.container
    # Anything after "//" in the options replaces the task's own argv.
    options, trailing_argv = split_on(options, "//")
    url, options = self.container_settings.override(url, options)
    true_argv = launchy.argv if trailing_argv is None else trailing_argv
    image = self.determine_image(url, launchy)
    log.info("image = %s", image)
    run_options += ["--sig-proxy"]
    run_options += ["--rm"]  # This is how we ensure container cleanup
    run_options += ["--cidfile", state.resolve("cid")]
    place_uris(launchy, self.shared_dir, self.optimistic_unpack)
    run_options += ["-w", self.workdir]
    # Docker requires an absolute path to a source filesystem, separated
    # from the bind path in the container with a colon, but the absolute
    # path to the Mesos sandbox might have colons in it (TaskIDs with
    # timestamps can cause this situation). So we create a soft link to it
    # and mount that.
    shared_full = os.path.abspath(self.shared_dir)
    sandbox_symlink = state.sandbox_symlink(shared_full)
    run_options += ["-v", "%s:%s" % (sandbox_symlink, self.workdir)]
    cpus, mems = launchy.cpu_and_mem
    env = launchy.env
    run_options += options
    # Snapshot of the task environment, taken before any MESOS_* variables
    # are appended below.
    env_dict = dict(env)
    if env_dict.get("MIDONET_BRIDGE_ID", None):
        run_options += ["--net=none"]
    # We need to wrap the call to Docker in a call to the Mesos executor
    # if no executor is passed as part of the task. We need to pass the
    # MESOS_* environment variables in to the container if we're going to
    # start an executor.
    observer_argv = None
    if launchy.needs_observer:
        # NB: The "@@docker@@" variant is a work around for Mesos's option
        # parser. There is a fix in the pipeline.
        observer_argv = [
            mesos_executor(), "--override", deimos.path.me(), "observe",
            state.mesos_id
        ]
        state.lock("observe", LOCK_EX | LOCK_NB)  # Explanation of Locks
        # When the observer is running, we would like its call to
        # observe() to finish before all the wait(); and we'd like the
        # observer to have a chance to report TASK_FINISHED before the
        # calls to wait() report their results (which would result in a
        # TASK_FAILED).
        #
        # For this reason, we take the "observe" lock in launch(), before
        # we call the observer and before releasing the "launch" or "wait"
        # locks.
        #
        # Calls to observe() actually skip locking "observe"; but wait()
        # calls must take this lock. The "observe" lock is held by
        # launch() until the observer executor completes, at which point
        # we can be reasonably sure its status was propagated to the Mesos
        # slave.
    else:
        env += mesos_env() + [("MESOS_DIRECTORY", self.workdir)]
    self.place_dockercfg()
    runner_argv = deimos.docker.run(run_options, image, true_argv,
                                    env=env, ports=launchy.ports,
                                    cpus=cpus, mems=mems)
    log_mesos_env(logging.DEBUG)
    observer = None
    with open("stdout", "w") as o:  # This awkward multi 'with' is a
        with open("stderr", "w") as e:  # concession to 2.6 compatibility
            with open(os.devnull) as devnull:
                log.info(deimos.cmd.present(runner_argv))
                self.runner = subprocess.Popen(runner_argv, stdin=devnull,
                                               stdout=o,
                                               stderr=e)
                state.pid(self.runner.pid)
                state.await_cid()
                log.debug("Wiring the container to MidoNet")
                try:
                    # NOTE(review): the fallback bridge ID and IP address
                    # are hard-coded; confirm they are meant to ship.
                    bridge_id = env_dict.get(
                        "MIDONET_BRIDGE_ID",
                        "78488c47-d1de-4d16-a27a-4e6419dc4f88")
                    container_id = state.docker_id
                    ip_addr = env_dict.get("MIDONET_IP_ADDRESS",
                                           "192.168.100.42")
                    default_gw = env_dict.get("MIDONET_DEFAULT_GATEWAY",
                                              None)
                    midonet.wire_container_to_midonet(
                        container_id, bridge_id, ip_addr, default_gw)
                    log.debug("Successfully wired the container %s to "
                              "MidoNet bridge %s", container_id, bridge_id)
                except Exception:
                    # Wiring is best effort; the launch carries on.
                    log.error(traceback.format_exc())
                state.push()
                # Take "wait" before releasing "launch".
                lk_w = state.lock("wait", LOCK_EX)
                lk_l.unlock()
                if fork:
                    pid = os.fork()
                    # Compare by value: ``is not 0`` tested object identity
                    # against an int literal.
                    if pid != 0:
                        state.ids()
                        log.info("Forking watcher into child...")
                        return
                state.ids()
                if observer_argv is not None:
                    log.info(deimos.cmd.present(observer_argv))
                    call = deimos.cmd.in_sh(observer_argv, allstderr=False)
                    # TODO: Collect these leaking file handles.
                    obs_out = open(state.resolve("observer.out"), "w+")
                    obs_err = open(state.resolve("observer.err"), "w+")
                    # If the Mesos executor sees LIBPROCESS_PORT=0 (which
                    # is passed by the slave) there are problems when it
                    # attempts to bind. ("Address already in use").
                    # Purging both LIBPROCESS_* net variables, to be safe.
                    for v in ["LIBPROCESS_PORT", "LIBPROCESS_IP"]:
                        if v in os.environ:
                            del os.environ[v]
                    observer = subprocess.Popen(call, stdin=devnull,
                                                stdout=obs_out,
                                                stderr=obs_err,
                                                close_fds=True)
    # Block until Docker reports the container finished, store the output
    # as the exit value, then release the "wait" lock.
    data = Run(data=True)(deimos.docker.wait(state.cid()))
    state.exit(data)
    lk_w.unlock()
    for p, arr in [(self.runner, runner_argv), (observer, observer_argv)]:
        if p is None:
            continue
        # Wait in a helper thread so the wait can be bounded.
        thread = threading.Thread(target=p.wait)
        thread.start()
        thread.join(10)
        if thread.is_alive():
            log.warning(deimos.cmd.present(arr, "SIGTERM after 10s"))
            p.terminate()
        thread.join(1)
        if thread.is_alive():
            log.warning(deimos.cmd.present(arr, "SIGKILL after 1s"))
            p.kill()
        msg = deimos.cmd.present(arr, p.wait())
        if p.wait() == 0:
            log.info(msg)
        else:
            log.warning(msg)
    return state.exit()
def launch(self, container_id, *args):
    """Launch the task's Docker container and wait for it to exit.

    Reads a serialized ``TaskInfo`` from standard input, assembles the
    ``docker run`` command line, starts the container with its output
    redirected to the files ``stdout`` and ``stderr`` in the current
    working directory, reports ``launch: ok`` on standard output, optionally
    starts an observer executor, and then blocks on ``docker wait``.

    Args:
        container_id: Identifier used as the ``mesos_id`` of the state
            record for this launch.
        *args: Extra command-line arguments. When the task needs an
            executor wrapper, ``args[0]`` must be ``--mesos-executor`` or
            ``--executor`` and ``args[1]`` is used as the executor command.

    Returns:
        The value of ``state.exit()`` after the container has finished
        (presumably the recorded exit status -- confirm against State).

    Raises:
        Err: If the container URL does not start with ``docker:///`` or if
            the task needs an executor and none was passed in ``args``.
    """
    log.info(" ".join([container_id] + list(args)))
    deimos.sig.install(self.sig_proxy)
    run_options = []
    state = deimos.state.State(self.state_root, mesos_id=container_id)
    state.push()
    # The "launch" lock is held until the "wait" lock has been taken below.
    lk_l = state.lock("launch", LOCK_EX)
    mesos_directory()
    task = protos.TaskInfo()
    task.ParseFromString(sys.stdin.read())
    state.executor_id = executor_id(task)
    state.push()
    state.ids()
    url, options = self.container_settings.override(*container(task))
    # partition() always yields three parts, so a URL without the prefix is
    # reported through Err instead of failing with an unpacking ValueError.
    pre, scheme, image = url.partition("docker:///")
    if pre != "" or scheme == "":
        raise Err("URL '%s' is not a valid docker:// URL!" % url)
    if image == "":
        image = self.default_image(task)
    log.info("image = %s", image)
    run_options += ["--sig-proxy"]
    run_options += ["--rm"]     # This is how we ensure container cleanup
    run_options += ["--cidfile", state.resolve("cid")]

    place_uris(task, self.shared_dir, self.optimistic_unpack)
    run_options += ["-w", self.workdir]

    # Docker requires an absolute path to a source filesystem, separated
    # from the bind path in the container with a colon, but the absolute
    # path to the Mesos sandbox might have colons in it (TaskIDs with
    # timestamps can cause this situation). So we create a soft link to it
    # and mount that.
    shared_full = os.path.abspath(self.shared_dir)
    sandbox_symlink = state.sandbox_symlink(shared_full)
    run_options += ["-v", "%s:%s" % (sandbox_symlink, self.workdir)]

    cpus, mems = cpu_and_mem(task)
    env = [(var.name, var.value)
           for var in task.command.environment.variables]
    run_options += options

    # We need to wrap the call to Docker in a call to the Mesos executor
    # if no executor is passed as part of the task. We need to pass the
    # MESOS_* environment variables in to the container if we're going to
    # start an executor.
    observer_argv = None
    if needs_executor_wrapper(task):
        executor_flags = ["--mesos-executor", "--executor"]
        if not (len(args) > 1 and args[0] in executor_flags):
            raise Err("Task %s needs --executor to be set!" % state.tid())
        observer_argv = [args[1], deimos.path.me(), "wait", "--docker"]
    else:
        env += mesos_env() + [("MESOS_DIRECTORY", self.workdir)]

    runner_argv = deimos.docker.run(run_options, image, argv(task),
                                    env=env, ports=ports(task),
                                    cpus=cpus, mems=mems)

    log_mesos_env(logging.DEBUG)

    observer = None
    with open("stdout", "w") as o:        # This awkward multi 'with' is a
        with open("stderr", "w") as e:    # concession to 2.6 compatibility
            with open(os.devnull) as devnull:
                log.info(deimos.cmd.present(runner_argv))
                self.runner = subprocess.Popen(runner_argv,
                                               stdin=devnull,
                                               stdout=o,
                                               stderr=e)
                state.pid(self.runner.pid)
                state.await_cid()
                state.push()
                # Take "wait" before giving up "launch" so the two locks
                # are never both free while the container is running.
                lk_w = state.lock("wait", LOCK_EX)
                lk_l.unlock()
                state.ids()
                proto_out(protos.ExternalStatus, message="launch: ok")
                sys.stdout.close()  # Mark STDOUT as closed for Python code
                os.close(1)  # Use low-level call to close OS side of STDOUT
                if observer_argv is not None:
                    observer_argv += [state.cid()]
                    log.info(deimos.cmd.present(observer_argv))
                    call = deimos.cmd.in_sh(observer_argv)
                    # If the Mesos executor sees LIBPROCESS_PORT=0 (which
                    # is passed by the slave) there are problems when it
                    # attempts to bind. ("Address already in use").
                    # Purging both LIBPROCESS_* net variables, to be safe.
                    for v in ["LIBPROCESS_PORT", "LIBPROCESS_IP"]:
                        if v in os.environ:
                            del os.environ[v]
                    # The child receives its own copies of these
                    # descriptors, so the parent's handles are closed as
                    # soon as the observer has been started.
                    out_path = state.resolve("observer.out")
                    err_path = state.resolve("observer.err")
                    with open(out_path, "w+") as obs_out:
                        with open(err_path, "w+") as obs_err:
                            observer = subprocess.Popen(call,
                                                        stdin=devnull,
                                                        stdout=obs_out,
                                                        stderr=obs_err,
                                                        close_fds=True)
    data = Run(data=True)(deimos.docker.wait(state.cid()))
    state.exit(data)
    lk_w.unlock()
    # Reap both children and report any that exited with a non-zero status.
    for p, arr in [(self.runner, runner_argv), (observer, observer_argv)]:
        if p is None or p.wait() == 0:
            continue
        log.warning(deimos.cmd.present(arr, p.wait()))
    return state.exit()