def _init_containers_spec(self):
    """
    Build the JSON representation of this pod's init containers.

    Looks up the "init" entry of self.cmap. Every container found there is
    asked for its spec via generate_spec(), the spec is converted to plain
    serializable data with swagger_client.ApiClient, and the resulting list
    is dumped to a JSON string.

    :return: JSON string of a list; "[]" when self.cmap has no "init" entry
             or when that entry is empty/None.
    """
    def to_serializable(container):
        # Generate the spec first, then sanitize it with a fresh ApiClient.
        spec = container.generate_spec()
        return swagger_client.ApiClient().sanitize_for_serialization(spec)

    configured = self.cmap["init"] if "init" in self.cmap else None
    serialized = [to_serializable(container) for container in configured or []]
    return json.dumps(serialized)
def stop(self, jobname=None):
    """
    Kill the user command in the main container and wait for the pod to complete.

    NOTE: This function assumes that a pod is already running.
    This process kills the user command so that artifacts collection can occur.
    Once this is done, the pod will be completed. This call will return when
    the pod is completed.
    Note: pod is not deleted (just completed)

    :param jobname: optional name used for the watched kube object and in log
                    messages; when falsy, self.name is watched instead.
    :return: True when both the main container and the wait container report
             "terminated". False when the wait-loop limit or the status-read
             retry limit is exceeded, or when a container state cannot be
             determined (and the pod is not in the "Pending" phase).
    """
    max_wait_loops = 180      # upper bound on wait-for-event iterations
    max_status_reads = 180    # upper bound on status read attempts per iteration
    status_retry_delay = 10   # seconds to sleep after a failed status read

    def get_container_status(s, container_name, name):
        # Return the containerStatuses entry whose "name" matches
        # container_name, or None when s is not a dict / no entry matches.
        if isinstance(s, dict):
            try:
                c_status = s.get("containerStatuses", None)
                for c in c_status or []:
                    n = c.get("name", None)
                    if n == container_name:
                        return c
            except Exception:
                logger.exception(
                    "cannot get_container_status for [%s] [%s]", name, container_name)
        return None

    def get_container_state(s, container_name, name):
        # Return "waiting", "running" or "terminated" for the container, or
        # None (after logging an error) when the state cannot be determined.
        container_status = get_container_status(s, container_name, name=name)
        container_states = ["waiting", "running", "terminated"]
        if not isinstance(container_status, dict):
            # no status
            logger.error("no status for [%s] [%s]: %s", name, container_name, s)
            return None
        if "state" not in container_status:
            # No state
            logger.error("no state for [%s] [%s]: %s", name, container_name, s)
            return None
        for state_string in container_states:
            if state_string in container_status["state"]:
                logger.debug("state=%s for [%s] [%s]", state_string, name, container_name)
                return state_string
        logger.error("unknown state for [%s] [%s]: %s", name, container_name, s)
        return None

    def get_pod_phase(s):
        # Return the "phase" entry of the serialized pod status, if any.
        if isinstance(s, dict):
            return s.get("phase", None)
        return None

    def validator_func(pod_status):
        # always return true for any event
        return True

    def send_kill_signal_to_main_container():
        ax_command_path = "/ax-execu-host/art"
        busybox_command_path = os.path.join(ax_command_path, "busybox-i686")
        bash_path = os.path.join(ax_command_path, "ax_bash_ax")
        touch_command = "{} {}".format(busybox_command_path, "touch")
        kill_command = os.path.join(ax_command_path, "ax_kill_ax")
        cat_command = os.path.join(ax_command_path, "ax_cat_ax")

        # execute command to initiate user command kill
        # This command may or may not execute properly if the container is already dying or dead
        # but it does not matter to us here since we will have a waiter. This command will ensure
        # that if a container is running, it will start the process of terminating
        # TODO: we may have pods that are started programmatically that do not have artifacts later
        # HACK HACK
        cmd = [
            bash_path,
            "-c",
            "{touch} {scratch_path}/.ax_delete ; {kill} -9 `{cat} {scratch_path}/.ax_pid` "
            .format(touch=touch_command,
                    scratch_path=ARTIFACTS_CONTAINER_SCRATCH_PATH,
                    kill=kill_command,
                    cat=cat_command)
        ]
        logger.debug(
            "Try gracefully stop main container in [%s][%s]. cmd=%s",
            jobname, self.name, cmd)
        output = self.exec_commands(cmd)
        logger.debug("Kill output:\n%s", output)

    main_name = self.get_main_container_name()
    wait_name = SIDEKICK_WAIT_CONTAINER_NAME
    logger.debug("About to stop pod [%s][%s]", jobname, self.name)

    count = 0
    while True:
        count += 1
        if count > max_wait_loops:
            logger.warning("Pod [%s][%s] too many lopps, abort. count=%s",
                           jobname, self.name, count)
            return False

        # Register the waiter BEFORE reading the status so that an event
        # arriving between the read and waiter.wait() is not missed.
        # NOTE(review): the waiter watches `jobname or self.name`, while the
        # status read below always uses self.name -- confirm this is intended.
        obj = {
            "kind": "pods",
            "name": jobname if jobname else self.name,
            "validator": validator_func
        }
        waiter = KubeObjWaiter()
        monitor = AXKubeMonitor()
        monitor.wait_for_kube_object(obj, timeout=DELETE_WAITER_WAIT_TIMEOUT, waiter=waiter)

        # read status here
        read_count = 0
        while True:
            read_count += 1
            if read_count > max_status_reads:
                logger.warning(
                    "Pod [%s][%s] too many retry, abort. retry=%s count=%s",
                    jobname, self.name, read_count, count)
                return False
            try:
                status = self.client.api.read_namespaced_pod_status(
                    self.namespace, self.name).status
                # Explicit check instead of `assert`: asserts are stripped
                # under `python -O`, which would silently skip the retry.
                if not isinstance(status, swagger_client.V1PodStatus):
                    raise TypeError(
                        "unexpected pod status type {}".format(type(status)))
                status_dict = swagger_client.ApiClient(
                ).sanitize_for_serialization(status)
                break
            except Exception:
                # xxx todo: what if self.name is not there?
                logger.exception(
                    "exception in get status for Pod [%s][%s] retry=%s count=%s",
                    jobname, self.name, read_count, count)
                time.sleep(status_retry_delay)

        main_container_state = get_container_state(status_dict, main_name, self.name)
        wait_container_state = get_container_state(status_dict, wait_name, self.name)
        pod_phase = get_pod_phase(status_dict)
        logger.debug("Pod [%s][%s] phase=%s. main=%s, wait=%s count=%s",
                     jobname, self.name, pod_phase,
                     main_container_state, wait_container_state, count)

        if main_container_state == "waiting":
            # Main container not started yet: nothing to kill, keep waiting.
            logger.debug("Pod [%s][%s] main in %s count=%s",
                         jobname, self.name, main_container_state, count)
        elif main_container_state == "running":
            logger.debug("Pod [%s][%s] main in %s count=%s",
                         jobname, self.name, main_container_state, count)
            send_kill_signal_to_main_container()
        elif main_container_state is None:
            if pod_phase == "Pending":
                logger.debug("Pod [%s][%s] in %s phase count=%s",
                             jobname, self.name, pod_phase, count)
            else:
                logger.warning(
                    "Pod [%s][%s] unknown main container state, abort. %s count=%s",
                    jobname, self.name, status_dict, count)
                return False
        else:
            # Internal invariant: get_container_state only returns one of the
            # three known states or None.
            assert main_container_state == "terminated", "bad state {}".format(
                main_container_state)
            if wait_container_state in ["waiting", "running"]:
                logger.debug("Pod [%s][%s] wait in %s count=%s",
                             jobname, self.name, wait_container_state, count)
            elif wait_container_state == "terminated":
                logger.debug(
                    "Pod [%s][%s] all containers are terminated. stop() done. count=%s",
                    jobname, self.name, count)
                return True
            else:
                logger.warning(
                    "Pod [%s][%s] unknown wait container state, abort. %s. count=%s",
                    jobname, self.name, status_dict, count)
                return False

        logger.debug("Pod [%s][%s] wait for new event. count=%s",
                     jobname, self.name, count)
        waiter.wait()
        if waiter.result != KubeObjStatusCode.OK:
            logger.info("Pod [%s][%s] waiter return %s, events: %s",
                        jobname, self.name, waiter.result, waiter.details)
        else:
            logger.debug("Pod [%s][%s] waiter return ok count=%s",
                         jobname, self.name, count)