def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Terminate one replica of the type named by the shutdown policy.
  if shutdown_policy == "worker":
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "worker", 1)
  else:
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                     "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
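
# A minimal sketch of how run_tfjob_with_shutdown_policy might be driven by
# concrete test cases on the same test class. The ksonnet component names
# below are illustrative assumptions, not the repo's actual components.
def test_shutdown_worker(self):
  self.run_tfjob_with_shutdown_policy(
      "shutdown_policy_worker_" + self.tfjob_version, "worker")

def test_shutdown_chief(self):
  self.run_tfjob_with_shutdown_policy(
      "shutdown_policy_chief_" + self.tfjob_version, "chief")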
def test_invalid_tfjob_spec(self):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()
  component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # The invalid spec should drive the job into the Failed condition.
  logging.info("Wait for condition Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  # Verify the last condition is Failed and that its message mentions the
  # invalid spec.
  last_condition = results.get("status", {}).get("conditions", [{}])[-1]
  if last_condition.get("type", "").lower() != "failed":
    self.failure = "Job {0} in namespace {1} did not fail; status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  pattern = ".*the spec is invalid.*"
  condition_message = last_condition.get("message", "")
  if not re.match(pattern, condition_message):
    self.failure = "Condition message {0} did not match pattern {1}".format(
        condition_message, pattern)
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(
      api_client, self.namespace, self.name, version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
def test_invalid_tfjob_spec(self):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()
  component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Creating the TF job should be rejected by validation, so ks apply is
  # expected to fail.
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  try:
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  except subprocess.CalledProcessError as e:
    if "invalid: spec.tfReplicaSpecs: Required value" in e.output:
      logging.info("Creating the job failed, which is expected. Reason: %s",
                   e.output)
    else:
      self.failure = "Job {0} in namespace {1} failed because {2}".format(
          self.name, self.namespace, e.output)
      logging.error(self.failure)
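
# For reference, a self-contained sketch of the "expect the CLI call to be
# rejected" pattern used above, written against plain subprocess instead of
# util.run. The command and expected message are placeholders, not the
# repo's values.
import subprocess

def expect_command_rejected(cmd, expected_fragment):
  """Run cmd; return True iff it fails with expected_fragment in its output."""
  try:
    subprocess.check_output(cmd, stderr=subprocess.STDOUT)
  except subprocess.CalledProcessError as e:
    return expected_fragment in e.output.decode("utf-8", errors="replace")
  return False  # The command unexpectedly succeeded.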
def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  if clean_pod_policy == "All":
    # All pods, running or completed, should be deleted.
    pod_labels = tf_job_client.get_labels(self.name)
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  elif clean_pod_policy == "Running":
    # Only running pods (the PS replicas) are deleted; completed pods
    # (Chief and Worker) are kept.
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    pod_labels = tf_job_client.get_labels(self.name, "PS")
    pod_selector = tf_job_client.to_selector(pod_labels)
    k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                         pod_selector)
  elif clean_pod_policy == "None":
    # No pods are deleted; the PS replicas keep running.
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Chief", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "Worker", ["Succeeded"])
    tf_job_client.wait_for_replica_type_in_phases(
        api_client, self.namespace, self.name, "PS", ["Running"])

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
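
# A minimal sketch, assuming the usual Kubernetes label-selector convention,
# of what tf_job_client.to_selector is expected to produce from a label dict.
# This helper is a hypothetical stand-in, not the library's implementation.
def to_selector_sketch(labels):
  """Render {"a": "1", "b": "2"} as the selector string "a=1,b=2"."""
  return ",".join("{0}={1}".format(k, v) for k, v in sorted(labels.items()))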
def run_simple_tfjob(self, component):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)
    return

  # Check for creation failures.
  creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, results)
  if creation_failures:
    # TODO(jlewi): Starting with
    # https://github.com/kubeflow/tf-operator/pull/646 the number of events
    # no longer seems to match the expected count; it looks like events may
    # be getting combined. For now we log a warning rather than an error.
    logging.warning(creation_failures)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
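
# A minimal sketch of the success check the harness relies on, assuming
# tf_job_client.job_succeeded inspects the last status condition; this is an
# illustrative reconstruction, not the library's code.
def job_succeeded_sketch(results):
  """Return True iff the TFJob's last condition is Succeeded/True."""
  conditions = results.get("status", {}).get("conditions", [])
  if not conditions:
    return False
  last = conditions[-1]
  return (last.get("type", "") == "Succeeded" and
          last.get("status", "") == "True")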
def run_tfjob_with_replica_restart_policy(self, component,
                                          replica_restart_policy, exit_code):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Decide whether the terminated PS replica should restart: Always restarts
  # regardless of exit code, OnFailure restarts only on a non-zero exit code,
  # Never never restarts, and ExitCode treats exit code 1 as a permanent
  # (non-retryable) failure.
  if replica_restart_policy == "Always":
    expected_restart = True
  elif replica_restart_policy == "OnFailure":
    expected_restart = exit_code != 0
  elif replica_restart_policy == "Never":
    expected_restart = False
  elif replica_restart_policy == "ExitCode" and exit_code == 1:
    expected_restart = False
  else:
    expected_restart = True

  res = tf_job_client.terminate_and_verify_start_time(
      api_client, self.namespace, self.name, "ps", 0, exit_code,
      expected_restart)

  if res is False:
    self.failure = ("Job {0} in namespace {1} with restart policy {2} "
                    "failed test with exit_code {3}").format(
                        self.name, self.namespace, replica_restart_policy,
                        exit_code)
    logging.error(self.failure)
    return

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
def test_tfjob_and_verify_runconfig(self):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host
  component = COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)

  # Create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  # Wait for the job to either be in Running state or a terminal state
  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Read the replica counts from the job spec and verify the run config
  # observed by each replica type.
  num_ps = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
      "PS", {}).get("replicas", 0)
  num_workers = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
      "Worker", {}).get("replicas", 0)
  num_evaluators = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
      "Evaluator", {}).get("replicas", 0)
  verify_runconfig(masterHost, self.namespace, self.name, "chief", num_ps,
                   num_workers, num_evaluators)
  verify_runconfig(masterHost, self.namespace, self.name, "worker", num_ps,
                   num_workers, num_evaluators)
  verify_runconfig(masterHost, self.namespace, self.name, "ps", num_ps,
                   num_workers, num_evaluators)
  verify_runconfig(masterHost, self.namespace, self.name, "evaluator", num_ps,
                   num_workers, num_evaluators)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
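
# For context, a sketch of the TF_CONFIG cluster spec that verify_runconfig
# presumably compares against, assuming the standard TensorFlow distributed
# replica names (chief/worker/ps/evaluator) and <job>-<replica>-<index>
# service names. The port and this helper are assumptions for illustration,
# not the repo's actual verification code.
def expected_cluster_spec_sketch(name, num_ps, num_workers, num_evaluators,
                                 port=2222):
  def hosts(replica, count):
    return ["{0}-{1}-{2}:{3}".format(name, replica, i, port)
            for i in range(count)]

  return {
      "chief": hosts("chief", 1),
      "worker": hosts("worker", num_workers),
      "ps": hosts("ps", num_ps),
      "evaluator": hosts("evaluator", num_evaluators),
  }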
def test_pod_names(self):
  tf_operator_util.load_kube_config()
  api_client = k8s_client.ApiClient()
  component = COMPONENT_NAME + "_" + self.tfjob_version

  # Setup the ksonnet app and create the TF job
  ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
  tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                component, self.params)
  util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
  logging.info("Created job %s in namespace %s", self.name, self.namespace)

  logging.info("Wait for conditions Running, Succeeded, or Failed")
  results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

  # Build the set of pod names we expect from the replica specs.
  job_specs = extract_job_specs(
      results.get("spec", {}).get("tfReplicaSpecs", {}))
  expected_pod_names = []
  for replica_type, replica_num in job_specs.items():
    logging.info("job_type = %s, replica = %s", replica_type, replica_num)
    for i in range(replica_num):
      expected_pod_names.append(
          POD_NAME_FORMAT.format(name=self.name, replica=replica_type,
                                 index=i))
  expected_pod_names = set(expected_pod_names)
  actual_pod_names = tf_job_client.get_pod_names(api_client, self.namespace,
                                                 self.name)

  # We cannot guarantee that pods selected by the default namespace and job
  # name belong only to this test run, so we only do a partial check:
  # every expected pod name must appear among the selected pod names.
  if not (expected_pod_names & actual_pod_names) == expected_pod_names:
    msg = "Actual pod names don't match. Expected: {0} Actual: {1}".format(
        str(expected_pod_names), str(actual_pod_names))
    logging.error(msg)
    raise RuntimeError(msg)

  tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                   "chief", 1)

  # Wait for the job to complete.
  logging.info("Waiting for job to finish.")
  results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
  logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

  if not tf_job_client.job_succeeded(results):
    self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
    logging.error(self.failure)

  # Delete the TFJob.
  tf_job_client.delete_tf_job(api_client, self.namespace, self.name,
                              version=self.tfjob_version)
  logging.info("Waiting for job %s in namespace %s to be deleted.",
               self.name, self.namespace)
  tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
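
# A sketch of the two helpers test_pod_names leans on, under the assumption
# that pods follow the conventional <job>-<replica>-<index> naming. Both
# definitions are illustrative reconstructions, not the repo's constants.
POD_NAME_FORMAT_SKETCH = "{name}-{replica}-{index}"

def extract_job_specs_sketch(replica_specs):
  """Map each replica type (lowercased) to its replica count."""
  return {
      replica_type.lower(): spec.get("replicas", 0)
      for replica_type, spec in replica_specs.items()
  }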