Example #1
  def run_simple_tfjob(self, component):
    api_client = k8s_client.ApiClient()

    # Setup the ksonnet app
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)

    # Create the TF job
    ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Wait for the job to either be in Running state or a terminal state
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    if not tf_job_client.job_succeeded(results):
      self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
      logging.error(self.failure)
      return

    # Check for creation failures.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, results)
    if creation_failures:
      # TODO(jlewi): Starting with
      # https://github.com/kubeflow/tf-operator/pull/646 the number of events
      # no longer seems to match the expected count; it looks like events may
      # be getting combined. For now we just log a warning rather than an
      # error.
      logging.warning(creation_failures)

    # Delete the TFJob.
    tf_job_client.delete_tf_job(
      api_client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
Example #2
    def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        if shutdown_policy == "worker":
            tf_job_client.terminate_replicas(api_client, self.namespace,
                                             self.name, "worker", 1)
        else:
            tf_job_client.terminate_replicas(api_client, self.namespace,
                                             self.name, "chief", 1)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #3
    def run_distributed_training_job(self, component):
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Check for creation failures.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #4
    def test_train(self):
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        api_client = k8s_client.ApiClient()

        component = "tfjob"
        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        util.run([self.ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # Check for errors creating pods and services. Can potentially
        # help debug failed test runs.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return
Example #5
    def test_train(self):
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        api_client = k8s_client.ApiClient()

        # Setup parameters for kustomize
        # TODO(jinchihe): Should enhance here after the kustomize util created.
        configmap = 'mnist-map-gcs'
        for pair in self.params.split(","):
            k, v = pair.split("=", 1)
            if k == "namespace" or k == "image":
                util.run(["kustomize edit set", k, v], cwd=self.app_dir)
            elif k == "numPs":
                util.run(["./definition.sh --numPs", v], cwd=self.app_dir)
            elif k == "numWorkers":
                util.run(["./definition.sh --numWorkers", v], cwd=self.app_dir)
            elif k == "secret":
                secretName, secretMountPath = v.split("=", 1)
                util.run([
                    "kustomize", "edit", "add", "configmap", configmap,
                    "--from-literal=secretName=" + secretName
                ],
                         cwd=self.app_dir)
                util.run([
                    "kustomize", "edit", "add", "configmap", configmap,
                    "--from-literal=secretMountPath=" + secretMountPath
                ],
                         cwd=self.app_dir)
            elif k == "envVariables":
                var_k, var_v = v.split("=", 1)
                util.run([
                    "kustomize", "edit", "add", "configmap", configmap,
                    "--from-literal=" + var_k + "=" + var_v
                ],
                         cwd=self.app_dir)
            else:
                util.run([
                    "kustomize", "edit", "add", "configmap", configmap,
                    "--from-literal=" + k + "=" + v
                ],
                         cwd=self.app_dir)

        # Create the TF job.
        # util.run cannot handle shell pipes, so use check_call instead
        # (assumes subprocess is imported at module level).
        subCmd = "kustomize build " + self.app_dir + " | kubectl apply -f -"
        subprocess.check_call(subCmd, shell=True)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # Check for errors creating pods and services. Can potentially
        # help debug failed test runs.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return
Example #6
    def test_tfjob_and_verify_runconfig(self):
        api_client = k8s_client.ApiClient()
        masterHost = api_client.configuration.host
        component = COMPONENT_NAME + "_" + self.tfjob_version

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        num_ps = results.get("spec", {}).get("tfReplicaSpecs",
                                             {}).get("PS",
                                                     {}).get("replicas", 0)
        num_workers = results.get("spec",
                                  {}).get("tfReplicaSpecs",
                                          {}).get("Worker",
                                                  {}).get("replicas", 0)
        verify_runconfig(masterHost, self.namespace, self.name, "chief",
                         num_ps, num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "worker",
                         num_ps, num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "ps", num_ps,
                         num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "evaluator",
                         num_ps, num_workers)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #7
def test_training(
        record_xml_attribute,
        tfjob_name,
        namespace,
        trainer_image,
        num_ps,  #pylint: disable=too-many-arguments
        num_workers,
        train_steps,
        batch_size,
        learning_rate,
        model_dir,
        export_dir):

    util.set_pytest_junit(record_xml_attribute, "test_mnist")

    util.maybe_activate_service_account()

    app_dir = os.path.join(os.path.dirname(__file__), "../training/GCS")
    app_dir = os.path.abspath(app_dir)
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    # TODO (@jinchihe): Use kustomize 2.0.3 to work around the issue below:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
             'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kusUrl],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

    # TODO (@jinchihe): kubectl needs to be upgraded to 1.14.0 due to the
    # issue below: Invalid object doesn't have additional properties ...
    kusUrl = 'https://storage.googleapis.com/kubernetes-release/' \
             'release/v1.14.0/bin/linux/amd64/kubectl'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kusUrl],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

    # Configure custom parameters using kustomize
    util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
    util.run([
        'kustomize', 'edit', 'set', 'image', 'training-image=' + trainer_image
    ],
             cwd=app_dir)

    util.run(['../base/definition.sh', '--numPs', num_ps], cwd=app_dir)
    util.run(['../base/definition.sh', '--numWorkers', num_workers],
             cwd=app_dir)

    training_config = {
        "name": tfjob_name,
        "trainSteps": train_steps,
        "batchSize": batch_size,
        "learningRate": learning_rate,
        "modelDir": model_dir,
        "exportDir": export_dir,
    }

    configmap = 'mnist-map-training'
    for key, value in training_config.items():
        util.run([
            'kustomize', 'edit', 'add', 'configmap', configmap,
            '--from-literal=' + key + '=' + value
        ],
                 cwd=app_dir)

    # Create the TFJob.
    util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
             cwd=app_dir)
    util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
    logging.info("Created job %s in namespaces %s", tfjob_name, namespace)

    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
        api_client,
        namespace,
        tfjob_name,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, namespace, results)
    if creation_failures:
        logging.warning(creation_failures)

    if not tf_job_client.job_succeeded(results):
        failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
            tfjob_name, namespace, results.get("status", {}))
        logging.error(failure)

        # if the TFJob failed, print out the pod logs for debugging.
        pod_names = tf_job_client.get_pod_names(api_client, namespace,
                                                tfjob_name)
        logging.info("The Pods name:\n %s", pod_names)

        core_api = k8s_client.CoreV1Api(api_client)

        for pod in pod_names:
            logging.info("Getting logs of Pod %s.", pod)
            try:
                pod_logs = core_api.read_namespaced_pod_log(pod, namespace)
                logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
            except k8s_client.rest.ApiException as e:
                logging.info(
                    "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
                    e)
        return
Example #8
    def test_train(self):
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        api_client = k8s_client.ApiClient()

        # TODO (jinchihe): The code below will be removed once a new
        # test-worker image is published in
        # https://github.com/kubeflow/testing/issues/373.
        kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
                 'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
        util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl],
                 cwd=self.app_dir)
        util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'],
                 cwd=self.app_dir)

        # Setup parameters for kustomize
        configmap = 'mnist-map-training'
        for pair in self.params.split(","):
            k, v = pair.split("=", 1)
            if k == "namespace":
                util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
            elif k == "image":
                util.run(
                    ['kustomize', 'edit', 'set', k, 'training-image=' + v],
                    cwd=self.app_dir)
            elif k == "numPs":
                util.run(['../base/definition.sh', '--numPs', v],
                         cwd=self.app_dir)
            elif k == "numWorkers":
                util.run(['../base/definition.sh', '--numWorkers', v],
                         cwd=self.app_dir)
            elif k == "secret":
                secretName, secretMountPath = v.split("=", 1)
                util.run([
                    'kustomize', 'edit', 'add', 'configmap', configmap,
                    '--from-literal=secretName=' + secretName
                ],
                         cwd=self.app_dir)
                util.run([
                    'kustomize', 'edit', 'add', 'configmap', configmap,
                    '--from-literal=secretMountPath=' + secretMountPath
                ],
                         cwd=self.app_dir)
            elif k == "envVariables":
                var_k, var_v = v.split("=", 1)
                util.run([
                    'kustomize', 'edit', 'add', 'configmap', configmap,
                    '--from-literal=' + var_k + '=' + var_v
                ],
                         cwd=self.app_dir)
            else:
                util.run([
                    'kustomize', 'edit', 'add', 'configmap', configmap,
                    '--from-literal=' + k + '=' + v
                ],
                         cwd=self.app_dir)

        # Create the TF job
        # util.run cannot handle pipes, so use check_call instead.
        subCmd = 'kustomize build ' + self.app_dir + ' | kubectl apply -f -'
        subprocess.check_call(subCmd, shell=True)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # Check for errors creating pods and services. Can potentially
        # help debug failed test runs.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)

            # if the TFJob failed, print out the pod logs for debugging.
            pod_names = tf_job_client.get_pod_names(api_client, self.namespace,
                                                    self.name)
            logging.info("The Pods name:\n %s", pod_names)

            core_api = k8s_client.CoreV1Api(api_client)

            for pod in pod_names:
                logging.info("Getting logs of Pod %s.", pod)
                try:
                    pod_logs = core_api.read_namespaced_pod_log(
                        pod, self.namespace)
                    logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
                except k8s_client.rest.ApiException as e:
                    logging.info(
                        "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
                        e)
            return
Example #9
    def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # All pods are deleted.
        if clean_pod_policy == "All":
            pod_labels = tf_job_client.get_labels(self.name)
            pod_selector = tf_job_client.to_selector(pod_labels)
            k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                                 pod_selector)
        # Only running pods (PS) are deleted, completed pods are not.
        elif clean_pod_policy == "Running":
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Chief", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Worker", ["Succeeded"])
            pod_labels = tf_job_client.get_labels(self.name, "PS")
            pod_selector = tf_job_client.to_selector(pod_labels)
            k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                                 pod_selector)
        # No pods are deleted.
        elif clean_pod_policy == "None":
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Chief", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Worker", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "PS", ["Running"])

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #10
    def test_pod_names(self):
        api_client = k8s_client.ApiClient()
        component = COMPONENT_NAME + "_" + self.tfjob_version

        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)
        util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        job_specs = extract_job_specs(
            results.get("spec", {}).get("tfReplicaSpecs", {}))
        expected_pod_names = []
        for replica_type, replica_num in job_specs.items():
            logging.info("job_type = %s, replica = %s", replica_type,
                         replica_num)
            for i in range(replica_num):
                expected_pod_names.append(
                    POD_NAME_FORMAT.format(name=self.name,
                                           replica=replica_type,
                                           index=i))
        expected_pod_names = set(expected_pod_names)
        actual_pod_names = tf_job_client.get_pod_names(api_client,
                                                       self.namespace,
                                                       self.name)

        # We cannot guarantee that pods selected by namespace and job name
        # belong to this test run only, so we only do a partial check:
        # make sure the expected set of pod names is contained in the
        # selected pod names.
        if not (expected_pod_names & actual_pod_names) == expected_pod_names:
            msg = "Actual pod names don't match. Expected: {0} Actual: {1}".format(
                str(expected_pod_names), str(actual_pod_names))
            logging.error(msg)
            raise RuntimeError(msg)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)
        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)