def run_simple_tfjob(self, component):
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Check for creation failures.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            # TODO(jlewi): Starting with
            # https://github.com/kubeflow/tf-operator/pull/646 the number of events
            # no longer seems to match the expected; it looks like maybe events
            # are being combined? For now we just log a warning rather than an
            # error.
            logging.warning(creation_failures)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #2
0
    def run_distributed_training_job(self, component):
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        util.run(["ks", "apply", self.env, "-c", component], cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Check for creation failures.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Example #3
0
  def test_train(self):
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    api_client = k8s_client.ApiClient()

    component = "train"
    # Setup the ksonnet app
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)


    # Create the TF job
    util.run([self.ks_cmd, "apply", self.env, "-c", component],
             cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
          api_client,
          self.namespace,
          self.name,
          status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, self.namespace, results)
    if creation_failures:
      logging.warning(creation_failures)

    if not tf_job_client.job_succeeded(results):
      self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
          self.name, self.namespace, results.get("status", {}))
      logging.error(self.failure)
      return