Example #1
    def test_write_xml(self):
        with tempfile.NamedTemporaryFile(delete=False) as hf:
            pass

        success = test_util.TestCase()
        success.class_name = "some_test"
        success.name = "first"
        success.time = 10

        failure = test_util.TestCase()
        failure.class_name = "some_test"
        failure.name = "first"
        failure.time = 10
        failure.failure = "failed for some reason."

        test_util.create_junit_xml_file([success, failure], hf.name)
        with open(hf.name) as hf:
            output = hf.read()
            print(output)
        expected = (
            """<testsuite failures="1" tests="2" time="20">"""
            """<testcase classname="some_test" name="first" time="10" />"""
            """<testcase classname="some_test" """
            """failure="failed for some reason." name="first" """
            """time="10" /></testsuite>""")

        self.assertEqual(expected, output)
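
The assertion above pins down the XML that test_util.create_junit_xml_file is expected to emit. As a rough illustration only, here is a minimal sketch of such a helper; the names TestCase and create_junit_xml_file mirror the snippet, but this is an assumption-laden sketch, not the actual kubeflow implementation.

# Hedged sketch, not the real test_util module: a TestCase container and a
# writer that emits a <testsuite> element shaped like the expected string above.
from xml.etree import ElementTree


class TestCase(object):
    def __init__(self, class_name=None, name=None):
        self.class_name = class_name
        self.name = name
        self.time = 0
        self.failure = None


def create_junit_xml_file(test_cases, output_path):
    """Write the test cases as a junit-style XML file."""
    suite_attrib = {
        "failures": "{0}".format(sum(1 for c in test_cases if c.failure)),
        "tests": "{0}".format(len(test_cases)),
        "time": "{0}".format(sum(c.time for c in test_cases)),
    }
    root = ElementTree.Element("testsuite", suite_attrib)
    for c in test_cases:
        case_attrib = {
            "classname": c.class_name,
            "name": c.name,
            "time": "{0}".format(c.time),
        }
        if c.failure:
            case_attrib["failure"] = c.failure
        # Sort keys so the attribute order is stable across Python versions.
        ElementTree.SubElement(root, "testcase", dict(sorted(case_attrib.items())))
    ElementTree.ElementTree(root).write(output_path)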
Example #2
def test(args):
    """Run the tests."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)

    t = test_util.TestCase()
    try:
        start = time.time()
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        t.failure = "helm test failed;\n" + (e.output or "")
        # Reraise the exception so that the prow job will fail and the test
        # is marked as a failure.
        # TODO(jlewi): It would be better to do this holistically; e.g. by
        # processing all the junit xml files and checking for any failures. This
        # should be more tractable when we migrate off Airflow to Argo.
        raise
    finally:
        t.time = time.time() - start
        t.name = "e2e-test"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
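
The test above only reads args.project, args.cluster, args.zone, and args.junit_path. A hypothetical argparse setup covering just those fields might look like the sketch below; the real command line interface is not shown in this snippet and likely defines more options.

# Hypothetical parser for the fields used by test(args) above.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Run the helm e2e test.")
    parser.add_argument("--project", required=True, help="GCP project.")
    parser.add_argument("--cluster", required=True, help="GKE cluster name.")
    parser.add_argument("--zone", required=True, help="Zone of the cluster.")
    parser.add_argument(
        "--junit_path", required=True,
        help="Path (local or gs://) for the junit XML results.")
    return parser.parse_args()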
Example #3
def run_test(args):
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(args.spec)

    loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

    if not args.image_tag:
        raise ValueError("--image_tag must be provided.")

    logging.info("Loading spec from %s with image_tag=%s", args.spec,
                 args.image_tag)
    spec_contents = jinja2.Environment(loader=loader).get_template(
        os.path.basename(args.spec)).render(image_tag=args.image_tag)

    spec = yaml.safe_load(spec_contents)

    # Make the job name unique.
    spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]
    try:
        start = time.time()
        api_response = tf_job_client.create_tf_job(api_client, spec)
        namespace = api_response["metadata"]["namespace"]
        name = api_response["metadata"]["name"]

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results["status"]["state"] != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace, results["status"]["state"])

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if it's part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
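
Example #3 renders the job spec as a Jinja2 template parameterized by image_tag and then parses it with YAML. The following self-contained sketch shows that render-then-parse step in isolation, using a made-up inline template rather than a file on disk; the field names are placeholders, not a real TFJob spec.

# Illustration of the render-then-parse pattern used above.
import jinja2
import yaml

template = jinja2.Template(
    "metadata:\n"
    "  name: example-job\n"
    "spec:\n"
    "  image: gcr.io/some-project/trainer:{{ image_tag }}\n")
spec = yaml.safe_load(template.render(image_tag="v20180101"))
print(spec["spec"]["image"])  # gcr.io/some-project/trainer:v20180101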
Example #4
  def testSubprocessError(self):
    def run():
      raise subprocess.CalledProcessError(10, "some command", output="some output")

    t = test_util.TestCase()
    self.assertRaises(subprocess.CalledProcessError, test_util.wrap_test, run, t)
    self.assertGreater(t.time, 0)
    self.assertEquals("Subprocess failed;\nsome output", t.failure)
Example #5
  def testOk(self):
    def ok():
      time.sleep(1)

    t = test_util.TestCase()
    test_util.wrap_test(ok, t)
    self.assertGreater(t.time, 0)
    self.assertEqual(None, t.failure)
Example #6
    def testGeneralError(self):
        def run():
            raise ValueError("some error")

        t = test_util.TestCase()
        self.assertRaises(ValueError, test_util.wrap_test, run, t)
        self.assertGreater(t.time, 0)
        self.assertEquals("Test failed; some error", t.failure)
Example #7
def run_test(args, test_case):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = test_runner.setup_ks_app(args)
    t.name = os.path.basename(name)

    try:  # pylint: disable=too-many-nested-blocks
        util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

        logging.info("Created job %s in namespaces %s", name, namespace)

        logging.info("Wait for conditions Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            namespace,
            name, ["Succeeded", "Failed"],
            status_callback=tf_job_client.log_status)

        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # Check that the job's last condition indicates it failed.
        last_condition = results.get("status", {}).get("conditions", [])[-1]
        if last_condition.get("type", "").lower() != "failed":
            message = "Job {0} in namespace {1} did not fail; status {2}".format(
                name, namespace, results.get("status", {}))
            logging.error(message)
            test_case.add_failure_info(message)
            return

        pattern = ".*the spec is invalid.*"
        condition_message = last_condition.get("message", "")
        if not re.match(pattern, condition_message):
            message = "Condition message {0} did not match pattern {1}".format(
                condition_message, pattern)
            logging.error(message)
            test_case.add_failure_info(message)
    except tf_operator_util.JobTimeoutError as e:
        if e.job:
            spec = "Job:\n" + json.dumps(e.job, indent=2)
        else:
            spec = "JobTimeoutError did not contain job"
        message = ("Timeout waiting for {0} in namespace {1} to finish; "
                   ).format(name, namespace) + spec
        logging.exception(message)
        test_case.add_failure_info(message)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status";
        # in an effort to nail down this exception we print out more
        # information about it.
        message = "There was a problem running the job; Exception {0}".format(
            e)
        logging.exception(message)
        test_case.add_failure_info(message)
Example #8
def run_tests(args):
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): Perhaps we should get a list of submodules and exclude
    # them automatically?
    dir_excludes = ["kubeflow_testing", "vendor"]
    includes = ["*_test.py"]
    test_cases = []

    env = os.environ.copy()
    # TODO(jlewi): Once we switch to using Argo I think we can stop setting
    # the PYTHONPATH here and just inheriting it from the environment.
    # When we use ARGO each step will run in its own pod and we can set the
    # PYTHONPATH environment variable as needed for that pod.
    env["PYTHONPATH"] = (args.src_dir + ":" +
                         os.path.join(args.src_dir, "kubeflow_testing", "py"))

    num_failed = 0
    for root, dirs, files in os.walk(args.src_dir, topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        dirs[:] = [d for d in dirs if d not in dir_excludes]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)

                test_case = test_util.TestCase()
                test_case.class_name = "pytest"
                test_case.name = full_path[len(args.src_dir):]
                start_time = time.time()
                test_cases.append(test_case)
                try:
                    util.run(["python", full_path], cwd=args.src_dir, env=env)
                except subprocess.CalledProcessError:
                    test_case.failure = "{0} failed.".format(test_case.name)
                    num_failed += 1
                finally:
                    test_case.time = time.time() - start_time

    if num_failed:
        logging.error("%s tests failed.", num_failed)
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
Example #9
  def test_get_num_failures_success(self):
    success = test_util.TestCase()
    success.class_name = "some_test"
    success.name = "first"
    success.time = 10

    e = test_util.create_xml([success])
    s = StringIO.StringIO()
    e.write(s)
    xml_value = s.getvalue()
    self.assertEqual(0, test_util.get_num_failures(xml_value))
Example #10
  def test_get_num_failures(self):
    failure = test_util.TestCase()
    failure.class_name = "some_test"
    failure.name = "first"
    failure.time = 10
    failure.failure = "failed for some reason."

    e = test_util.create_xml([failure])
    s = StringIO.StringIO()
    e.write(s)
    xml_value = s.getvalue()
    self.assertEqual(1, test_util.get_num_failures(xml_value))
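
Examples #9 and #10 exercise test_util.get_num_failures against XML produced by test_util.create_xml. A minimal sketch of a parser with that behavior (reading the failures attribute of the testsuite element, as in the expected output of Example #1) might be the following; it is an assumption, not the real implementation.

# Hedged sketch: count failures by reading the testsuite's "failures" attribute.
from xml.etree import ElementTree


def get_num_failures(xml_value):
    root = ElementTree.fromstring(xml_value)
    return int(root.get("failures", 0))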
Example #11
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  env["PYTHONPATH"] = args.src_dir

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("No lint issues.")


  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)

  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
Example #12
def run_lint(args):
    start_time = time.time()
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    dir_excludes = ["vendor"]
    includes = ["*.py"]
    failed_files = []
    rc_file = os.path.join(args.src_dir, ".pylintrc")
    for root, dirs, files in os.walk(args.src_dir, topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        dirs[:] = [d for d in dirs if d not in dir_excludes]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rc_file, full_path],
                             cwd=args.src_dir)
                except subprocess.CalledProcessError:
                    failed_files.append(full_path[len(args.src_dir):])

    if failed_files:
        logging.error("%s files had lint errors.", len(failed_files))
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    test_case = test_util.TestCase()
    test_case.class_name = "pylint"
    test_case.name = "pylint"
    test_case.time = time.time() - start_time
    if failed_files:
        test_case.failure = "Files with lint issues: {0}".format(
            ", ".join(failed_files))

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
Example #13
def test(args):
    """Run the tests."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)

    t = test_util.TestCase()
    try:
        start = time.time()
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        t.failure = "helm test failed;\n" + e.output
    finally:
        t.time = time.time() - start
        t.name = "e2e-test"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #14
def setup_kubeflow(args):
    """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
    project = args.project
    cluster_name = args.cluster
    zone = args.zone

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        elif args.tf_job_version == "v1beta1":
            tf_job_deployment_name = "tf-job-operator-v1beta1"
        else:
            raise ValueError("Unrecognized value for tf_job_version %s" %
                             args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the correct
        # one.
        try:
            util.wait_for_deployment(api_client, args.namespace,
                                     tf_job_deployment_name)
        finally:
            # Run kubectl describe to get useful information about the deployment.
            # This will help troubleshoot any errors.
            util.run([
                "kubectl", "-n", args.namespace, "describe", "deploy",
                tf_job_deployment_name
            ])
            util.run([
                "kubectl", "-n", args.namespace, "describe", "pods", "-l",
                "name=tf-job-operator"
            ])

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #15
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    masterHost = api_client.configuration.host

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = _setup_ks_app(args)
    t.name = os.path.basename(name)

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            logging.info("tfjob_version=%s", args.tfjob_version)
            # Wait for the job to either be in Running state or a terminal state
            if args.tfjob_version == "v1alpha1":
                logging.info("Wait for Phase Running, Done, or Failed")
                results = tf_job_client.wait_for_phase(
                    api_client,
                    namespace,
                    name, ["Running", "Done", "Failed"],
                    status_callback=tf_job_client.log_status)
            else:
                logging.info(
                    "Wait for conditions Running, Succeeded, or Failed")
                results = tf_job_client.wait_for_condition(
                    api_client,
                    namespace,
                    name, ["Running", "Succeeded", "Failed"],
                    status_callback=tf_job_client.log_status)

            logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

            # The job is now either running or done.
            if args.shutdown_policy:
                logging.info("Enforcing shutdownPolicy %s",
                             args.shutdown_policy)
                if args.shutdown_policy in ["master", "chief"]:
                    if args.tfjob_version == "v1alpha1":
                        replica = "master"
                    else:
                        replica = "chief"
                elif args.shutdown_policy in ["worker", "all_workers"]:
                    replica = "worker"
                else:
                    raise ValueError("Unrecognized shutdown_policy "
                                     "%s" % args.shutdown_policy)

                # Number of targets.
                num_targets = 1
                if args.shutdown_policy in ["all_workers"]:
                    # Assume v1alpha2
                    num_targets = results.get("spec", {}).get(
                        "tfReplicaSpecs", {}).get("Worker",
                                                  {}).get("replicas", 0)
                    logging.info("There are %s worker replicas", num_targets)

                if args.tfjob_version == "v1alpha1":
                    runtime_id = results.get("spec", {}).get("RuntimeId")
                    target = "{name}-{replica}-{runtime}".format(
                        name=name, replica=replica, runtime=runtime_id)
                    pod_labels = get_labels(name, runtime_id)
                    pod_selector = to_selector(pod_labels)
                else:
                    target = "{name}-{replica}".format(name=name,
                                                       replica=replica)
                    pod_labels = get_labels_v1alpha2(namespace, name)
                    pod_selector = to_selector(pod_labels)

                # Wait for the pods to be ready before we shutdown
                # TODO(jlewi): We get pods using a label selector so there is
                # a risk that the pod we actually care about isn't present.
                logging.info(
                    "Waiting for pods to be running before shutting down.")
                wait_for_pods_to_be_in_phases(
                    api_client,
                    namespace,
                    pod_selector, ["Running"],
                    timeout=datetime.timedelta(minutes=4))
                logging.info("Pods are ready")
                logging.info("Issuing the terminate request")
                for num in range(num_targets):
                    full_target = target + "-{0}".format(num)
                    terminateReplica(masterHost, namespace, full_target)

            logging.info("Waiting for job to finish.")
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                args.tfjob_version,
                status_callback=tf_job_client.log_status)

            if args.tfjob_version == "v1alpha1":
                if results.get("status", {}).get("state",
                                                 {}).lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                        trial, name, namespace,
                        results.get("status", {}).get("state", None))
                    logging.error(t.failure)
                    break
            else:
                # For v1alpha2 check for non-empty completionTime
                last_condition = results.get("status",
                                             {}).get("conditions", [])[-1]
                if last_condition.get("type", "").lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
                        trial, name, namespace, results.get("status", {}))
                    logging.error(t.failure)
                    break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            for e in events:
                logging.info("K8s event: %s", e.message)

            # Print out the K8s events because it can be useful for debugging.
            for e in events:
                logging.info("Recieved K8s Event:\n%s", e)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            if args.tfjob_version == "v1alpha1":
                for replica in results.get("spec", {}).get("replicaSpecs", []):
                    num_expected += replica.get("replicas", 0)
            else:
                for replicakey in results.get("spec",
                                              {}).get("tfReplicaSpecs", {}):
                    replica_spec = results.get("spec",
                                               {}).get("tfReplicaSpecs",
                                                       {}).get(replicakey, {})
                    if replica_spec:
                        num_expected += replica_spec.get("replicas", 1)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                # TODO(jlewi): Starting with
                # https://github.com/kubeflow/tf-operator/pull/646 the number of events
                # no longer seems to match the expected; it looks like maybe events
                # are being combined? For now we just log a warning rather than an
                # error.
                logging.warning(creation_failures)
            if args.tfjob_version == "v1alpha1":
                pod_labels = get_labels(name, runtime_id)
                pod_selector = to_selector(pod_labels)
            else:
                pod_labels = get_labels_v1alpha2(name)
                pod_selector = to_selector(pod_labels)

            # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
            # means completed pods won't be deleted.
            # TODO(jlewi): We should add a test to deal with deleted pods.
            if args.tfjob_version == "v1alpha1":
                wait_for_pods_to_be_deleted(api_client, namespace,
                                            pod_selector)

            tf_job_client.delete_tf_job(api_client,
                                        namespace,
                                        name,
                                        version=args.tfjob_version)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            args.tfjob_version,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.exception(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status";
        # in an effort to nail down this exception we print out more
        # information about it.
        logging.exception("There was a problem running the job; Exception %s",
                          e)
        # We want to catch all exceptions because we want to mark the test as failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #16
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount":
        accelerator_count,
        "acceleratorType":
        accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion":  args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user=" + account
    ])

    # (Deployment steps elided in the original snippet.)

    # Verify that the TfJob operator is actually deployed.
    if args.tf_job_version == "v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct one.
    util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #17
def run_lint(args):
    start_time = time.time()
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): Perhaps we should get a list of submodules and exclude
    # them automatically?
    dir_excludes = [
        "dashboard/frontend/node_modules",
        "kubeflow_testing",
        "test/test-app",
        "vendor",
    ]
    full_dir_excludes = [
        os.path.join(os.path.abspath(args.src_dir), f) for f in dir_excludes
    ]
    includes = ["*.py"]
    failed_files = []
    rc_file = os.path.join(args.src_dir, ".pylintrc")
    for root, dirs, files in os.walk(os.path.abspath(args.src_dir),
                                     topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        exclude = False
        for e in full_dir_excludes:
            if root.startswith(e):
                exclude = True
                break
        if exclude:
            continue

        dirs[:] = [d for d in dirs]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rc_file, full_path],
                             cwd=args.src_dir)
                except subprocess.CalledProcessError:
                    failed_files.append(full_path[len(args.src_dir):])

    if failed_files:
        failed_files.sort()
        logging.error("%s files had lint errors:\n%s", len(failed_files),
                      "\n".join(failed_files))
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    test_case = test_util.TestCase()
    test_case.class_name = "pylint"
    test_case.name = "pylint"
    test_case.time = time.time() - start_time
    if failed_files:
        test_case.failure = "Files with lint issues: {0}".format(
            ", ".join(failed_files))

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
Example #18
def setup(args):
    """Test deploying Kubeflow."""
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        source = os.path.join(args.test_dir, "src", "kubeflow")
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e.message)
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #19
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                status_callback=tf_job_client.log_status)

            if results.get("status", {}).get("state",
                                             {}).lower() != "succeeded":
                t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                    trial, name, namespace,
                    results.get("status", {}).get("state", None))
                logging.error(t.failure)
                break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            for replica in results.get("spec", {}).get("replicaSpecs", []):
                num_expected += replica.get("replicas", 0)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
                    trial, name, namespace, ", ".join(creation_failures))
                logging.error(t.failure)
                break
            pod_labels = get_labels(name, runtime_id)
            pod_selector = to_selector(pod_labels)

            wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

            tf_job_client.delete_tf_job(api_client, namespace, name)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.error(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status";
        # in an effort to nail down this exception we print out more
        # information about it.
        logging.error("There was a problem running the job; Exception %s", e)
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want to mark the test as failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #20
def run_test(args):
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results.get("status", {}).get("state", {}).lower() != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace,
                results.get("status", {}).get("state", None))

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if it's part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status";
        # in an effort to nail down this exception we print out more
        # information about it.
        logging.error("There was a problem running the job; Exception %s", e)
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want to mark the test as failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #21
def setup(args):
    """Test deploying Kubeflow."""
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        # TODO(jlewi): In presubmits we probably want to change this so we can
        # pull the changes on a branch. It's not clear whether that's well supported
        # in Ksonnet yet.
        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        # TODO(jlewi): For presubmits how do we pull the package from the desired
        # branch at the desired commit.
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
            with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
                key = json.load(hf)
                apply_command.append("--as=" + key["client_email"])
        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e.message)
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #22
def setup(args):
    """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    chart = args.chart
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
            # TODO(jlewi): Stop pinning GKE version once 1.8 becomes the default.
            "initialClusterVersion": "1.8.1-gke.1",
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split(
                "=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount":
                accelerator_count,
                "acceleratorType":
                accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    util.setup_cluster(api_client)

    # Create the GCS client unconditionally; it is also needed below when
    # writing the junit file even if the chart isn't fetched from GCS.
    gcs_client = storage.Client(project=project)
    if chart.startswith("gs://"):
        remote = chart
        chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
        bucket_name, path = util.split_gcs_uri(remote)

        bucket = gcs_client.get_bucket(bucket_name)
        blob = bucket.blob(path)
        logging.info("Downloading %s to %s", remote, chart)
        blob.download_to_filename(chart)

    t = test_util.TestCase()
    try:
        start = time.time()
        util.run([
            "helm", "install", chart, "-n", "tf-job", "--wait", "--replace",
            "--set", "rbac.install=true,cloud=gke"
        ])
    except subprocess.CalledProcessError as e:
        t.failure = "helm install failed;\n" + e.output
    finally:
        t.time = time.time() - start
        t.name = "helm-tfjob-install"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #23
def setup_cluster(args):
    """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount": accelerator_count,
                "acceleratorType": accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user="******"setup-cluster failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "setup-cluster"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
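# The util.run and util.run_and_output helpers used throughout these examples
# are thin subprocess wrappers. A minimal sketch of what they might look like
# follows; the real util module in the source repository may add logging of
# output, timeouts, and retries, so treat this as an assumption.
import logging
import subprocess

def run(command, cwd=None):
    """Run a command, raising CalledProcessError on a non-zero exit."""
    logging.info("Running: %s", " ".join(command))
    subprocess.check_call(command, cwd=cwd)

def run_and_output(command, cwd=None):
    """Run a command and return its stdout as a string."""
    logging.info("Running: %s", " ".join(command))
    return subprocess.check_output(command, cwd=cwd).decode("utf-8")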
Example #24
0
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                status_callback=tf_job_client.log_status)

            if results.get("status", {}).get("state",
                                             {}).lower() != "succeeded":
                t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                    trial, name, namespace,
                    results.get("status", {}).get("state", None))
                logging.error(t.failure)
                break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            # TODO(jlewi): We should check that pods were created for each replica
            pod_labels = get_labels(name, runtime_id)
            pod_selector = to_selector(pod_labels)
            pods = list_pods(api_client, namespace, pod_selector)

            logging.info("Trial %s selector: %s matched %s pods", trial,
                         pod_selector, len(pods.items))

            if not pods.items:
                t.failure = (
                    "Trial {0} Job {1} in namespace {2} no pods found for "
                    "selector {3}").format(trial, name, namespace,
                                           pod_selector)
                logging.error(t.failure)
                break

            tf_job_client.delete_tf_job(api_client, namespace, name)

            wait_for_delete(api_client,
                            namespace,
                            name,
                            status_callback=tf_job_client.log_status)

            # Verify the pods have been deleted. tf_job_client uses foreground
            # deletion so there shouldn't be any resources for the job left
            # once the job is gone.
            pods = list_pods(api_client, namespace, pod_selector)

            logging.info("Trial %s selector: %s matched %s pods", trial,
                         pod_selector, len(pods.items))

            if pods.items:
                t.failure = (
                    "Trial {0} Job {1} in namespace {2} pods found for "
                    "selector {3}; pods\n{4}").format(trial, name, namespace,
                                                      pod_selector, pods)
                logging.error(t.failure)
                break

            logging.info("Trial %s all pods deleted.", trial)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run?
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.error(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status".
        # In an effort to nail down this exception we print out more
        # information about it.
        logging.error("There was a problem running the job; Exception %s", e)
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions so that the test is marked as failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
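# Example #24 relies on get_labels, to_selector, and list_pods helpers defined
# elsewhere in the same module. The rough sketch below shows what they might
# look like, assuming the official kubernetes Python client; the label keys
# are assumptions about what the TFJob operator attaches to pods, not taken
# from the original source.
from kubernetes import client as k8s_client

def get_labels(name, runtime_id):
    """Assumed labels attached to the pods of a TFJob."""
    return {"tf_job_name": name, "runtime_id": runtime_id}

def to_selector(labels):
    """Convert a dict of labels into a Kubernetes label selector string."""
    return ",".join("{0}={1}".format(k, v) for k, v in labels.items())

def list_pods(api_client, namespace, label_selector):
    """List the pods in a namespace that match the label selector."""
    core_api = k8s_client.CoreV1Api(api_client)
    return core_api.list_namespaced_pod(namespace, label_selector=label_selector)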