def run_test(args):
  """Run a single TfJob test and record the result as a junit XML file.

  Renders the jinja2 job spec at args.spec with args.image_tag, submits it
  as a TfJob, waits for the job to finish, and writes the junit result
  (optionally to GCS via args.junit_path).

  Args:
    args: Parsed command line arguments; uses project, cluster, zone, spec,
      image_tag and junit_path.

  Raises:
    ValueError: If --image_tag was not supplied.
  """
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(args.spec)

  loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

  if not args.image_tag:
    raise ValueError("--image_tag must be provided.")

  logging.info("Loading spec from %s with image_tag=%s", args.spec,
               args.image_tag)
  spec_contents = jinja2.Environment(loader=loader).get_template(
    os.path.basename(args.spec)).render(image_tag=args.image_tag)

  # Use safe_load: the spec is plain data and yaml.load can construct
  # arbitrary Python objects via YAML tags.
  spec = yaml.safe_load(spec_contents)

  # Make the job name unique.
  spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]

  # Initialize these before the try block so the TimeoutError handler and
  # finally clause can't hit a NameError if job creation itself fails.
  name = None
  namespace = None
  start = time.time()
  try:
    api_response = tf_job_client.create_tf_job(api_client, spec)
    namespace = api_response["metadata"]["namespace"]
    name = api_response["metadata"]["name"]
    logging.info("Created job %s in namespaces %s", name, namespace)
    results = tf_job_client.wait_for_job(
      api_client, namespace, name, status_callback=tf_job_client.log_status)

    if results["status"]["state"] != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
        name, namespace, results["status"]["state"])

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check tensorboard is created if its part of the job spec.
    #  2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run.
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Test deploying Kubeflow.

  Creates (or attaches to) a K8s cluster, deploys Kubeflow from a fresh
  ksonnet app into a per-run namespace, verifies the TfJob operator and
  JupyterHub come up, then tears the namespace down and writes junit
  results for both the deploy and the teardown.

  Args:
    args: Parsed command line arguments; uses cluster/project/zone,
      test_dir, github_token and artifacts_dir.
  """
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  # Label used both for logging and as the per-run namespace name.
  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    """Deploy Kubeflow into a fresh namespace and verify it starts."""
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
      "ks",
      "init",
      app_name,
    ], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")
    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)
    source = os.path.join(args.test_dir, "src", "kubeflow")
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper
    # service account. This might have something to do with the way ksonnet
    # gets its credentials; maybe we need to configure credentials after
    # calling ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
      "ks",
      "apply",
      "default",
      "-c",
      "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      # Use str(e): Exception.message does not exist on Python 3.
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, str(e))
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a TfJob test over multiple trials and record a junit result.

  For each trial the ksonnet component is applied, the job is waited on,
  pod/service creation events are validated against the replica spec, and
  the job is deleted again so the next trial can reuse the same name.

  Args:
    args: Parsed command line arguments; uses project, cluster, zone,
      app_dir, component, params and junit_path.

  Raises:
    ValueError: If the "name" or "namespace" parameter is missing from
      args.params.
  """
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespaces %s", name, namespace)
      results = tf_job_client.wait_for_job(
        api_client, namespace, name, status_callback=tf_job_client.log_status)

      # Default to "" (not {}): a missing state would otherwise raise
      # AttributeError on .lower() instead of recording a failure.
      if results.get("status", {}).get("state", "").lower() != "succeeded":
        t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
          trial, name, namespace,
          results.get("status", {}).get("state", None))
        logging.error(t.failure)
        break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      # Check that a create event was recorded for every expected pod and
      # service of the job.
      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      for replica in results.get("spec", {}).get("replicaSpecs", []):
        num_expected += replica.get("replicas", 0)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
          trial, name, namespace, ", ".join(creation_failures))
        logging.error(t.failure)
        break

      pod_labels = get_labels(name, runtime_id)
      pod_selector = to_selector(pod_labels)
      wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(api_client, namespace, name)

      logging.info("Waiting for job %s in namespaces %s to be deleted.", name,
                   namespace)
      wait_for_delete(api_client, namespace, name,
                      status_callback=tf_job_client.log_status)

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run.
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
    logging.error(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status" in an effort to try to nail down this exception we print out
    # more information about the exception.
    # Use str(e): Exception.message does not exist on Python 3.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
      "There was a problem running the job; Exception "
      "message: %s", str(e))
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want the test as failed.
    t.failure = ("Exception occured; type {0} message {1}".format(
      e.__class__, str(e)))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):
  """Run a single TfJob test via ksonnet and record a junit result.

  Creates a fresh ksonnet environment, sets the component parameters from
  args.params, applies the component, waits for the job and writes the
  junit file (optionally to GCS).

  Args:
    args: Parsed command line arguments; uses project, cluster, zone,
      app_dir, component, params and junit_path.

  Raises:
    ValueError: If the "name" or "namespace" parameter is missing from
      args.params.
  """
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    logging.info("Created job %s in namespaces %s", name, namespace)
    results = tf_job_client.wait_for_job(
      api_client, namespace, name, status_callback=tf_job_client.log_status)

    # Default to "" (not {}): a missing state would otherwise raise
    # AttributeError on .lower() instead of recording a failure.
    if results.get("status", {}).get("state", "").lower() != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
        name, namespace, results.get("status", {}).get("state", None))

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check tensorboard is created if its part of the job spec.
    #  2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run.
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status" in an effort to try to nail down this exception we print out
    # more information about the exception.
    # Use str(e): Exception.message does not exist on Python 3.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
      "There was a problem running the job; Exception "
      "message: %s", str(e))
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want the test as failed.
    t.failure = ("Exception occured; type {0} message {1}".format(
      e.__class__, str(e)))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Test deploying Kubeflow.

  Deploys Kubeflow from a fresh ksonnet app into a per-run namespace,
  verifies the TfJob operator and JupyterHub start, then deletes the
  namespace and writes junit results for the deploy and the teardown.

  Args:
    args: Parsed command line arguments; uses cluster/project/zone,
      test_dir, github_token and artifacts_dir.
  """
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  # Label used both for logging and as the per-run namespace name.
  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    """Deploy Kubeflow into a fresh namespace and verify it starts."""
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
      "ks",
      "init",
      app_name,
    ], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    # TODO(jlewi): In presubmits we probably want to change this so we can
    # pull the changes on a branch. Its not clear whether that's well
    # supported in Ksonnet yet.
    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    # TODO(jlewi): For presubmits how do we pull the package from the desired
    # branch at the desired commit.
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Deploy Kubeflow
    util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    apply_command = [
      "ks",
      "apply",
      "default",
      "-c",
      "kubeflow-core",
    ]

    # When running with a service account, impersonate its email so that
    # ks apply has the expected RBAC identity.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
      with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
        key = json.load(hf)
        apply_command.append("--as=" + key["client_email"])
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      # Use str(e): Exception.message does not exist on Python 3.
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, str(e))
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster (optionally with accelerators), configures kubectl,
  installs the tf-job helm chart (downloading it from GCS when the chart
  path is a gs:// URI), and writes a junit result for the install step.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  chart = args.chart
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
      # TODO(jlewi): Stop pinning GKE version once 1.8 becomes the default.
      "initialClusterVersion": "1.8.1-gke.1",
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount": accelerator_count,
        "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  util.setup_cluster(api_client)

  # Create the GCS client unconditionally: it is needed to write the junit
  # file below even when the chart is a local path (previously this was only
  # assigned inside the gs:// branch, causing a NameError for local charts).
  gcs_client = storage.Client(project=project)

  if chart.startswith("gs://"):
    remote = chart
    chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
    bucket_name, path = util.split_gcs_uri(remote)
    bucket = gcs_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    logging.info("Downloading %s to %s", remote, chart)
    blob.download_to_filename(chart)

  t = test_util.TestCase()
  start = time.time()
  try:
    util.run([
      "helm", "install", chart, "-n", "tf-job", "--wait", "--replace",
      "--set", "rbac.install=true,cloud=gke"
    ])
  except subprocess.CalledProcessError as e:
    # e.output can be None; guard so string concatenation can't raise.
    t.failure = "helm install failed;\n" + (e.output or "")
  finally:
    t.time = time.time() - start
    t.name = "helm-tfjob-install"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split(
        "=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount": accelerator_count,
        "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()
    # params/component appear to feed a ks deploy step that is inside the
    # redacted region below — TODO confirm against the original source.
    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    # NOTE(review): the text between "--user=" and "tf-job-operator" below
    # appears to have been redacted/masked ("******") in this copy of the
    # source; the missing code presumably binds cluster-admin to `account`
    # and deploys the core component, and likely defines
    # tf_job_deployment_name — restore from the original before running.
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user="******"tf-job-operator"
    logging.info("Verifying TfJob controller started.")

    # TODO(jlewi): We should verify the image of the operator is the correct.
    util.wait_for_deployment(api_client, args.namespace,
                             tf_job_deployment_name)

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    # NOTE(review): e.message is Python 2 only; str(e) would be the
    # Python 3 safe equivalent.
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run(args, file_handler):
  """Submit the E2E Argo workflow via ksonnet and wait for it to finish.

  Configures gcloud/kubectl credentials, derives a unique workflow name from
  the prow job environment variables, sets the ksonnet parameters, applies
  the workflow component, waits for the Argo workflow, and uploads the
  started/finished markers and build log to GCS.

  Args:
    args: Parsed command line arguments; uses bucket, project, zone, cluster.
    file_handler: logging FileHandler whose file is uploaded as build-log.txt.

  Returns:
    True if the workflow phase was "Succeeded", False otherwise.
  """
  src_dir = _get_src_dir()
  logging.info("Source directory: %s", src_dir)
  app_dir = os.path.join(src_dir, "test-infra")
  create_started_file(args.bucket)
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])
  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()
  # Create the name for the workflow
  # Presumably JOB_NAME/JOB_TYPE are set by prow — if JOB_NAME is unset the
  # += below would raise TypeError; TODO confirm prow always sets it.
  workflow_name = os.getenv("JOB_NAME")
  job_type = os.getenv("JOB_TYPE")
  if job_type == "presubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
  elif job_type == "postsubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA"))
  workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))
  # Add some salt. This is mostly a convenience for the case where you
  # are submitting jobs manually for testing/debugging. Since the prow should
  # vend unique build numbers for each job.
  workflow_name += "-{0}".format(uuid.uuid4().hex[0:4])
  util.run(["ks", "param", "set", "workflows", "name", workflow_name],
           cwd=app_dir)
  # NOTE(review): load_kube_config was already called above; this second
  # call looks redundant — confirm whether it is needed before removing.
  util.load_kube_config()
  api_client = k8s_client.ApiClient()
  # Set the prow environment variables.
  # prow_env is passed through as a single comma-separated k=v string so the
  # workflow pods can reconstruct the prow context.
  prow_env = []
  names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
           "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
           "REPO_NAME"]
  names.sort()
  for v in names:
    if not os.getenv(v):
      # Skip unset variables so the string only carries real values.
      continue
    prow_env.append("{0}={1}".format(v, os.getenv(v)))
  util.run(["ks", "param", "set", COMPONENT, "prow_env", ",".join(prow_env)],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "namespace", NAMESPACE],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "bucket", args.bucket],
           cwd=app_dir)
  # For debugging print out the manifest
  util.run(["ks", "show", "prow", "-c", COMPONENT], cwd=app_dir)
  util.run(["ks", "apply", "prow", "-c", COMPONENT], cwd=app_dir)
  success = False
  try:
    results = argo_client.wait_for_workflow(
      api_client, NAMESPACE, workflow_name,
      status_callback=argo_client.log_status)
    if results["status"]["phase"] == "Succeeded":
      success = True
    logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                 workflow_name, results["status"]["phase"])
  except util.TimeoutError:
    success = False
    logging.error("Time out waiting for Workflow %s/%s to finish", NAMESPACE,
                  workflow_name)
  finally:
    # Always record the finished marker so prow/gubernator sees a result.
    create_finished_file(args.bucket, success)
  # Upload logs to GCS. No logs after this point will appear in the
  # file in gcs
  file_handler.flush()
  upload_file_to_gcs(
    file_handler.baseFilename,
    os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))
  return success
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a TfJob test over multiple trials and record a junit result.

  For each trial the ksonnet component is applied, the job is waited on,
  pods matching the job's labels are checked to exist, the job is deleted,
  and the pods are verified to be garbage collected before the next trial.

  Args:
    args: Parsed command line arguments; uses project, cluster, zone,
      app_dir, component, params and junit_path.

  Raises:
    ValueError: If the "name" or "namespace" parameter is missing from
      args.params.
  """
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespaces %s", name, namespace)
      results = tf_job_client.wait_for_job(
        api_client, namespace, name, status_callback=tf_job_client.log_status)

      # Default to "" (not {}): a missing state would otherwise raise
      # AttributeError on .lower() instead of recording a failure.
      if results.get("status", {}).get("state", "").lower() != "succeeded":
        t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
          trial, name, namespace,
          results.get("status", {}).get("state", None))
        logging.error(t.failure)
        break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      # TODO(jlewi): We should check that pods were created for each replica
      pod_labels = get_labels(name, runtime_id)
      pod_selector = to_selector(pod_labels)
      pods = list_pods(api_client, namespace, pod_selector)

      logging.info("Trial %s selector: %s matched %s pods", trial,
                   pod_selector, len(pods.items))
      if not pods.items:
        t.failure = ("Trial {0} Job {1} in namespace {2} no pods found for "
                     " selector {3}").format(trial, name, namespace,
                                             pod_selector)
        logging.error(t.failure)
        break

      tf_job_client.delete_tf_job(api_client, namespace, name)

      wait_for_delete(api_client, namespace, name,
                      status_callback=tf_job_client.log_status)

      # Verify the pods have been deleted. tf_job_client uses foreground
      # deletion so there shouldn't be any resources for the job left
      # once the job is gone.
      pods = list_pods(api_client, namespace, pod_selector)

      logging.info("Trial %s selector: %s matched %s pods", trial,
                   pod_selector, len(pods.items))

      if pods.items:
        t.failure = ("Trial {0} Job {1} in namespace {2} pods found for "
                     " selector {3}; pods\n{4}").format(trial, name, namespace,
                                                        pod_selector, pods)
        logging.error(t.failure)
        break

      logging.info("Trial %s all pods deleted.", trial)

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run.
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
    logging.error(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status" in an effort to try to nail down this exception we print out
    # more information about the exception.
    # Use str(e): Exception.message does not exist on Python 3.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
      "There was a problem running the job; Exception "
      "message: %s", str(e))
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want the test as failed.
    t.failure = ("Exception occured; type {0} message {1}".format(
      e.__class__, str(e)))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)