def test_katib_is_ready(record_xml_attribute, namespace):
  """Test that Katib was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  deployment_names = [
    "katib-controller",
    "katib-mysql",
    "katib-db-manager",
    "katib-ui",
  ]
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)
def check_statefulsets_ready(record_xml_attribute, namespace, name, stateful_sets):
  """Test that the Kubeflow stateful sets are successfully deployed.

  Args:
    namespace: The namespace to check.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  for set_name in stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", namespace,
                 set_name)
    try:
      util.wait_for_statefulset(api_client, namespace, set_name)
    except:
      # Collect debug information by running describe.
      util.run(["kubectl", "-n", namespace, "describe", "statefulsets",
                set_name])
      raise Exception(f"Stateful set {namespace}.{set_name} is not ready")
def handle_retriable_exception(exception):
  if isinstance(exception, rest.ApiException):
    # ApiException could store the status code in status or it might
    # store it in the HTTP response body.
    # see: https://github.com/kubernetes-client/python/blob/5e512ff564c244c50cab780d821542ed56aa965a/kubernetes/client/rest.py#L289 # pylint: disable=line-too-long
    code = None
    if exception.body:
      if isinstance(exception.body, six.string_types):
        body = {}
        try:
          logging.info("Parsing ApiException body: %s", exception.body)
          body = json.loads(exception.body)
        except json.JSONDecodeError as e:
          logging.error("Error parsing body: %s", e)
      else:
        body = exception.body
      code = body.get("code")
    else:
      code = exception.status

    # UNAUTHORIZED and FORBIDDEN errors can be an indication we need to
    # refresh credentials.
    logging.info("ApiException code=%s", code)
    # TODO(jlewi): In python3 we can switch to using http.HTTPStatus.
    if code in [httplib.UNAUTHORIZED, httplib.FORBIDDEN,
                httplib.GATEWAY_TIMEOUT]:
      # Due to https://github.com/kubernetes-client/python-base/issues/59,
      # we need to reload the kube config (which refreshes the GCP token).
      # TODO(richardsliu): Remove this workaround when the k8s client issue
      # is resolved.
      util.load_kube_config()
      return True
  return not isinstance(exception, util.TimeoutError)
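# A minimal sketch (not from the original source) of how a predicate like
# handle_retriable_exception is typically wired up with the `retrying` library.
# The decorated helper, its arguments, and the CRD coordinates are assumptions
# used purely for illustration.
from retrying import retry

@retry(retry_on_exception=handle_retriable_exception,
       wait_exponential_multiplier=1000,   # back off 1s, 2s, 4s, ...
       stop_max_delay=5 * 60 * 1000)       # give up after 5 minutes
def list_tfjobs_with_retries(crd_api, namespace):
  # crd_api is assumed to be a k8s_client.CustomObjectsApi instance.
  # Each attempt re-runs this body; 401/403 errors trigger a kube config
  # reload inside handle_retriable_exception before the retry.
  return crd_api.list_namespaced_custom_object(
    "kubeflow.org", "v1", namespace, "tfjobs")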
def test_deploy_pytorchjob(record_xml_attribute, kfctl_repo_path, namespace):
  """Deploy PytorchJob."""
  util.load_kube_config()
  util.load_kube_credentials()
  logging.info("using kfctl repo: %s" % kfctl_repo_path)
  util.run([
    "kubectl", "apply", "-f",
    os.path.join(kfctl_repo_path,
                 "py/kubeflow/kfctl/testing/pytests/testdata/pytorch_job.yaml")
  ])
  api_client = k8s_client.ApiClient()
  api = k8s_client.CoreV1Api(api_client)

  # If the call throws exception, let it emit as an error case.
  resp = api.list_namespaced_pod(namespace)
  names = {
    "pytorch-mnist-ddp-cpu-master-0": False,
    "pytorch-mnist-ddp-cpu-worker-0": False,
  }

  for pod in resp.items:
    name = pod.metadata.name
    if name in names:
      names[name] = True

  msg = []
  for n in names:
    if not names[n]:
      msg.append("pod %s is not found" % n)
  if msg:
    raise ValueError("; ".join(msg))
def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials and KUBECONFIG more consistent
  # between GKE and minikube and eventually this could be extended to other
  # K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    num_trials = args.num_trials
    logging.info("tfjob_version=%s", args.tfjob_version)

    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      test_func()

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    test_case.failure = "Timeout waiting for job to finish: " + spec
    logging.exception(test_case.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as
    # failed.
    test_case.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, e.message))
  finally:
    test_case.time = time.time() - start

    if args.artifacts_path:
      test_util.create_junit_xml_file(
        [test_case],
        args.artifacts_path + "/junit_" + test_func.__name__ + ".xml",
        gcs_client)
def test_wait_for_deployment(test_case):  # pylint: disable=redefined-outer-name
  args = parse_args()
  util.maybe_activate_service_account()
  util.load_kube_config()
  end_time = datetime.datetime.now() + datetime.timedelta(0, args.timeout * 60)
  wait_for_resource("crd/tfjobs.kubeflow.org", end_time)
  wait_for_resource("crd/pytorchjobs.kubeflow.org", end_time)
  logging.info("Found all resources successfully")
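# A hedged sketch (not the repo's actual implementation) of the wait_for_resource
# helper the test above relies on: poll `kubectl get` until the resource exists
# or end_time passes. The signature matches the calls above; the polling
# behavior, imports, and error type are assumptions for illustration only.
import datetime
import logging
import subprocess
import time

def wait_for_resource(resource, end_time, poll_interval_seconds=10):
  """Poll `kubectl get <resource>` until it exists or end_time passes."""
  while True:
    # Returns 0 once the CRD (or other resource) is registered.
    returncode = subprocess.call(["kubectl", "get", resource])
    if returncode == 0:
      logging.info("Found resource %s", resource)
      return
    if datetime.datetime.now() >= end_time:
      raise RuntimeError("Timed out waiting for resource %s" % resource)
    time.sleep(poll_interval_seconds)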
def run_test(args, test_case):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  util.load_kube_config()
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = test_runner.setup_ks_app(args)
  t.name = os.path.basename(name)

  try:  # pylint: disable=too-many-nested-blocks
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)
    logging.info("Created job %s in namespaces %s", name, namespace)

    logging.info("Wait for conditions Failed")
    results = tf_job_client.wait_for_condition(
      api_client, namespace, name, ["Succeeded", "Failed"],
      status_callback=tf_job_client.log_status)

    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # The job is expected to fail; check that the last condition is Failed.
    last_condition = results.get("status", {}).get("conditions", [])[-1]
    if last_condition.get("type", "").lower() != "failed":
      message = "Job {0} in namespace {1} did not fail; status {2}".format(
        name, namespace, results.get("status", {}))
      logging.error(message)
      test_case.add_failure_info(message)
      return

    pattern = ".*the spec is invalid.*"
    condition_message = last_condition.get("message", "")
    if not re.match(pattern, condition_message):
      message = "Condition message {0} did not match pattern {1}".format(
        condition_message, pattern)
      logging.error(message)
      test_case.add_failure_info(message)
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    message = ("Timeout waiting for {0} in namespace {1} to finish; "
               ).format(name, namespace) + spec
    logging.exception(message)
    test_case.add_failure_info(message)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    message = "There was a problem running the job; Exception {0}".format(e)
    logging.exception(message)
    test_case.add_failure_info(message)
def test_kf_is_ready(namespace, use_basic_auth):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployment_names = [
    "argo-ui",
    "centraldashboard",
    "cert-manager",
    "cloud-endpoints-controller",
    "jupyter-web-app",
    "ml-pipeline",
    "ml-pipeline-scheduledworkflow",
    "ml-pipeline-ui",
    "notebooks-controller",
    "tf-job-operator",
    "profiles",
    "pytorch-operator",
    "studyjob-controller",
    "workflow-controller",
  ]

  stateful_sets = [
    "backend-updater",
  ]

  if use_basic_auth:
    deployment_names.extend(["basic-auth"])
  else:
    deployment_names.extend(["iap-enabler"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name)

  for name in stateful_sets:
    logging.info("Verifying that statefulset %s started...", name)
    util.wait_for_statefulset(api_client, namespace, name)
def create_k8s_client():
  # We need to load the kube config so that we can have credentials to
  # talk to the APIServer.
  util.load_kube_config(persist_config=False)

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  return api_client
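# A minimal usage sketch (not part of the original source): the ApiClient
# returned by create_k8s_client can be handed to any generated API class.
# The "kubeflow" namespace is an illustrative assumption; logging and
# k8s_client are the module-level imports already used above.
api_client = create_k8s_client()
core_api = k8s_client.CoreV1Api(api_client)
for pod in core_api.list_namespaced_pod("kubeflow").items:
  logging.info("Found pod %s in phase %s", pod.metadata.name,
               pod.status.phase)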
def cleanup_workflows(args):
  logging.info("Cleanup Argo workflows")
  util.maybe_activate_service_account()

  util.run([
    "gcloud", "container", "clusters", "get-credentials",
    args.testing_cluster,
    "--zone=" + args.testing_zone,
    "--project=" + args.testing_project,
  ])

  # We need to load the kube config so that we can have credentials to
  # talk to the APIServer.
  util.load_kube_config(persist_config=False)

  client = k8s_client.ApiClient()
  crd_api = k8s_client.CustomObjectsApi(client)
  workflows = crd_api.list_namespaced_custom_object(
    argo_client.GROUP, argo_client.VERSION, args.namespace,
    argo_client.PLURAL)

  expired = []
  unexpired = []

  for w in workflows["items"]:
    is_expired = False

    start_time = date_parser.parse(w["status"]["startedAt"])
    now = datetime.datetime.now(start_time.tzinfo)

    name = w["metadata"]["name"]
    age = now - start_time
    if age > datetime.timedelta(hours=args.max_wf_age_hours):
      logging.info("Deleting workflow: %s", name)
      is_expired = True
      if not args.dryrun:
        try:
          crd_api.delete_namespaced_custom_object(
            argo_client.GROUP, argo_client.VERSION, args.namespace,
            argo_client.PLURAL, name, k8s_client.V1DeleteOptions())
        except Exception as e:  # pylint: disable=broad-except
          logging.error("There was a problem deleting workflow %s.%s; "
                        "error: %s", args.namespace, name, e)

    if is_expired:
      expired.append(name)
    else:
      unexpired.append(name)

  logging.info("Unexpired workflows:\n%s", "\n".join(unexpired))
  logging.info("Expired workflows:\n%s", "\n".join(expired))
  logging.info("Finished cleanup of Argo workflows")
def handle_retriable_exception(exception):
  if (isinstance(exception, rest.ApiException) and
      (exception.status == 401 or exception.status == 403)):
    # Due to https://github.com/kubernetes-client/python-base/issues/59,
    # we need to reload the kube config (which refreshes the GCP token).
    # TODO(richardsliu): Remove this workaround when the k8s client issue
    # is resolved.
    util.load_kube_config()
    return True
  return not isinstance(exception, util.TimeoutError)
def test_katib(test_case):  # pylint: disable=redefined-outer-name
  args = parse_args()
  namespace = NAMESPACE
  name = "katib-studyjob-test"
  util.load_kube_config()
  api_client = k8s_client.ApiClient()
  create_app_and_job(args, namespace, name)
  try:
    wait_for_condition(
      api_client, namespace, name, ["Running"], status_callback=log_status)
    logging.info("StudyJob launched successfully")
  except Exception as e:
    logging.error("Test failed waiting for job; %s", e)
    test_case.add_failure_info(e.message)
def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  util.load_kube_config()

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    num_trials = args.num_trials
    logging.info("tfjob_version=%s", args.tfjob_version)

    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      test_func()

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    test_case.failure = "Timeout waiting for job to finish: " + spec
    logging.exception(test_case.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as
    # failed.
    test_case.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, e.message))
  finally:
    test_case.time = time.time() - start

    if args.artifacts_path:
      test_util.create_junit_xml_file(
        [test_case],
        args.artifacts_path + "/junit_" + test_func.__name__ + ".xml")
def _iter_blueprints(namespace, context=None):
  """Return an iterator over blueprints.

  Args:
    namespace: The namespace to look for blueprints.
    context: The kube context to use.
  """
  # We need to load the kube config so that we can have credentials to
  # talk to the APIServer.
  util.load_kube_config(persist_config=False, context=context)

  client = k8s_client.ApiClient()
  crd_api = cnrm_clients.CnrmClientApi(client, "containercluster")

  clusters = crd_api.list_namespaced(namespace)

  for c in clusters.get("items"):
    yield c
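# A minimal usage sketch (illustrative, not from the original source): iterate
# the ContainerCluster blueprints in a namespace and log their names. The
# "auto-deploy" namespace and "kf-ci-v1" context values are assumptions.
for blueprint in _iter_blueprints("auto-deploy", context="kf-ci-v1"):
  logging.info("Found blueprint %s created at %s",
               blueprint["metadata"]["name"],
               blueprint["metadata"].get("creationTimestamp"))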
def test_tf_job_simple(test_case):  # pylint: disable=redefined-outer-name
  args = parse_args()
  namespace = "default"
  name = "tf-job-simple"
  util.load_kube_config()
  api_client = k8s_client.ApiClient()
  create_app_and_job(args, namespace, name)
  try:
    tf_job_client.wait_for_condition(
      api_client, namespace, name, ["Running"],
      status_callback=tf_job_client.log_status)
    logging.info("TFJob launched successfully")
  except Exception as e:
    logging.error("Test failed waiting for job; %s", e)
    test_case.add_failure_info(e.message)
def test_kfam(record_xml_attribute):
  util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")

  util.load_kube_config()
  util.load_kube_credentials()

  getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
  jupyterpod = util.run(getcmd.split(' '))[1:-1]

  logging.info("accessing kfam svc from jupyter pod %s" % jupyterpod)

  sleep(10)

  # Profile Creation
  profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
  util.run([
    'kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--',
    'curl', '--silent', '-X', 'POST', '-d',
    '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}' % profile_name,
    'profiles-kfam.kubeflow:8081/kfam/v1/profiles'
  ])
  assert verify_profile_creation(jupyterpod, profile_name)
def create_k8s_client(args):
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])

    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  return api_client
def test_jupyter(record_xml_attribute, kfctl_repo_path, namespace):
  """Test the jupyter notebook.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    kfctl_repo_path: Path to the kfctl repo.
    namespace: namespace to run in.
  """
  util.load_kube_config()
  util.load_kube_credentials()

  logging.info("using kfctl repo: %s" % kfctl_repo_path)
  util.run([
    "kubectl", "apply", "-f",
    os.path.join(kfctl_repo_path,
                 "py/kubeflow/kfctl/testing/pytests/testdata/jupyter_test.yaml")
  ])
  api_client = k8s_client.ApiClient()
  api = k8s_client.CoreV1Api(api_client)
  resp = api.list_namespaced_service(namespace)
  names = [service.metadata.name for service in resp.items]
  if not "jupyter-test" in names:
    raise ValueError("not able to find jupyter-test service.")
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that Jupyter is actually deployed.
  jupyter_name = "jupyter"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)

  # Verify that core components are actually deployed.
  deployment_names = [
    "tf-job-operator-v1beta1", "pytorch-operator", "studyjob-controller"
  ]
  for deployment_name in deployment_names:
    logging.info("Verifying that %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name)
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator-v1beta1"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that Jupyter is actually deployed.
  jupyter_name = "jupyter"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)

  # Verify that PyTorch Operator actually deployed
  pytorch_operator_deployment_name = "pytorch-operator"
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace,
                           pytorch_operator_deployment_name)
def check_deployments_ready(record_xml_attribute, namespace, name, deployments):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  for deployment_name in deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)
def cleanup_workflows(args):
  # We need to load the kube config so that we can have credentials to
  # talk to the APIServer.
  util.load_kube_config(persist_config=False)

  client = k8s_client.ApiClient()
  crd_api = k8s_client.CustomObjectsApi(client)
  workflows = crd_api.list_namespaced_custom_object(
    argo_client.GROUP, argo_client.VERSION, args.namespace,
    argo_client.PLURAL)

  expired = []
  unexpired = []

  for w in workflows["items"]:
    is_expired = False

    start_time = date_parser.parse(w["status"]["startedAt"])
    now = datetime.datetime.now(start_time.tzinfo)

    name = w["metadata"]["name"]
    age = now - start_time
    if age > datetime.timedelta(hours=args.max_age_hours):
      logging.info("Deleting workflow: %s", name)
      is_expired = True
      if not args.dryrun:
        crd_api.delete_namespaced_custom_object(
          argo_client.GROUP, argo_client.VERSION, args.namespace,
          argo_client.PLURAL, name, k8s_client.V1DeleteOptions())
        break

    if is_expired:
      expired.append(name)
    else:
      unexpired.append(name)

  logging.info("Unexpired workflows:\n%s", "\n".join(unexpired))
  logging.info("expired workflows:\n%s", "\n".join(expired))
def wait_for_jobs(self, namespace, label_filter):
  """Wait for all the jobs with the specified label to finish.

  Args:
    namespace: The namespace containing the jobs.
    label_filter: A label filter expression e.g. "group=mygroup"
  """
  if not util.is_in_cluster():
    util.load_kube_config(persist_config=False)
  else:
    config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  jobs = util.wait_for_jobs_with_label(api_client, namespace, label_filter)

  done = 0
  succeeded = 0
  for job in jobs.items:
    project = job.metadata.labels.get("project", "")
    if not job.status.conditions:
      logging.info("Project %s Job %s.%s missing condition", project,
                   job.metadata.namespace, job.metadata.name)
      continue

    last_condition = job.status.conditions[-1]
    if last_condition.type in ["Failed", "Complete"]:
      logging.info("Project %s Job %s.%s has condition %s", project,
                   job.metadata.namespace, job.metadata.name,
                   last_condition.type)
      done += 1
      if last_condition.type in ["Complete"]:
        succeeded += 1

  logging.info("%s of %s jobs finished", done, len(jobs.items))
  logging.info("%s of %s jobs finished successfully", succeeded,
               len(jobs.items))
def run(self, tekton_cluster_info, current_cluster_info):
  """Kicks off all the Tekton pipelines.

  Args:
    tekton_cluster_info: ClusterInfo having the info to run pipelines on.
      Tekton runs on a different cluster right now.
    current_cluster_info: Current cluster info.

  Returns:
    A dictionary mapping workflow names to their UI URLs.
  """
  urls = dict()
  try:
    # Currently only tekton tests run in kf-ci-v1.
    util.configure_kubectl(tekton_cluster_info.project,
                           tekton_cluster_info.zone,
                           tekton_cluster_info.cluster_name)
    # util.configure_kubectl(project, "us-east1-d", "kf-ci-v1")
    util.load_kube_config()

    for w in self.workflows:
      w.run()
      urls[w.name] = w.ui_url
      if w.teardown_runner:
        urls[w.teardown_runner.name] = w.teardown_runner.ui_url
      logging.info("URL for workflow: %s", w.ui_url)
  except Exception as e:  # pylint: disable=broad-except
    logging.error("Error when starting Tekton workflow: %s;\nstacktrace:\n%s",
                  e, traceback.format_exc())
  finally:
    # Restore kubectl to the current cluster.
    util.configure_kubectl(current_cluster_info.project,
                           current_cluster_info.zone,
                           current_cluster_info.cluster_name)
    util.load_kube_config()
  return urls
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    diff_command = ["git", "diff", "--name-only", "master"]
  elif job_type == "postsubmit":
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^",
                    pull_base_sha]

  changed_files = []
  if job_type == "presubmit" or job_type == "postsubmit":
    changed_files = util.run(
      diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow.
    # We truncate sha numbers to prevent the workflow name from being too
    # large. Workflow name should not be more than 63 characters because it
    # is used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = get_ksonnet_cmd(w)

    # Print the ksonnet version.
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and not job_type in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any
    # modified files match the specified regex patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is "
                         "modified.", w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run.
    env = workflow_name

    util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "name",
              workflow_name], cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
              ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
              get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
              args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it
    # easier to verify in the unittest.
    param_names = w.params.keys()
    param_names.sort()
    for k in param_names:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest.
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(
      get_namespace(args), workflow_names,
      timeout=datetime.timedelta(minutes=180),
      status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args),
                   name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
  except Exception as e:  # pylint: disable=broad-except
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                               workflow_phase, ui_urls)

  # Upload logs to GCS. No logs after this point will appear in the
  # file in GCS.
  file_handler.flush()
  util.upload_file_to_gcs(
    file_handler.baseFilename,
    os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployment_names = [
    "argo-ui",
    "centraldashboard",
    "cloud-endpoints-controller",
    "jupyter-web-app-deployment",
    "metadata-db",
    "metadata-deployment",
    "metadata-ui",
    "ml-pipeline",
    "ml-pipeline-scheduledworkflow",
    "ml-pipeline-ui",
    "notebook-controller-deployment",
    "tf-job-operator",
    "pytorch-operator",
    "katib-controller",
    "workflow-controller",
  ]

  stateful_set_names = [
    "kfserving-controller-manager",
  ]

  ingress_related_deployments = []
  ingress_related_stateful_sets = []

  if use_basic_auth:
    deployment_names.extend(["basic-auth-login"])
    ingress_related_stateful_sets.extend(["backend-updater"])
  else:
    ingress_related_deployments.extend(["iap-enabler"])
    ingress_related_stateful_sets.extend(["backend-updater"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except:
      # Collect debug information by running describe.
      util.run([
        "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  knative_namespace = "knative-serving"
  knative_related_deployments = [
    "activator",
    "autoscaler",
    "controller",
  ]
  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])

    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages.
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core",
              "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e.message)

  junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
  logging.info("Writing test results to %s", junit_path)
  test_util.create_junit_xml_file([main_case, teardown], junit_path)
def main():
  parser = argparse.ArgumentParser('Label an image using Inception')
  parser.add_argument(
    '-p', '--port', type=int, default=9000,
    help='Port at which Inception model is being served')
  parser.add_argument(
    "--namespace", required=True, type=str, help=("The namespace to use."))
  parser.add_argument(
    "--service_name", required=True, type=str,
    help=("The TF serving service to use."))
  parser.add_argument(
    "--artifacts_dir", default="", type=str,
    help="Directory to use for artifacts that should be preserved after "
         "the test runs. Defaults to test_dir if not set.")
  parser.add_argument(
    "--input_path", required=True, type=str, help=("The input file to use."))
  parser.add_argument(
    "--result_path", type=str, help=("The expected result."))
  parser.add_argument(
    "--workflow_name", default="tfserving", type=str,
    help="The name of the workflow.")

  args = parser.parse_args()

  t = test_util.TestCase()
  t.class_name = "Kubeflow"
  t.name = args.workflow_name + "-" + args.service_name

  start = time.time()

  util.load_kube_config(persist_config=False)
  api_client = k8s_client.ApiClient()
  core_api = k8s_client.CoreV1Api(api_client)

  try:
    with open(args.input_path) as f:
      instances = json.loads(f.read())

    service = core_api.read_namespaced_service(args.service_name,
                                               args.namespace)
    service_ip = service.spec.cluster_ip
    model_urls = [
      "http://" + service_ip +
      ":8500/v1/models/mnist:predict",  # tf serving's http server
    ]
    for model_url in model_urls:
      logging.info("Try predicting with endpoint {}".format(model_url))
      num_try = 1
      result = None
      while True:
        try:
          result = requests.post(model_url, json=instances)
          assert (result.status_code == 200)
        except Exception as e:
          num_try += 1
          if num_try > 10:
            raise
          logging.info('prediction failed: {}. Retrying...'.format(e))
          time.sleep(5)
        else:
          break
      logging.info('Got result: {}'.format(result.text))
      if args.result_path:
        with open(args.result_path) as f:
          expected_result = json.loads(f.read())
        logging.info('Expected result: {}'.format(expected_result))
        assert (almost_equal(expected_result, json.loads(result.text)))
  except Exception as e:
    t.failure = "Test failed; " + e.message
    raise
  finally:
    t.time = time.time() - start
    junit_path = os.path.join(
      args.artifacts_dir,
      "junit_kubeflow-tf-serving-image-{}.xml".format(args.service_name))
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([t], junit_path)
    # Pause to collect Stackdriver logs.
    time.sleep(60)
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  # TODO(yanniszark): This list is incomplete and missing a lot of components.
  deployment_names = [
    "argo-ui",
    "centraldashboard",
    "jupyter-web-app-deployment",
    "minio",
    "ml-pipeline",
    "ml-pipeline-persistenceagent",
    "ml-pipeline-scheduledworkflow",
    "ml-pipeline-ui",
    "ml-pipeline-viewer-controller-deployment",
    "mysql",
    "notebook-controller-deployment",
    "profiles-deployment",
    "pytorch-operator",
    "tf-job-operator",
    "workflow-controller",
  ]

  stateful_set_names = []

  with open(os.path.join(app_path, "app.yaml")) as f:
    kfdef = yaml.safe_load(f)
  platform = kfdef["spec"]["platform"]

  ingress_related_deployments = [
    "istio-citadel",
    "istio-egressgateway",
    "istio-galley",
    "istio-ingressgateway",
    "istio-pilot",
    "istio-policy",
    "istio-sidecar-injector",
    "istio-telemetry",
    "istio-tracing",
    "kiali",
    "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
    "activator",
    "autoscaler",
    "controller",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except:
      # Collect debug information by running describe.
      util.run([
        "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def run_papermill_job(notebook_path, name, namespace,  # pylint: disable=too-many-branches,too-many-statements
                      repos, image):
  """Generate a K8s job to run a notebook using papermill.

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries to infer based on
      PROW environment variables
    image: The docker image to run the notebook in.
  """

  util.maybe_activate_service_account()

  with open("job.yaml") as hf:
    job = yaml.load(hf)

  if notebook_path.startswith("/"):
    raise ValueError(
      "notebook_path={0} should not start with /".format(notebook_path))

  # We need to checkout the correct version of the code
  # in presubmits and postsubmits. We should check the environment variables
  # for the prow environment variables to get the appropriate values.
  # We should probably also only do that if the
  # See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
    repos = argo_build_util.get_repo_from_prow_env()

  if not repos:
    raise ValueError("Could not get repos from prow environment variable "
                     "and --repos isn't explicitly set")

  repos += ",kubeflow/testing@HEAD"

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
    "/usr/local/bin/checkout_repos.sh",
    "--repos=" + repos,
    "--src_dir=/src",
    "--depth=all",
  ]
  job["spec"]["template"]["spec"]["containers"][0]["image"] = image

  full_notebook_path = os.path.join("/src", notebook_path)
  job["spec"]["template"]["spec"]["containers"][0]["command"] = [
    "python3", "-m",
    "kubeflow.examples.notebook_tests.execute_notebook",
    "--notebook_path", full_notebook_path
  ]

  job["spec"]["template"]["spec"]["containers"][0][
    "workingDir"] = os.path.dirname(full_notebook_path)

  # The prow bucket to use for results/artifacts
  prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

  if os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
    # Running under prow
    prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
    logging.info("Prow artifacts dir: %s", prow_dir)
    prow_dir = os.path.join(prow_dir, "artifacts")

    if os.getenv("TEST_TARGET_NAME"):
      prow_dir = os.path.join(prow_dir,
                              os.getenv("TEST_TARGET_NAME").lstrip("/"))
    prow_bucket, prow_path = util.split_gcs_uri(prow_dir)
  else:
    prow_path = "notebook-test" + datetime.datetime.now().strftime("%H%M%S")
    prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
    prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

  prow_path = os.path.join(prow_path, name + ".html")
  output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

  job["spec"]["template"]["spec"]["containers"][0]["env"] = [
    {"name": "OUTPUT_GCS", "value": output_gcs},
    {"name": "PYTHONPATH",
     "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"},
  ]

  logging.info("Notebook will be written to %s", output_gcs)
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("notebook-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  final_job = util.wait_for_job(api_client, namespace, name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  # Download notebook html to artifacts
  logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

  storage_client = storage.Client()
  bucket = storage_client.get_bucket(NB_BUCKET)
  blob = bucket.get_blob(prow_path)

  destination_bucket = storage_client.get_bucket(prow_bucket)
  bucket.copy_blob(blob, destination_bucket)

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1}; did not complete".format(
      namespace, name))

  last_condition = final_job.status.conditions[-1]

  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials and KUBECONFIG more consistent
  # between GKE and minikube and eventually this could be extended to other
  # K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = _setup_ks_app(args)
  t.name = os.path.basename(name)

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.

    # TODO(jlewi): We should make this an argument.
    num_trials = 2

    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespaces %s", name, namespace)
      logging.info("tfjob_version=%s", args.tfjob_version)
      # Wait for the job to either be in Running state or a terminal state.
      if args.tfjob_version == "v1alpha1":
        logging.info("Wait for Phase Running, Done, or Failed")
        results = tf_job_client.wait_for_phase(
          api_client, namespace, name, ["Running", "Done", "Failed"],
          status_callback=tf_job_client.log_status)
      else:
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
          api_client, namespace, name, ["Running", "Succeeded", "Failed"],
          status_callback=tf_job_client.log_status)

      logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

      # The job is now either running or done.
      if args.shutdown_policy:
        logging.info("Enforcing shutdownPolicy %s", args.shutdown_policy)
        if args.shutdown_policy in ["master", "chief"]:
          if args.tfjob_version == "v1alpha1":
            replica = "master"
          else:
            replica = "chief"
        elif args.shutdown_policy in ["worker", "all_workers"]:
          replica = "worker"
        else:
          raise ValueError("Unrecognized shutdown_policy "
                           "%s" % args.shutdown_policy)

        # Number of targets.
        num_targets = 1
        if args.shutdown_policy in ["all_workers"]:
          # Assume v1alpha2
          num_targets = results.get("spec", {}).get(
            "tfReplicaSpecs", {}).get("Worker", {}).get("replicas", 0)
          logging.info("There are %s worker replicas", num_targets)

        if args.tfjob_version == "v1alpha1":
          runtime_id = results.get("spec", {}).get("RuntimeId")
          target = "{name}-{replica}-{runtime}".format(
            name=name, replica=replica, runtime=runtime_id)
          pod_labels = get_labels(name, runtime_id)
          pod_selector = to_selector(pod_labels)
        else:
          target = "{name}-{replica}".format(name=name, replica=replica)
          pod_labels = get_labels_v1alpha2(namespace, name)
          pod_selector = to_selector(pod_labels)

        # Wait for the pods to be ready before we shutdown.
        # TODO(jlewi): We get pods using a label selector so there is
        # a risk that the pod we actually care about isn't present.
        logging.info("Waiting for pods to be running before shutting down.")
        wait_for_pods_to_be_in_phases(
          api_client, namespace, pod_selector, ["Running"],
          timeout=datetime.timedelta(minutes=4))
        logging.info("Pods are ready")
        logging.info("Issuing the terminate request")
        for num in range(num_targets):
          full_target = target + "-{0}".format(num)
          terminateReplica(masterHost, namespace, full_target)

      logging.info("Waiting for job to finish.")
      results = tf_job_client.wait_for_job(
        api_client, namespace, name, args.tfjob_version,
        status_callback=tf_job_client.log_status)

      if args.tfjob_version == "v1alpha1":
        if results.get("status", {}).get("state", {}).lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
            trial, name, namespace,
            results.get("status", {}).get("state", None))
          logging.error(t.failure)
          break
      else:
        # For v1alpha2 check for non-empty completionTime
        last_condition = results.get("status", {}).get("conditions", [])[-1]
        if last_condition.get("type", "").lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
            trial, name, namespace, results.get("status", {}))
          logging.error(t.failure)
          break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      for e in events:
        logging.info("K8s event: %s", e.message)

      # Print out the K8s events because it can be useful for debugging.
      for e in events:
        logging.info("Received K8s Event:\n%s", e)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      if args.tfjob_version == "v1alpha1":
        for replica in results.get("spec", {}).get("replicaSpecs", []):
          num_expected += replica.get("replicas", 0)
      else:
        for replicakey in results.get("spec", {}).get("tfReplicaSpecs", {}):
          replica_spec = results.get("spec", {}).get("tfReplicaSpecs",
                                                     {}).get(replicakey, {})
          if replica_spec:
            num_expected += replica_spec.get("replicas", 1)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        # TODO(jlewi): Starting with
        # https://github.com/kubeflow/tf-operator/pull/646 the number of events
        # no longer seems to match the expected; it looks like maybe events
        # are being combined? For now we just log a warning rather than an
        # error.
        logging.warning(creation_failures)

      if args.tfjob_version == "v1alpha1":
        pod_labels = get_labels(name, runtime_id)
        pod_selector = to_selector(pod_labels)
      else:
        pod_labels = get_labels_v1alpha2(name)
        pod_selector = to_selector(pod_labels)

      # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
      # means completed pods won't be deleted.
      # TODO(jlewi): We should add a test to deal with deleted pods.
      if args.tfjob_version == "v1alpha1":
        wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(api_client, namespace, name,
                                  version=args.tfjob_version)

      logging.info("Waiting for job %s in namespaces %s to be deleted.", name,
                   namespace)
      wait_for_delete(api_client, namespace, name, args.tfjob_version,
                      status_callback=tf_job_client.log_status)

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
    logging.exception(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as
    # failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)