def test(args):
  """Run the tests."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run(["helm", "test", "tf-job"])
  except subprocess.CalledProcessError as e:
    t.failure = "helm test failed;\n" + (e.output or "")
    # Reraise the exception so that the prow job will fail and the test
    # is marked as a failure.
    # TODO(jlewi): It would be better to do this holistically; e.g. by
    # processing all the junit xml files and checking for any failures. This
    # should be more tractable when we migrate off Airflow to Argo.
    raise
  finally:
    t.time = time.time() - start
    t.name = "e2e-test"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def teardown_cluster(dag_run=None, ti=None, **_kwargs):
  conf = dag_run.conf
  if not conf:
    conf = {}

  dryrun = bool(conf.get("dryrun", False))
  cluster = ti.xcom_pull("setup_cluster", key="cluster")

  gcs_path = run_path(dag_run.dag_id, dag_run.run_id)
  artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
  logging.info("artifacts_path %s", artifacts_path)
  junit_path = os.path.join(artifacts_path, "junit_teardown.xml")
  logging.info("junit_path %s", junit_path)

  ti.xcom_push(key="cluster", value=cluster)

  args = ["python", "-m", "py.deploy", "teardown"]
  args.append("--cluster=" + cluster)
  args.append("--junit_path=" + junit_path)
  args.append("--project=" + GCB_PROJECT)
  # We want subprocess output to bypass the logging module; otherwise multiline
  # output is squashed together.
  util.run(args, use_print=True, dryrun=dryrun)
def build_container(use_gcb, src_dir, test_dir, project):
  """Build the CRD container.

  Args:
    use_gcb: Boolean indicating whether to build the image with GCB or Docker.
    src_dir: The directory containing the source.
    test_dir: Scratch directory for runner.py.
    project: Project to use.

  Returns:
    image: The URI of the newly built image.
  """
  # Build and push the image.
  # We use Google Container Builder because Prow currently doesn't allow using
  # docker build.
  if use_gcb:
    gcb_arg = "--gcb"
  else:
    gcb_arg = "--no-gcb"

  build_info_file = os.path.join(test_dir, "build_info.yaml")
  util.run([
    "./images/tf_operator/build_and_push.py", gcb_arg, "--project=" + project,
    "--registry=gcr.io/mlkube-testing", "--output=" + build_info_file
  ], cwd=src_dir)

  with open(build_info_file) as hf:
    build_info = yaml.load(hf)

  return build_info["image"]
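# For reference, a minimal sketch of what build_and_push.py is assumed to
# write to build_info.yaml. Only the "image" key is read above; the exact
# schema and the tag format are assumptions based on how the value is built
# and consumed elsewhere in this code:
#
#   image: gcr.io/mlkube-testing/tf_operator:v20180101-abcd1234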
def run_py_checks(command, dag_run=None, ti=None, **_kwargs):
  """Run some of the python checks.

  Args:
    command: The py_checks subcommand to run.
  """
  conf = dag_run.conf
  if not conf:
    conf = {}

  dryrun = bool(conf.get("dryrun", False))

  src_dir = ti.xcom_pull(None, key="src_dir")
  logging.info("src_dir %s", src_dir)

  gcs_path = run_path(dag_run.dag_id, dag_run.run_id)
  artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
  logging.info("artifacts_path %s", artifacts_path)
  junit_path = os.path.join(artifacts_path,
                            "junit_pychecks{0}.xml".format(command))
  logging.info("junit_path %s", junit_path)

  args = ["python", "-m", "py.py_checks", command]
  args.append("--src_dir=" + src_dir)
  args.append("--junit_path=" + junit_path)
  args.append("--project=" + GCB_PROJECT)
  # We want subprocess output to bypass the logging module; otherwise multiline
  # output is squashed together.
  util.run(args, use_print=True, dryrun=dryrun)
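# A minimal sketch of how run_py_checks could be bound to an Airflow task via
# functools.partial, assuming Airflow 1.x; the task id, the dag variable, and
# the "lint" command value are illustrative assumptions, not taken from the
# source.
from functools import partial

from airflow.operators.python_operator import PythonOperator

py_lint = PythonOperator(
  task_id="py_lint",
  python_callable=partial(run_py_checks, "lint"),
  provide_context=True,  # passes dag_run and ti as keyword arguments
  dag=dag)  # dag is assumed to be defined elsewhere in the module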
def run(ti, *extra_args, **kwargs):
  # Set the PYTHONPATH.
  env = kwargs.get("env", os.environ)
  env = env.copy()
  python_path = set(env.get("PYTHONPATH", "").split(":"))
  # Ensure BOOTSTRAP_DIR isn't in the PYTHONPATH as this could cause
  # unexpected issues by pulling in the version baked into the container.
  if BOOTSTRAP_DIR in python_path:
    logging.info("Removing %s from PYTHONPATH", BOOTSTRAP_DIR)
    python_path.remove(BOOTSTRAP_DIR)
  src_dir = ti.xcom_pull(None, key="src_dir")
  if not src_dir:
    src_dir = BOOTSTRAP_DIR
  python_path.add(src_dir)
  env["PYTHONPATH"] = ":".join(python_path)

  # We need to delay the import of util because for all steps (except the
  # clone step) we want to use the version checked out from GitHub.
  # But airflow needs to be able to import the module e2e_tests_dag.py.
  from py import util
  kwargs["env"] = env
  # Printing out the file location of util should help us debug issues
  # with the path.
  logging.info("Using util located at %s", util.__file__)
  util.run(*extra_args, **kwargs)
def run_gpu_test(dag_run=None, ti=None, **_kwargs):
  conf = dag_run.conf
  if not conf:
    conf = {}
  cluster = ti.xcom_pull(None, key="cluster")
  src_dir = ti.xcom_pull(None, key="src_dir")

  logging.info("conf=%s", conf)
  artifacts_path = conf.get("ARTIFACTS_PATH",
                            run_path(dag_run.dag_id, dag_run.run_id))
  logging.info("artifacts_path %s", artifacts_path)
  # I think we can only have one underscore in the name for gubernator to
  # work.
  junit_path = os.path.join(artifacts_path, "junit_gpu-tests.xml")
  logging.info("junit_path %s", junit_path)

  ti.xcom_push(key="cluster", value=cluster)

  spec = os.path.join(src_dir, "examples/tf_job_gpu.yaml")
  args = ["python", "-m", "py.test_runner", "test"]
  args.append("--spec=" + spec)
  args.append("--zone=" + ZONE)
  args.append("--cluster=" + cluster)
  args.append("--junit_path=" + junit_path)
  args.append("--project=" + GCB_PROJECT)
  # tf_job_gpu.yaml has the image tag hardcoded so the tag doesn't matter.
  # TODO(jlewi): The example should be a template and we should rebuild
  # and use the newly built sample container.
  args.append("--image_tag=notag")
  # We want subprocess output to bypass the logging module; otherwise multiline
  # output is squashed together.
  util.run(args, use_print=True)
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  # kubeflow_testing is imported as a submodule so we should exclude it.
  # TODO(jlewi): Perhaps we should get a list of submodules and exclude
  # them automatically?
  dir_excludes = ["kubeflow_testing", "vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  # TODO(jlewi): Once we switch to using Argo I think we can stop setting
  # the PYTHONPATH here and just inherit it from the environment.
  # When we use Argo each step will run in its own pod and we can set the
  # PYTHONPATH environment variable as needed for that pod.
  env["PYTHONPATH"] = (args.src_dir + ":" +
                       os.path.join(args.src_dir, "kubeflow_testing", "py"))

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("All tests passed.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
def clone_repo(dag_run=None, ti=None, **_kwargs):  # pylint: disable=too-many-statements
  # Create a temporary directory suitable for checking out and building the
  # code.
  if not dag_run:
    # When running via airflow test dag_run isn't set.
    logging.warn("Using fake dag_run")
    dag_run = FakeDagrun()

  logging.info("dag_id: %s", dag_run.dag_id)
  logging.info("run_id: %s", dag_run.run_id)

  conf = dag_run.conf
  if not conf:
    conf = {}
  logging.info("conf=%s", conf)

  # Pick the top-level directory to use for this run of the pipeline.
  # This should be a persistent location that is accessible from subsequent
  # tasks; e.g. an NFS share or PD. The environment variable SRC_DIR is used
  # to allow the directory to be specified as part of the deployment.
  run_dir = os.path.join(os.getenv("SRC_DIR", tempfile.gettempdir()),
                         dag_run.dag_id.replace(":", "_"),
                         dag_run.run_id.replace(":", "_"))
  logging.info("Using run_dir %s", run_dir)
  os.makedirs(run_dir)
  logging.info("xcom push: run_dir=%s", run_dir)
  ti.xcom_push(key="run_dir", value=run_dir)

  # Directory where we will clone the src.
  src_dir = os.path.join(run_dir, "tensorflow_k8s")
  logging.info("xcom push: src_dir=%s", src_dir)
  ti.xcom_push(key="src_dir", value=src_dir)

  # Make sure pull_number is a string.
  pull_number = "{0}".format(conf.get("PULL_NUMBER", ""))
  args = ["python", "-m", "py.release", "clone", "--src_dir=" + src_dir]
  if pull_number:
    commit = conf.get("PULL_PULL_SHA", "")
    args.append("pr")
    args.append("--pr=" + pull_number)
    if commit:
      args.append("--commit=" + commit)
  else:
    commit = conf.get("PULL_BASE_SHA", "")
    args.append("postsubmit")
    if commit:
      args.append("--commit=" + commit)
  util.run(args, use_print=True)
def run():
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name], cwd=args.test_dir, use_print=True)

  app_dir = os.path.join(args.test_dir, app_name)

  # TODO(jlewi): In presubmits we probably want to change this so we can
  # pull the changes on a branch. It's not clear whether that's well supported
  # in ksonnet yet.
  kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages.
  # TODO(jlewi): For presubmits how do we pull the package from the desired
  # branch at the desired commit?
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Deploy Kubeflow.
  util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name], cwd=app_dir)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
      key = json.load(hf)
      apply_command.append("--as=" + key["client_email"])
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  env["PYTHONPATH"] = args.src_dir

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        # Use slicing rather than str.strip to remove the src_dir prefix;
        # strip removes a set of characters, not a prefix string.
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("All tests passed.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
def run_lint(src_dir):
  """Run lint.

  Args:
    src_dir: The directory containing the source.

  Returns:
    success: Boolean indicating success or failure.
  """
  try:
    util.run(["./lint.sh"], cwd=src_dir)
  except subprocess.CalledProcessError as e:
    logging.error("Lint checks failed; %s", e)
    return False
  return True
def run_lint(args):
  start_time = time.time()
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*.py"]
  failed_files = []
  rc_file = os.path.join(args.src_dir, ".pylintrc")
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)
        try:
          util.run(["pylint", "--rcfile=" + rc_file, full_path],
                   cwd=args.src_dir)
        except subprocess.CalledProcessError:
          # Use slicing rather than str.strip to remove the src_dir prefix;
          # strip removes a set of characters, not a prefix string.
          failed_files.append(full_path[len(args.src_dir):])

  if failed_files:
    logging.error("%s files had lint errors.", len(failed_files))
  else:
    logging.info("No lint issues.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  test_case = test_util.TestCase()
  test_case.class_name = "pylint"
  test_case.name = "pylint"
  test_case.time = time.time() - start_time
  if failed_files:
    test_case.failure = "Files with lint issues: {0}".format(
      ", ".join(failed_files))

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
def create_cluster(gke, name, project, zone):
  """Create the cluster.

  Args:
    gke: Client for GKE.
    name: Name for the cluster.
    project: The project to use.
    zone: The zone for the cluster.
  """
  cluster_request = {
    "cluster": {
      "name": name,
      "description": "A GKE cluster for testing GPUs with Cloud ML",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": "n1-standard-8",
      },
    }
  }
  request = gke.projects().zones().clusters().create(body=cluster_request,
                                                     projectId=project,
                                                     zone=zone)

  try:
    logging.info("Creating cluster; project=%s, zone=%s, name=%s", project,
                 zone, name)
    response = request.execute()
    logging.info("Response %s", response)
    create_op = wait_for_operation(gke, project, zone, response["name"])
    logging.info("Cluster creation done.\n %s", create_op)
  except errors.HttpError as e:
    logging.error("Exception occurred creating cluster: %s, status: %s", e,
                  e.resp["status"])
    # Status appears to be a string.
    if e.resp["status"] == '409':
      # TODO(jlewi): What should we do if the cluster already exists?
      pass
    else:
      raise

  logging.info("Configuring kubectl")
  util.run([
    "gcloud", "--project=" + project, "container", "clusters",
    "--zone=" + zone, "get-credentials", name
  ])
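# A minimal sketch of constructing the GKE client that create_cluster expects,
# using the Google API discovery client. The construction is an assumption
# (any object exposing projects().zones().clusters() would work), and the
# name/project/zone values are made up.
from googleapiclient import discovery

gke = discovery.build("container", "v1")
create_cluster(gke, name="e2e-0101-0000-abcd", project="my-project",
               zone="us-east1-d")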
def test(args):
  """Run the tests."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run(["helm", "test", "tf-job"])
  except subprocess.CalledProcessError as e:
    # e.output can be None, so guard against concatenating None.
    t.failure = "helm test failed;\n" + (e.output or "")
  finally:
    t.time = time.time() - start
    t.name = "e2e-test"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def _push_image(image, latest_image):
  if "gcr.io" in image:
    util.run(["gcloud", "docker", "--", "push", image])
    logging.info("Pushed image: %s", image)

    util.run(["gcloud", "docker", "--", "push", latest_image])
    logging.info("Pushed image: %s", latest_image)
  else:
    util.run(["docker", "push", image])
    logging.info("Pushed image: %s", image)

    util.run(["docker", "push", latest_image])
    logging.info("Pushed image: %s", latest_image)
def deploy_and_test(image, test_dir):
  """Deploy and test the CRD.

  Args:
    image: The Docker image for the CRD to use.
    test_dir: The directory where test outputs should be written.

  Returns:
    success: Boolean indicating success or failure.
  """
  target = os.path.join("github.com", GO_REPO_OWNER, GO_REPO_NAME,
                        "test-infra", "helm-test")
  util.run(["go", "install", target])

  binary = os.path.join(os.getenv("GOPATH"), "bin", "helm-test")
  try:
    util.run([binary, "--image=" + image, "--output_dir=" + test_dir])
  except subprocess.CalledProcessError as e:
    logging.error("helm-test failed; %s", e)
    return False
  return True
def setup_cluster(dag_run=None, ti=None, **_kwargs):
  conf = dag_run.conf
  if not conf:
    conf = {}

  dryrun = bool(conf.get("dryrun", False))
  chart = ti.xcom_pull("build_images", key="helm_chart")

  now = datetime.now()
  cluster = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
  logging.info("conf=%s", conf)
  artifacts_path = conf.get("ARTIFACTS_PATH",
                            run_path(dag_run.dag_id, dag_run.run_id))
  logging.info("artifacts_path %s", artifacts_path)
  # Gubernator only recognizes XML files whose name matches
  # junit_[^_]*.xml, which is why it's "setupcluster" and not "setup_cluster".
  junit_path = os.path.join(artifacts_path, "junit_setupcluster.xml")
  logging.info("junit_path %s", junit_path)

  args = ["python", "-m", "py.deploy", "setup"]
  args.append("--cluster=" + cluster)
  args.append("--junit_path=" + junit_path)
  args.append("--project=" + GCB_PROJECT)
  args.append("--chart=" + chart)
  args.append("--zone=" + ZONE)
  args.append("--accelerator=nvidia-tesla-k80=1")
  # We want subprocess output to bypass the logging module; otherwise multiline
  # output is squashed together.
  util.run(args, use_print=True, dryrun=dryrun)

  values = {
    "cluster": cluster,
  }
  for k, v in six.iteritems(values):
    logging.info("xcom push: %s=%s", k, v)
    ti.xcom_push(key=k, value=v)
def copy_artifacts(args):
  """Sync artifacts to GCS."""
  job_name = os.getenv("JOB_NAME")

  # GCS layout is defined here:
  # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
  pull_number = os.getenv("PULL_NUMBER")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")

  if pull_number:
    output = _get_pr_gcs_dir(args.bucket)
  elif repo_owner:
    # It is a postsubmit job.
    output = ("gs://{bucket}/logs/{owner}_{repo}/"
              "{job}/{build}").format(
                bucket=args.bucket,
                owner=repo_owner,
                repo=repo_name,
                job=job_name,
                build=os.getenv("BUILD_NUMBER"))
  else:
    # It's a periodic job.
    output = ("gs://{bucket}/logs/{job}/{build}").format(
      bucket=args.bucket,
      job=job_name,
      build=os.getenv("BUILD_NUMBER"))

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])

  util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])
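# Illustration of the layouts computed above (derived from the code; the
# bucket, job, and build values are made up):
#   presubmit (PULL_NUMBER set):  whatever _get_pr_gcs_dir(args.bucket) returns
#   postsubmit (REPO_OWNER set):  gs://my-bucket/logs/kubeflow_tf-operator/my-job/123
#   periodic (neither set):       gs://my-bucket/logs/my-job/123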
def copy_artifacts(args):
  """Sync artifacts to GCS."""
  job_name = os.getenv("JOB_NAME")

  # GCS layout is defined here:
  # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
  pull_number = os.getenv("PULL_NUMBER")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  output = get_gcs_dir(args.bucket)

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    ])

  util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])
def ks_deploy(app_dir, component, params, env=None, account=None):
  """Deploy the specified ksonnet component.

  Args:
    app_dir: The ksonnet directory.
    component: Name of the component to deploy.
    params: A dictionary of parameters to set; can be empty but should not
      be None.
    env: (Optional) The environment to use; if none is specified a new one
      is created.
    account: (Optional) The account to use.

  Raises:
    ValueError: If input arguments aren't valid.
  """
  if not component:
    raise ValueError("component can't be None.")

  # TODO(jlewi): It might be better if the test creates the app and uses
  # the latest stable release of the ksonnet configs. That however will cause
  # problems when we make changes to the TFJob operator that require changes
  # to the ksonnet configs. One advantage of checking in the app is that
  # we can modify the files in vendor if needed so that changes to the code
  # and config can be submitted in the same pr.
  now = datetime.datetime.now()
  if not env:
    env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  logging.info("Using app directory: %s", app_dir)

  util.run(["ks", "env", "add", env], cwd=app_dir)

  for k, v in params.iteritems():
    util.run(["ks", "param", "set", "--env=" + env, component, k, v],
             cwd=app_dir)

  apply_command = ["ks", "apply", env, "-c", component]
  if account:
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)
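# A hedged usage sketch of ks_deploy; the app directory, component name, and
# parameter values are illustrative assumptions.
ks_deploy(
  "/tmp/kubeflow-test/app",
  "kubeflow-core",
  {"name": "kubeflow-core", "namespace": "default"},
  env=None,      # let ks_deploy generate an e2e-... environment
  account=None)  # no --as impersonation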
def run_test(args):
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run.
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v
    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    logging.info("Created job %s in namespace %s", name, namespace)
    results = tf_job_client.wait_for_job(
      api_client, namespace, name, status_callback=tf_job_client.log_status)

    # Default to "" (not {}) so .lower() doesn't blow up when the state is
    # missing.
    if results.get("status", {}).get("state", "").lower() != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
        name, namespace, results.get("status", {}).get("state", None))

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check tensorboard is created if its part of the job spec.
    #  2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status"; in an effort to try to nail down this exception we print out
    # more information about the exception.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error("There was a problem running the job; Exception "
                  "message: %s", e.message)
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want to mark the test as
    # failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def build_images(dag_run=None, ti=None, **_kwargs):  # pylint: disable=too-many-statements
  """Build the images.

  Args:
    dag_run: A DagRun object. This is passed in as a result of setting
      provide_context to true for the operator.
  """
  # Create a temporary directory suitable for checking out and building the
  # code.
  if not dag_run:
    # When running via airflow test dag_run isn't set.
    logging.warn("Using fake dag_run")
    dag_run = FakeDagrun()

  logging.info("dag_id: %s", dag_run.dag_id)
  logging.info("run_id: %s", dag_run.run_id)

  run_dir = ti.xcom_pull(None, key="run_dir")
  logging.info("Using run_dir=%s", run_dir)

  src_dir = ti.xcom_pull(None, key="src_dir")
  logging.info("Using src_dir=%s", src_dir)

  gcs_path = run_path(dag_run.dag_id, dag_run.run_id)
  logging.info("gcs_path %s", gcs_path)

  conf = dag_run.conf
  if not conf:
    conf = {}
  logging.info("conf=%s", conf)

  artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
  logging.info("artifacts_path %s", artifacts_path)

  # We use a GOPATH that is specific to this run because we don't want
  # interference from different runs.
  newenv = os.environ.copy()
  newenv["GOPATH"] = os.path.join(run_dir, "go")

  # Make sure pull_number is a string.
  pull_number = "{0}".format(conf.get("PULL_NUMBER", ""))

  args = ["python", "-m", "py.release", "build", "--src_dir=" + src_dir]
  dryrun = bool(conf.get("dryrun", False))

  build_info_file = os.path.join(gcs_path, "build_info.yaml")
  args.append("--build_info_path=" + build_info_file)
  args.append("--releases_path=" + gcs_path)
  args.append("--project=" + GCB_PROJECT)
  # We want subprocess output to bypass the logging module; otherwise multiline
  # output is squashed together.
  util.run(args, use_print=True, dryrun=dryrun, env=newenv)

  # Read the output yaml and publish relevant values to xcom.
  if not dryrun:
    gcs_client = storage.Client(project=GCB_PROJECT)
    logging.info("Reading %s", build_info_file)
    bucket_name, build_path = util.split_gcs_uri(build_info_file)
    bucket = gcs_client.get_bucket(bucket_name)
    blob = bucket.blob(build_path)
    contents = blob.download_as_string()
    build_info = yaml.load(contents)
  else:
    build_info = {
      "image": "gcr.io/dryrun/dryrun:latest",
      "commit": "1234abcd",
      "helm_chart": "gs://dryrun/dryrun.latest.",
    }

  for k, v in six.iteritems(build_info):
    logging.info("xcom push: %s=%s", k, v)
    ti.xcom_push(key=k, value=v)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run.
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v
    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespace %s", name, namespace)
      results = tf_job_client.wait_for_job(
        api_client, namespace, name,
        status_callback=tf_job_client.log_status)

      # Default to "" (not {}) so .lower() doesn't blow up when the state is
      # missing.
      if results.get("status", {}).get("state", "").lower() != "succeeded":
        t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
          trial, name, namespace,
          results.get("status", {}).get("state", None))
        logging.error(t.failure)
        break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      for replica in results.get("spec", {}).get("replicaSpecs", []):
        num_expected += replica.get("replicas", 0)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
          trial, name, namespace, ", ".join(creation_failures))
        logging.error(t.failure)
        break

      pod_labels = get_labels(name, runtime_id)
      pod_selector = to_selector(pod_labels)
      wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(api_client, namespace, name)

      logging.info("Waiting for job %s in namespace %s to be deleted.", name,
                   namespace)
      wait_for_delete(
        api_client, namespace, name,
        status_callback=tf_job_client.log_status)

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various
    # resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
      name, namespace)
    logging.error(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status"; in an effort to try to nail down this exception we print out
    # more information about the exception.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error("There was a problem running the job; Exception "
                  "message: %s", e.message)
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want to mark the test as
    # failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def build_operator_image(root_dir,
                         registry,
                         project=None,
                         should_push=True,
                         version_tag=None):
  """Build the main docker image for the TFJob CRD.

  Args:
    root_dir: Root directory of the repository.
    registry: The registry to use.
    project: If set it will be built using GCB.
    should_push: Whether to push the image to the registry. Default is True.
    version_tag: Optional tag for the version. If not specified derive the
      tag from the git hash.

  Returns:
    build_info: Dictionary containing information about the build.
  """
  context_dir = tempfile.mkdtemp(prefix="tmpTFJobCrdContext")
  logging.info("context_dir: %s", context_dir)
  if not os.path.exists(context_dir):
    os.makedirs(context_dir)

  # Build the go binaries.
  go_path = os.environ["GOPATH"]
  commit = build_and_push_image.GetGitHash(root_dir)

  targets = [
    "github.com/kubeflow/tf-operator/cmd/tf-operator",
    "github.com/kubeflow/tf-operator/test/e2e",
    "github.com/kubeflow/tf-operator/dashboard/backend",
  ]
  for t in targets:
    if t == "github.com/kubeflow/tf-operator/cmd/tf-operator":
      util.run([
        "go", "install", "-ldflags",
        "-X github.com/kubeflow/tf-operator/version.GitSHA={}".format(commit),
        t
      ])
      # Skip the plain install below; it would overwrite the binary built
      # with the GitSHA baked in.
      continue
    util.run(["go", "install", t])

  # Dashboard's frontend:
  # Resolve dashboard's front-end dependencies.
  util.run(
    ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "install"])
  # Build dashboard's front-end.
  util.run(
    ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "build"])

  # If the release is not done from a Linux machine
  # we need to grab the artifacts from bin/linux_amd64.
  bin_path = "bin"
  if platform.system() != "Linux":
    bin_path += "/linux_amd64"

  # List of paths to copy relative to root.
  sources = [
    "build/images/tf_operator/Dockerfile",
    "examples/tf_sample/tf_sample/tf_smoke.py",
    os.path.join(go_path, bin_path, "tf-operator"),
    os.path.join(go_path, bin_path, "e2e"),
    os.path.join(go_path, bin_path, "backend"),
    "dashboard/frontend/build",
  ]

  for s in sources:
    src_path = os.path.join(root_dir, s)
    dest_path = os.path.join(context_dir, os.path.basename(s))
    if os.path.exists(dest_path):
      os.unlink(dest_path)
    if os.path.isdir(src_path):
      shutil.copytree(src_path, dest_path)
    else:
      shutil.copyfile(src_path, dest_path)

  image_base = registry + "/tf_operator"

  if not version_tag:
    logging.info("No version tag specified; computing tag automatically.")
    n = datetime.datetime.now()
    version_tag = n.strftime("v%Y%m%d") + "-" + commit
  logging.info("Using version tag: %s", version_tag)
  image = image_base + ":" + version_tag
  latest_image = image_base + ":latest"

  if project:
    util.run([
      "gcloud", "container", "builds", "submit", context_dir,
      "--tag=" + image, "--project=" + project
    ])
    # Add the latest tag.
    util.run([
      "gcloud", "container", "images", "add-tag", "--quiet", image,
      latest_image
    ])
  else:
    util.run(["docker", "build", "-t", image, context_dir])
    logging.info("Built image: %s", image)
    util.run(["docker", "tag", image, latest_image])

  if should_push:
    _push_image(image, latest_image)

  output = {
    "image": image,
    "commit": commit,
  }
  return output
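# A hedged usage sketch of build_operator_image; the registry and project
# values are illustrative assumptions.
build_info = build_operator_image(
  root_dir=".",
  registry="gcr.io/my-project",
  project="my-project",  # build with GCB rather than local docker
  should_push=True,
  version_tag=None)  # derive the tag from the git hash
logging.info("Built image %s at commit %s", build_info["image"],
             build_info["commit"])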
def main():  # pylint: disable=too-many-locals
  logging.getLogger().setLevel(logging.INFO)

  # Create the top-level parser.
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
    "--test_dir",
    default="",
    type=str,
    help="Directory to use for all the test files. If not set a temporary "
    "directory is created.")

  parser.add_argument(
    "--artifacts_dir",
    default="",
    type=str,
    help="Directory to use for artifacts that should be preserved after "
    "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
    "--project", default=None, type=str, help="The project to use.")

  parser.add_argument(
    "--cluster",
    default=None,
    type=str,
    help=("The name of the cluster. If not set assumes the "
          "script is running in a cluster and uses that cluster."))

  parser.add_argument(
    "--zone", default="us-east1-d", type=str,
    help="The zone for the cluster.")

  parser.add_argument(
    "--github_token",
    default=None,
    type=str,
    help=("The GitHub API token to use. This is needed since ksonnet uses "
          "the GitHub API and without it we get rate limited. For more info "
          "see: https://github.com/ksonnet/ksonnet/blob/master/docs"
          "/troubleshooting.md"))

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    now = datetime.datetime.now()
    label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    # Create a temporary directory for this test run.
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()
  test_log = os.path.join(args.artifacts_dir, "logs", "test_deploy.log.txt")
  if not os.path.exists(os.path.dirname(test_log)):
    os.makedirs(os.path.dirname(test_log))

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(
    fmt=("%(levelname)s|%(asctime)s"
         "|%(pathname)s|%(lineno)d| %(message)s"),
    datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run([
      "gcloud", "auth", "activate-service-account",
      "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    ])

  setup(args)
def build_and_push_artifacts(go_dir,
                             src_dir,
                             registry,
                             publish_path=None,
                             gcb_project=None,
                             build_info_path=None):
  """Build and push the artifacts.

  Args:
    go_dir: The GOPATH directory.
    src_dir: The root directory where we checked out the repo.
    registry: Docker registry to use.
    publish_path: (Optional) The GCS path where artifacts should be
      published. Set to None to only build locally.
    gcb_project: The project to use with GCB to build docker images. If set
      to None uses docker to build.
    build_info_path: (Optional) GCS location to write a YAML file containing
      information about the build.
  """
  # Update the GOPATH to the temporary directory.
  env = os.environ.copy()
  if go_dir:
    env["GOPATH"] = go_dir

  bin_dir = os.path.join(src_dir, "bin")
  if not os.path.exists(bin_dir):
    os.makedirs(bin_dir)

  build_info = build_operator_image(src_dir, registry, project=gcb_project)

  # Copy the chart to a temporary directory because we will modify some
  # of its YAML files.
  chart_build_dir = tempfile.mkdtemp(prefix="tmpTFJobChartBuild")
  shutil.copytree(
    os.path.join(src_dir, "tf-job-operator-chart"),
    os.path.join(chart_build_dir, "tf-job-operator-chart"))
  version = build_info["image"].split(":")[-1]
  values_file = os.path.join(chart_build_dir, "tf-job-operator-chart",
                             "values.yaml")
  update_values(values_file, build_info["image"])

  chart_file = os.path.join(chart_build_dir, "tf-job-operator-chart",
                            "Chart.yaml")
  update_chart(chart_file, version)

  # Delete any existing matches because we assume there is only 1 below.
  matches = glob.glob(os.path.join(bin_dir, "tf-job-operator-chart*.tgz"))
  for m in matches:
    logging.info("Delete previous build: %s", m)
    os.unlink(m)

  util.run([
    "helm", "package", "--save=false", "--destination=" + bin_dir,
    "./tf-job-operator-chart"
  ], cwd=chart_build_dir)

  matches = glob.glob(os.path.join(bin_dir, "tf-job-operator-chart*.tgz"))

  if len(matches) != 1:
    raise ValueError(
      "Expected 1 chart archive to match but found {0}".format(matches))

  chart_archive = matches[0]

  release_path = version

  targets = [
    os.path.join(release_path, os.path.basename(chart_archive)),
    "latest/tf-job-operator-chart-latest.tgz",
  ]

  if publish_path:
    gcs_client = storage.Client(project=gcb_project)
    bucket_name, base_path = util.split_gcs_uri(publish_path)
    bucket = gcs_client.get_bucket(bucket_name)
    for t in targets:
      blob = bucket.blob(os.path.join(base_path, t))
      gcs_path = util.to_gcs_uri(bucket_name, blob.name)
      if not t.startswith("latest"):
        build_info["helm_chart"] = gcs_path
      if blob.exists() and not t.startswith("latest"):
        logging.warn("%s already exists", gcs_path)
        continue
      logging.info("Uploading %s to %s.", chart_archive, gcs_path)
      blob.upload_from_filename(chart_archive)

    create_latest(bucket, build_info["commit"],
                  util.to_gcs_uri(bucket_name, targets[0]))

  # Always write to the bin dir.
  paths = [os.path.join(bin_dir, "build_info.yaml")]

  if build_info_path:
    paths.append(build_info_path)

  write_build_info(build_info, paths, project=gcb_project)
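# A hedged usage sketch of build_and_push_artifacts; all paths, registry, and
# bucket names are illustrative assumptions.
build_and_push_artifacts(
  go_dir="/tmp/go",
  src_dir="/tmp/src/tf-operator",
  registry="gcr.io/my-project",
  publish_path="gs://my-bucket/releases",
  gcb_project="my-project",
  build_info_path="gs://my-bucket/releases/build_info.yaml")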
def build_operator_image(root_dir, registry, project=None, should_push=True):
  """Build the main docker image for the TFJob CRD.

  Args:
    root_dir: Root directory of the repository.
    registry: The registry to use.
    project: If set it will be built using GCB.
    should_push: Whether to push the image to the registry. Default is True.

  Returns:
    build_info: Dictionary containing information about the build.
  """
  context_dir = tempfile.mkdtemp(prefix="tmpTFJobCrdContext")
  logging.info("context_dir: %s", context_dir)
  if not os.path.exists(context_dir):
    os.makedirs(context_dir)

  # Build the go binaries.
  go_path = os.environ["GOPATH"]
  commit = build_and_push_image.GetGitHash(root_dir)
  targets = [
    "github.com/tensorflow/k8s/cmd/tf_operator",
    "github.com/tensorflow/k8s/test/e2e",
    "github.com/tensorflow/k8s/dashboard/backend",
  ]
  for t in targets:
    if t == "github.com/tensorflow/k8s/cmd/tf_operator":
      util.run([
        "go", "install", "-ldflags",
        "-X github.com/tensorflow/k8s/version.GitSHA={}".format(commit), t
      ])
      # Skip the plain install below; it would overwrite the binary built
      # with the GitSHA baked in.
      continue
    util.run(["go", "install", t])

  # Dashboard's frontend:
  # Resolve dashboard's front-end dependencies.
  util.run(
    ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "install"])
  # Build dashboard's front-end.
  util.run(
    ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "build"])

  # List of paths to copy relative to root.
  sources = [
    "build/images/tf_operator/Dockerfile",
    os.path.join(go_path, "bin/tf_operator"),
    os.path.join(go_path, "bin/e2e"),
    os.path.join(go_path, "bin/backend"),
    "dashboard/frontend/build",
  ]

  for s in sources:
    src_path = os.path.join(root_dir, s)
    dest_path = os.path.join(context_dir, os.path.basename(s))
    if os.path.exists(dest_path):
      os.unlink(dest_path)
    if os.path.isdir(src_path):
      shutil.copytree(src_path, dest_path)
    else:
      shutil.copyfile(src_path, dest_path)

  image_base = registry + "/tf_operator"
  n = datetime.datetime.now()
  image = image_base + ":" + n.strftime("v%Y%m%d") + "-" + commit
  latest_image = image_base + ":latest"

  if project:
    util.run([
      "gcloud", "container", "builds", "submit", context_dir,
      "--tag=" + image, "--project=" + project
    ])
    # Add the latest tag.
    util.run([
      "gcloud", "container", "images", "add-tag", "--quiet", image,
      latest_image
    ])
  else:
    util.run(["docker", "build", "-t", image, context_dir])
    logging.info("Built image: %s", image)
    util.run(["docker", "tag", image, latest_image])

  if should_push:
    util.run(["gcloud", "docker", "--", "push", image])
    logging.info("Pushed image: %s", image)

    util.run(["gcloud", "docker", "--", "push", latest_image])
    logging.info("Pushed image: %s", latest_image)

  output = {
    "image": image,
    "commit": commit,
  }
  return output
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s", cluster_name,
                 project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages.
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace it with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")
    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    source = os.path.join(args.test_dir, "src", "kubeflow")
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow.
    util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand, even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace.
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e.message)

  junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
  logging.info("Writing test results to %s", junit_path)
  test_util.create_junit_xml_file([main_case, teardown], junit_path)
def run_lint(args):
  start_time = time.time()
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  # kubeflow_testing is imported as a submodule so we should exclude it.
  # TODO(jlewi): Perhaps we should get a list of submodules and exclude
  # them automatically?
  dir_excludes = [
    "dashboard/frontend/node_modules",
    "kubeflow_testing",
    "test/test-app",
    "vendor",
  ]
  full_dir_excludes = [
    os.path.join(os.path.abspath(args.src_dir), f) for f in dir_excludes
  ]
  includes = ["*.py"]
  failed_files = []
  rc_file = os.path.join(args.src_dir, ".pylintrc")
  for root, dirs, files in os.walk(os.path.abspath(args.src_dir),
                                   topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    exclude = False
    for e in full_dir_excludes:
      if root.startswith(e):
        exclude = True
        break
    if exclude:
      continue

    dirs[:] = [d for d in dirs]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)
        try:
          util.run(["pylint", "--rcfile=" + rc_file, full_path],
                   cwd=args.src_dir)
        except subprocess.CalledProcessError:
          failed_files.append(full_path[len(args.src_dir):])

  if failed_files:
    failed_files.sort()
    logging.error("%s files had lint errors:\n%s", len(failed_files),
                  "\n".join(failed_files))
  else:
    logging.info("No lint issues.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  test_case = test_util.TestCase()
  test_case.class_name = "pylint"
  test_case.name = "pylint"
  test_case.time = time.time() - start_time
  if failed_files:
    test_case.failure = "Files with lint issues: {0}".format(
      ", ".join(failed_files))

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
def run():
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name], cwd=args.test_dir, use_print=True)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages.
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace it with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")
  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  source = os.path.join(args.test_dir, "src", "kubeflow")
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow.
  util.run([
    "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
    "--namespace=" + namespace.metadata.name
  ], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand, even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)