def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test function repeatedly and record the outcome on test_case.

    Failures are recorded on ``test_case`` rather than propagated: both
    exception handlers below log and store the failure without re-raising.

    Args:
      test_case: Result object. ``failure`` is set when a trial raises and
        ``time`` is always set to the elapsed wall-clock seconds.
      test_func: Zero-argument callable invoked once per trial. Its
        ``__name__`` is used to build the junit file name.
      args: Parsed arguments. Reads ``project``, ``cluster``, ``zone``,
        ``num_trials``, ``tfjob_version`` and ``artifacts_path``.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        num_trials = args.num_trials
        logging.info("tfjob_version=%s", args.tfjob_version)

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            test_func()

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except tf_operator_util.JobTimeoutError as e:
        if e.job:
            spec = "Job:\n" + json.dumps(e.job, indent=2)
        else:
            spec = "JobTimeoutError did not contain job"
        test_case.failure = "Timeout waiting for job to finish: " + spec
        logging.exception(test_case.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status"
        # in an effort to try to nail down this exception we print out more
        # information about the exception.
        logging.exception("There was a problem running the job; Exception %s", e)
        # We want to catch all exceptions because we want the test as failed.
        # Exceptions only carry a ``message`` attribute on Python 2; fall back
        # to str(e) so this handler cannot itself raise AttributeError.
        message = getattr(e, "message", str(e))
        test_case.failure = ("Exception occured; type {0} message {1}".format(
            e.__class__, message))
    finally:
        # Always record the duration and, if requested, write the junit file,
        # whether the trials passed or failed.
        test_case.time = time.time() - start
        if args.artifacts_path:
            test_util.create_junit_xml_file(
                [test_case],
                args.artifacts_path + "/junit_" + test_func.__name__ + ".xml",
                gcs_client)
def setup(args):
    """Deploy the Kubeflow core component and wait for it to start.

    Args:
      args: Parsed arguments. Reads ``namespace``, ``cluster``, ``project``
        and ``zone`` here; also forwarded to ``create_k8s_client`` and
        ``setup_kubeflow_ks_app``.
    """
    k8s_api = create_k8s_client(args)
    ks_app_dir = setup_kubeflow_ks_app(args, k8s_api)
    target_namespace = args.namespace

    # TODO(jlewi): We don't need to generate a core component if we are
    # just deploying TFServing. Might be better to refactor this code.
    # Deploy Kubeflow
    generate_core = [
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + target_namespace,
    ]
    util.run(generate_core, cwd=ks_app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=ks_app_dir)

    # Block until the TfJob operator deployment is reported as deployed.
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(k8s_api, target_namespace, "tf-job-operator")

    # Block until the JupyterHub stateful set is reported as deployed.
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(k8s_api, target_namespace, "tf-hub")
def get_gke_credentials(test_case):  # pylint: disable=unused-argument
    """Configure kubeconfig to talk to the supplied GKE cluster.

    Runs ``util.configure_kubectl`` for the cluster named by the parsed
    arguments, then rewrites the default kubeconfig file so that users with
    the ``gcp`` auth provider no longer carry the gcloud-generated config.

    Args:
      test_case: Unused here; kept so the signature stays compatible with
        existing callers.
    """
    args = parse_args()
    util.maybe_activate_service_account()
    config_file = os.path.expanduser(kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
    logging.info("Using Kubernetes config file: %s", config_file)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)

    # We want to modify the KUBECONFIG file to remove the gcloud commands
    # for any users that are authenticating using service accounts.
    # This will allow the script to be truly headless and not require gcloud.
    # More importantly, kubectl will properly attach auth.info scope so that
    # RBAC rules can be applied to the email and not the id.
    # See https://github.com/kubernetes/kubernetes/pull/58141
    #
    # TODO(jlewi): We might want to check GOOGLE_APPLICATION_CREDENTIALS
    # to see whether we are actually using a service account. If we aren't
    # using a service account then we might not want to delete the gcloud
    # commands.
    logging.info("Modifying kubeconfig %s", config_file)
    with open(config_file, "r") as hf:
        # safe_load: a kubeconfig is plain data, and yaml.load without an
        # explicit Loader can construct arbitrary objects (and is rejected
        # outright by recent PyYAML releases).
        config = yaml.safe_load(hf)

    for user in config["users"]:
        auth_provider = user.get("user", {}).get("auth-provider", {})
        if auth_provider.get("name") != "gcp":
            continue
        logging.info("Modifying user %s which has gcp auth provider",
                     user["name"])
        if "config" in auth_provider:
            logging.info("Deleting config from user %s", user["name"])
            del auth_provider["config"]

            # This is a hack because the python client library will complain
            # about an invalid config if there is no config field.
            #
            # It looks like the code checks here but that doesn't seem to work
            # https://github.com/kubernetes-client/python-base/blob/master/config/kube_config.py#L209
            auth_provider["config"] = {
                "dummy": "dummy",
            }

    logging.info("Writing update kubeconfig:\n %s", yaml.dump(config))
    # The read handle is closed by now, so the file is reopened for writing
    # only after the whole document has been parsed.
    with open(config_file, "w") as hf:
        yaml.dump(config, hf)
def create_k8s_client(args):
    """Load Kubernetes credentials and build an API client.

    When ``args.cluster`` is set, kubectl is configured for that GKE cluster
    and the resulting kubeconfig is loaded; otherwise the in-cluster
    configuration is loaded.

    Args:
      args: Parsed arguments. Reads ``cluster``, ``project`` and ``zone``.

    Returns:
      A ``k8s_client.ApiClient`` instance.
    """
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    # Callers assign the result of this function and pass it on to the
    # wait_for_* helpers, so the client must actually be returned.
    return api_client
def run(self, tekton_cluster_info, current_cluster_info):
    """Kicks off all the Tekton pipelines.

    kubectl is pointed at the Tekton cluster while the workflows are
    started and is always pointed back at the current cluster afterwards.
    Errors raised while starting workflows are logged, not propagated.

    Args:
      tekton_cluster_info: ClusterInfo having the info to run pipelines on.
        Tekton runs on different cluster right now.
      current_cluster_info: Current cluster info.

    Returns:
      A dict mapping each started workflow's name (and the name of its
      teardown runner, when it has one) to the corresponding UI url.
    """
    ui_urls = {}
    try:
        # Currently only tekton tests run in kf-ci-v1.
        util.configure_kubectl(
            tekton_cluster_info.project,
            tekton_cluster_info.zone,
            tekton_cluster_info.cluster_name,
        )
        util.load_kube_config()

        for workflow in self.workflows:
            workflow.run()
            ui_urls[workflow.name] = workflow.ui_url
            teardown = workflow.teardown_runner
            if teardown:
                ui_urls[teardown.name] = teardown.ui_url
            logging.info("URL for workflow: %s", workflow.ui_url)
    except Exception as err:  # pylint: disable=broad-except
        stack = traceback.format_exc()
        logging.error(
            "Error when starting Tekton workflow: %s;\nstacktrace:\n%s",
            err, stack)
    finally:
        # Restore kubectl
        util.configure_kubectl(
            current_cluster_info.project,
            current_cluster_info.zone,
            current_cluster_info.cluster_name,
        )
        util.load_kube_config()

    return ui_urls
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    """Launch the configured test workflows for a prow job and wait for them.

    The function reads the prow environment variables, works out which files
    changed (presubmit/postsubmit), loads the workflows from the config file,
    submits each one (through ksonnet when the workflow has an ``app_dir``,
    otherwise by shelling out to ``kubeflow.testing.e2e_tool``), waits for
    the workflows and uploads the build log and workflow statuses to GCS.

    Args:
      args: Parsed arguments. Reads ``repos_dir``, ``release``,
        ``config_file``, ``bucket``, ``project``, ``zone`` and ``cluster``;
        also passed to ``get_namespace`` and ``generate_env_from_head``.
      file_handler: Log file handler; ``flush()`` is called and
        ``baseFilename`` is uploaded as ``build-log.txt``.

    Returns:
      The value returned by ``prow_artifacts.finalize_prow_job``.

    Raises:
      util.ExceptionWithWorkflowResults: re-raised after being logged; the
        artifacts are still uploaded by the ``finally`` clause.
    """
    # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
    # for a description of the injected environment variables.
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    base_branch_name = os.getenv("PULL_BASE_REF")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the base branch
        cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

        _ = util.run([
            "git", "fetch", "origin",
            base_branch_name + ":refs/remotes/origin/" + base_branch_name
        ], cwd=cloned_repo_dir)

        diff_command = ["git", "diff", "--name-only"]
        diff_branch = "remotes/origin/{}".format(base_branch_name)
        try:
            common_ancestor = util.run(
                ["git", "merge-base", "HEAD", diff_branch],
                cwd=cloned_repo_dir)
            diff_command.append(common_ancestor)
        except subprocess.CalledProcessError:
            logging.warning(
                "git merge-base failed; see "
                "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                "will be computed against the current master and "
                "therefore files not changed in the PR might be "
                "considered when determining which tests to trigger")
            diff_command.append(diff_branch)
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(
            diff_command,
            cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)
    workflows = []
    config = {}
    if args.config_file:
        config, new_workflows = parse_config_file(args.config_file,
                                                  args.repos_dir)
        workflows.extend(new_workflows)

    # Add any paths to the python path
    extra_py_paths = []
    for p in config.get("python_paths", []):
        # Assume that python_paths are in the format $REPO_OWNER/$REPO_NAME/path,
        # we need to ensure that the repo is checked out if it is different from
        # the current one, and if the repo is not kubeflow/testing (which is already
        # checked out).
        segments = p.split("/")
        if ((segments[0] != repo_owner or segments[1] != repo_name)
                and not p.startswith("kubeflow/testing")):
            logging.info("Need to clone %s/%s", segments[0], segments[1])
            util.clone_repo(
                os.path.join(args.repos_dir, segments[0], segments[1]),
                segments[0], segments[1])

        path = os.path.join(args.repos_dir, p)
        extra_py_paths.append(path)

    kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
    if kf_test_path not in extra_py_paths:
        logging.info("Adding %s to extra python paths", kf_test_path)
        extra_py_paths.append(kf_test_path)

    logging.info("Extra python paths: %s", ":".join(extra_py_paths))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:  # pylint: disable=too-many-nested-blocks
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too large.
        # Workflow name should not be more than 63 characters because its used
        # as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if any files
        # modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit, and if
        # the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.",
                w.name, w.include_dirs)
            continue

        if job_type == "presubmit":
            # When not running under prow we might not set all environment variables
            if os.getenv("PULL_NUMBER"):
                workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            if os.getenv("PULL_PULL_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            if os.getenv("PULL_BASE_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        # Append the last 4 digits of the build number
        if os.getenv("BUILD_NUMBER"):
            workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging. Since the prow should
        # vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)
        workflow_names.append(workflow_name)

        # check if ks workflow and run
        if w.app_dir:
            ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

            # Print ksonnet version
            util.run([ks_cmd, "version"])

            # Create a new environment for this run
            env = workflow_name

            util.run([
                ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)
            ], cwd=w.app_dir)

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "name",
                workflow_name
            ], cwd=w.app_dir)

            # Set the prow environment variables.
            prow_env = []

            names = [
                "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
                "REPO_NAME"
            ]
            names.sort()
            for v in names:
                if not os.getenv(v):
                    continue
                prow_env.append("{0}={1}".format(v, os.getenv(v)))

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "prow_env", ",".join(prow_env)
            ], cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "namespace", get_namespace(args)
            ], cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
                args.bucket
            ], cwd=w.app_dir)
            if args.release:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component,
                    "versionTag", os.getenv("VERSION_TAG")
                ], cwd=w.app_dir)

            # Set any extra params. We do this in alphabetical order to make it easier to verify in
            # the unittest.
            # sorted() works on both Python 2 lists and Python 3 dict views,
            # unlike calling .sort() on the result of .keys().
            for k in sorted(w.params):
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component, k,
                    "{0}".format(w.params[k])
                ], cwd=w.app_dir)

            # For debugging print out the manifest
            util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
            util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)
        else:
            w.kwargs["name"] = workflow_name
            w.kwargs["namespace"] = get_namespace(args)

            if TEST_TARGET_ARG_NAME not in w.kwargs:
                w.kwargs[TEST_TARGET_ARG_NAME] = w.name
                logging.info(
                    "Workflow %s doesn't set arg %s; defaulting to %s",
                    w.name, TEST_TARGET_ARG_NAME,
                    w.kwargs[TEST_TARGET_ARG_NAME])

            # TODO(https://github.com/kubeflow/testing/issues/467): We shell out
            # to e2e_tool in order to dump the Argo workflow to a file which we
            # then reimport. We do this because importing the py_func module
            # appears to break when we have to dynamically adjust sys.path to
            # insert new paths. Setting PYTHONPATH before launching python
            # however appears to work which is why we shell out to e2e_tool.
            command = [
                "python", "-m", "kubeflow.testing.e2e_tool", "show", w.py_func
            ]
            for k, v in w.kwargs.items():
                # The fire module turns underscores in parameter names into hyphens
                # so we convert underscores in parameter names to hyphens
                command.append("--{0}={1}".format(k.replace("_", "-"), v))

            # Only the file name is needed; e2e_tool writes the content.
            with tempfile.NamedTemporaryFile(delete=False) as hf:
                workflow_file = hf.name

            command.append("--output=" + workflow_file)
            env = os.environ.copy()
            env["PYTHONPATH"] = ":".join(extra_py_paths)
            util.run(command, env=env)

            with open(workflow_file) as hf:
                # safe_load: yaml.load without a Loader can construct
                # arbitrary objects and is rejected by recent PyYAML.
                # NOTE(review): assumes e2e_tool emits plain YAML without
                # python-specific tags - confirm.
                wf_result = yaml.safe_load(hf)

            group, version = wf_result['apiVersion'].split('/')
            k8s_co = k8s_client.CustomObjectsApi()
            workflow_name = wf_result["metadata"]["name"]
            py_func_result = k8s_co.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=wf_result["metadata"]["namespace"],
                plural='workflows',
                body=wf_result)
            logging.info("Created workflow:\n%s",
                         yaml.safe_dump(py_func_result))

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    workflow_success = False
    workflow_phase = {}
    workflow_status_yamls = {}
    results = []
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        workflow_success = True
    except util.ExceptionWithWorkflowResults as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        results = e.workflow_results
        raise
    finally:
        prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts_dir, "build-log.txt"))

        # Upload workflow status to GCS.
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            workflow_status_yamls[name] = yaml.safe_dump(
                r, default_flow_style=False)
            # A single non-succeeded workflow marks the whole job as failed.
            if phase != "Succeeded":
                workflow_success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)

        for wf_name, wf_status in workflow_status_yamls.items():
            util.upload_to_gcs(
                wf_status,
                os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

        all_tests_success = prow_artifacts.finalize_prow_job(
            args.bucket, workflow_success, workflow_phase, ui_urls)

    return all_tests_success
def setup_cluster(args):
    """Setup a GKE cluster for TensorFlow jobs.

    Creates the cluster, points kubectl at it, grants the active gcloud
    account cluster-admin and records the outcome in a junit file.

    Args:
      args: Command line arguments that control the setup process. Reads
        ``project``, ``cluster``, ``zone``, ``accelerators`` and
        ``junit_path``.

    Raises:
      subprocess.CalledProcessError, util.TimeoutError: re-raised after the
        failure has been recorded on the test case.
    """
    # Imported locally so this block does not depend on a file-level import
    # that is not visible from here.
    import subprocess  # pylint: disable=import-outside-toplevel

    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType": machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            # Each spec has the form <type>=<count>.
            accelerator_type, accelerator_count = accelerator_spec.split(
                "=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount": accelerator_count,
                "acceleratorType": accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()  # pylint: disable=unused-variable

    t = test_util.TestCase()
    # Taken before the try block so the finally clause can always compute
    # the elapsed time.
    start = time.time()
    try:
        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user=" + account
        ])
        # NOTE(review): in the text this block was rebuilt from, everything
        # between the "--user=" argument above and the "setup-cluster failed"
        # handler below had been overwritten by a "******" redaction marker.
        # Any setup steps that originally ran here (api_client is otherwise
        # unused) must be restored from version control.
    except subprocess.CalledProcessError as e:
        # NOTE(review): the exception type is inferred from the use of
        # e.output in the surviving handler body - confirm against history.
        t.failure = "setup-cluster failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        # ``message`` only exists on Python 2 style exceptions.
        t.failure = getattr(e, "message", str(e))
        raise
    finally:
        t.time = time.time() - start
        t.name = "setup-cluster"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
    """Test deploying Kubeflow.

    Initializes a ksonnet app in ``args.test_dir``, replaces the vendored
    kubeflow registry with a symlink to the checked out source, deploys the
    core component and waits for the TfJob operator and JupyterHub. When
    ``args.deploy_tf_serving`` is set the inception model server is deployed
    as well.

    Args:
      args: Parsed arguments. Reads ``test_dir``, ``namespace``,
        ``github_token``, ``cluster``, ``project``, ``zone``,
        ``deploy_tf_serving`` and ``model_server_image``.
    """
    api_client = create_k8s_client(args)

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if args.github_token:
        # The token is a credential, so its value is deliberately kept out
        # of the log.
        logging.info("Setting GITHUB_TOKEN.")
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                        "limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

    if args.deploy_tf_serving:
        logging.info("Deploying tf-serving.")
        util.run([
            "ks", "generate", "tf-serving", "modelServer", "--name=inception",
            "--namespace=" + namespace.metadata.name,
            "--model_path=gs://kubeflow-models/inception",
            "--model_server_image=" + args.model_server_image
        ], cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "modelServer",
        ]
        util.run(apply_command, cwd=app_dir)

        # Reading the service is kept for its side effect: the call is still
        # made against the API server even though the result is not used.
        core_api = k8s_client.CoreV1Api(api_client)
        core_api.read_namespaced_service("inception", namespace.metadata.name)

        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 "inception")
        logging.info("Verified TF serving started.")
def setup(args):
    """Test deploying Kubeflow.

    Deploys Kubeflow into a freshly named namespace, then always attempts to
    delete that namespace and writes a junit file containing both the deploy
    and the teardown test cases.

    Args:
      args: Parsed arguments. Reads ``cluster``, ``project``, ``zone``,
        ``test_dir``, ``github_token`` and ``artifacts_dir``.
    """
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    # Each run gets its own namespace, named after the run label.
    namespace_name = run_label

    def run():
        """Deploy Kubeflow into the test namespace and wait for it."""
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        # os.environ only accepts strings, so an unset token is skipped
        # instead of raising TypeError.
        if args.github_token:
            os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run(["ks", "init", app_name], cwd=args.test_dir)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        REPO_ORG = "kubeflow"
        REPO_NAME = "kubeflow"
        REGISTRY_PATH = "kubeflow"
        source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                              REGISTRY_PATH)
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run(["ks", "generate", "core", "kubeflow-core",
                  "--name=kubeflow-core",
                  "--namespace=" + namespace.metadata.name], cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            """Delete the namespace created for this run."""
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            # ``message`` only exists on Python 2 style exceptions; fall back
            # to str(e) so the handler cannot raise AttributeError.
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, getattr(e, "message", str(e)))
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
def run():
    """Deploy Kubeflow into the test namespace and wait for it to start.

    Reads ``api_client``, ``namespace_name`` and ``args`` from the enclosing
    scope; they are not parameters of this function.
    """
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    # os.environ only accepts strings, so an unset token is skipped instead
    # of raising TypeError.
    if args.github_token:
        os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core",
              "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    """Submit the configured Argo workflows with ksonnet and wait for them.

    For every workflow a uniquely named ksonnet environment is created, the
    prow environment variables and workflow parameters are set on the
    component, and the component is applied. The function then waits for all
    submitted workflows, finalizes the prow job and uploads the build log.

    Args:
      args: Parsed command line arguments. This function reads ``release``,
        ``config_file``, ``repos_dir``, ``app_dir``, ``component``,
        ``bucket``, ``project``, ``zone`` and ``cluster``.
      file_handler: Logging file handler; it is flushed and its
        ``baseFilename`` is uploaded to GCS as ``build-log.txt``.

    Returns:
      The value returned by ``prow_artifacts.finalize_prow_job``.

    Raises:
      Exception: Any error other than ``util.TimeoutError`` raised while
        waiting for the workflows is logged and re-raised after the job has
        been finalized and the log uploaded.
    """
    # Print ksonnet version
    util.run(["ks", "version"])
    if args.release:
        generate_env_from_head(args)

    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    if args.app_dir and args.component:
        # TODO(jlewi): We can get rid of this branch once all repos are using
        # a prow_config.xml file.
        workflows.append(
            WorkflowComponent("legacy", args.app_dir, args.component, {}))

    create_started_file(args.bucket)

    util.maybe_activate_service_account()
    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow.
        # We truncate sha numbers to prevent the workflow name from being too
        # large. Workflow name should not be more than 63 characters because
        # it's used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        job_type = os.getenv("JOB_TYPE")
        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow
        # should vend unique build numbers for each job.
        salt = uuid.uuid4().hex[0:4]
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)

        # Create a new environment for this run.
        env = workflow_name
        util.run(["ks", "env", "add", env], cwd=w.app_dir)

        util.run(["ks", "param", "set", "--env=" + env, w.component,
                  "name", workflow_name], cwd=w.app_dir)

        # Set the prow environment variables; unset or empty ones are skipped.
        names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                 "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA",
                 "REPO_OWNER", "REPO_NAME"]
        prow_env = ["{0}={1}".format(v, os.getenv(v))
                    for v in sorted(names) if os.getenv(v)]

        util.run(["ks", "param", "set", "--env=" + env, w.component,
                  "prow_env", ",".join(prow_env)], cwd=w.app_dir)
        util.run(["ks", "param", "set", "--env=" + env, w.component,
                  "namespace", get_namespace(args)], cwd=w.app_dir)
        util.run(["ks", "param", "set", "--env=" + env, w.component,
                  "bucket", args.bucket], cwd=w.app_dir)
        if args.release:
            util.run(["ks", "param", "set", "--env=" + env, w.component,
                      "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it
        # easier to verify in the unittest. sorted() is used instead of
        # keys().sort() because dict views have no sort() on Python 3.
        for k in sorted(w.params):
            util.run(["ks", "param", "set", "--env=" + env, w.component, k,
                      "{0}".format(w.params[k])], cwd=w.app_dir)

        # For debugging print out the manifest.
        util.run(["ks", "show", env, "-c", w.component], cwd=w.app_dir)
        util.run(["ks", "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            api_client, get_namespace(args), workflow_names,
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in
        # the build-log.txt that is uploaded to Gubernator. logging.exception
        # also records the traceback, which would otherwise be lost from the
        # uploaded log.
        logging.exception("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(
            args.bucket, success, workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in GCS.
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
def run():
    """Deploy the Kubeflow core component with ksonnet and wait for it to start.

    Reads ``api_client``, ``namespace_name`` and ``args`` from the surrounding
    scope. The steps are: create the test namespace, initialise a ksonnet app
    under ``args.test_dir``, install the Kubeflow packages, replace the
    vendored package directory with a symlink to the checked out source,
    apply the ``kubeflow-core`` component, and finally block on the TfJob
    operator deployment and the ``tf-hub`` stateful set.
    """
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)

    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    ks_app = "kubeflow-test"
    util.run(["ks", "init", ks_app], cwd=args.test_dir)
    ks_app_dir = os.path.join(args.test_dir, ks_app)

    util.run(
        ["ks", "registry", "add", "kubeflow",
         "github.com/kubeflow/kubeflow/tree/master/kubeflow"],
        cwd=ks_app_dir)

    # Install required packages.
    for package in ("kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"):
        util.run(["ks", "pkg", "install", package], cwd=ks_app_dir)

    # Delete the vendor directory and replace it with a symlink to the src
    # so that we use the code at the desired commit.
    vendor_dir = os.path.join(ks_app_dir, "vendor", "kubeflow")
    logging.info("Deleting %s", vendor_dir)
    shutil.rmtree(vendor_dir)

    # Layout is <test_dir>/src/<repo org>/<repo name>/<registry path>.
    checkout_dir = os.path.join(
        args.test_dir, "src", "kubeflow", "kubeflow", "kubeflow")
    logging.info("Creating link %s -> %s", vendor_dir, checkout_dir)
    os.symlink(checkout_dir, vendor_dir)

    # Deploy Kubeflow.
    target_namespace = namespace.metadata.name
    util.run(
        ["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
         "--namespace=" + target_namespace],
        cwd=ks_app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=ks_app_dir)

    # Verify that the TfJob operator is actually deployed.
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, target_namespace, "tf-job-operator")

    # Verify that JupyterHub is actually deployed.
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, target_namespace, "tf-hub")
def setup(args):
    """Test deploying Kubeflow.

    Configures Kubernetes access, deploys Kubeflow into a freshly named
    namespace, deletes that namespace again, and writes a junit file with two
    test cases: the deployment itself and the namespace teardown.

    Args:
      args: Parsed command line arguments. This function reads ``cluster``,
        ``project``, ``zone``, ``test_dir``, ``github_token`` and
        ``artifacts_dir``.
    """
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        """Deploy Kubeflow core with ksonnet and wait for it to come up."""
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)

        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run(["ks", "init", app_name], cwd=args.test_dir)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages.
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace it with a symlink to the
        # src so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        REPO_ORG = "kubeflow"
        REPO_NAME = "kubeflow"
        REGISTRY_PATH = "kubeflow"
        source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                              REGISTRY_PATH)
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow.
        util.run(["ks", "generate", "core", "kubeflow-core",
                  "--name=kubeflow-core",
                  "--namespace=" + namespace.metadata.name], cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper
        # service account. This might have something to do with the way
        # ksonnet gets its credentials; maybe we need to configure
        # credentials after calling ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]
        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace.
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            """Delete the namespace created for this run."""
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            # Log the exception itself rather than e.message: the attribute
            # does not exist on Python 3 and is empty for multi-argument
            # exceptions on Python 2, and an AttributeError raised here would
            # prevent the junit file below from being written.
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e)

        # Written inside ``finally`` so results are recorded even when the
        # deployment test raised.
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
def setup(args):
    """Setup a GKE cluster for TensorFlow jobs.

    Creates the cluster, configures kubectl for it, deploys the Kubeflow
    ``core`` component, waits for the TfJob operator deployment and records
    the outcome as a junit test case.

    Args:
      args: Command line arguments that control the setup process. This
        function reads ``project``, ``cluster``, ``zone``, ``accelerators``,
        ``image``, ``namespace``, ``tf_job_version``, ``test_app_dir`` and
        ``junit_path``.

    Raises:
      ValueError: If ``args.tf_job_version`` is not a recognized version.
      subprocess.CalledProcessError: Re-raised after being recorded on the
        test case.
      util.TimeoutError: Re-raised after being recorded on the test case.
    """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType": machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            # Each spec has the form "<type>=<count>".
            accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount": accelerator_count,
                "acceleratorType": accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    # Bound before the try so the finally clause can always compute a
    # duration.
    start = time.time()
    try:
        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        # NOTE(review): the text between "--user=" and the "v1alpha1" check
        # was destroyed by a redaction artifact in this copy of the file. The
        # next two statements and the opening of the ``if`` below were
        # reconstructed from the surrounding tokens and from the sibling
        # setup_kubeflow() function; confirm against version control.
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user=" + account
        ])

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha1":
            tf_job_deployment_name = "tf-job-operator"
        elif args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        else:
            raise ValueError(
                "Unrecognized value for tf_job_version %s" % args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the
        # correct one.
        util.wait_for_deployment(api_client, args.namespace,
                                 tf_job_deployment_name)

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        # str(e) instead of e.message: the attribute was removed in Python 3.
        t.failure = str(e)
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a TFJob test and record the outcome as a junit test case.

    Each trial applies the ksonnet component, waits for the job to start,
    optionally terminates replicas according to ``args.shutdown_policy``,
    waits for the job to finish, checks the result, logs the K8s events for
    the job, and finally deletes the job so the next trial can reuse the name.

    Failures are recorded on the test case instead of being raised; the junit
    file is written when ``args.junit_path`` is set.

    Args:
      args: Parsed command line arguments. This function reads ``project``,
        ``cluster``, ``zone``, ``component``, ``app_dir``, ``tfjob_version``,
        ``shutdown_policy`` and ``junit_path``, plus the optional
        ``num_trials`` (defaults to 2 when absent or None).
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it
    # once as part of GKE cluster creation and store the config in the NFS
    # directory. This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    master_host = api_client.configuration.host

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = _setup_ks_app(args)
    t.name = os.path.basename(name)

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with
        # the same name.
        num_trials = getattr(args, "num_trials", None)
        if num_trials is None:
            num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            logging.info("tfjob_version=%s", args.tfjob_version)

            # Wait for the job to either be in Running state or a terminal
            # state.
            if args.tfjob_version == "v1alpha1":
                logging.info("Wait for Phase Running, Done, or Failed")
                results = tf_job_client.wait_for_phase(
                    api_client, namespace, name,
                    ["Running", "Done", "Failed"],
                    status_callback=tf_job_client.log_status)
            else:
                logging.info(
                    "Wait for conditions Running, Succeeded, or Failed")
                results = tf_job_client.wait_for_condition(
                    api_client, namespace, name,
                    ["Running", "Succeeded", "Failed"],
                    status_callback=tf_job_client.log_status)

            logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

            # The job is now either running or done.
            if args.shutdown_policy:
                logging.info("Enforcing shutdownPolicy %s",
                             args.shutdown_policy)
                if args.shutdown_policy in ["master", "chief"]:
                    if args.tfjob_version == "v1alpha1":
                        replica = "master"
                    else:
                        replica = "chief"
                elif args.shutdown_policy in ["worker", "all_workers"]:
                    replica = "worker"
                else:
                    raise ValueError("Unrecognized shutdown_policy "
                                     "%s" % args.shutdown_policy)

                # Number of targets.
                num_targets = 1
                if args.shutdown_policy in ["all_workers"]:
                    # Assume v1alpha2
                    num_targets = results.get("spec", {}).get(
                        "tfReplicaSpecs", {}).get("Worker", {}).get(
                            "replicas", 0)
                    logging.info("There are %s worker replicas", num_targets)

                if args.tfjob_version == "v1alpha1":
                    runtime_id = results.get("spec", {}).get("RuntimeId")
                    target = "{name}-{replica}-{runtime}".format(
                        name=name, replica=replica, runtime=runtime_id)
                    pod_labels = get_labels(name, runtime_id)
                    pod_selector = to_selector(pod_labels)
                else:
                    target = "{name}-{replica}".format(
                        name=name, replica=replica)
                    pod_labels = get_labels_v1alpha2(namespace, name)
                    pod_selector = to_selector(pod_labels)

                # Wait for the pods to be ready before we shutdown.
                # TODO(jlewi): We get pods using a label selector so there is
                # a risk that the pod we actually care about isn't present.
                logging.info(
                    "Waiting for pods to be running before shutting down.")
                wait_for_pods_to_be_in_phases(
                    api_client, namespace, pod_selector, ["Running"],
                    timeout=datetime.timedelta(minutes=4))
                logging.info("Pods are ready")
                logging.info("Issuing the terminate request")
                for num in range(num_targets):
                    full_target = target + "-{0}".format(num)
                    terminateReplica(master_host, namespace, full_target)

            logging.info("Waiting for job to finish.")
            results = tf_job_client.wait_for_job(
                api_client, namespace, name, args.tfjob_version,
                status_callback=tf_job_client.log_status)

            if args.tfjob_version == "v1alpha1":
                # A missing state is treated as "not succeeded". The previous
                # default of {} raised AttributeError on .lower() and hid the
                # real job status.
                state = results.get("status", {}).get("state") or ""
                if state.lower() != "succeeded":
                    t.failure = (
                        "Trial {0} Job {1} in namespace {2} in state {3}"
                    ).format(trial, name, namespace,
                             results.get("status", {}).get("state", None))
                    logging.error(t.failure)
                    break
            else:
                # For v1alpha2 the last condition must be Succeeded. An empty
                # or missing condition list counts as a failure instead of
                # raising IndexError.
                conditions = results.get("status", {}).get("conditions") or []
                last_condition = conditions[-1] if conditions else {}
                if last_condition.get("type", "").lower() != "succeeded":
                    t.failure = (
                        "Trial {0} Job {1} in namespace {2} in status {3}"
                    ).format(trial, name, namespace,
                             results.get("status", {}))
                    logging.error(t.failure)
                    break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            for event in events:
                logging.info("K8s event: %s", event.message)

            # Print out the K8s events because it can be useful for debugging.
            for event in events:
                logging.info("Recieved K8s Event:\n%s", event)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            if args.tfjob_version == "v1alpha1":
                for replica in results.get("spec", {}).get("replicaSpecs", []):
                    num_expected += replica.get("replicas", 0)
            else:
                replica_specs = results.get("spec", {}).get(
                    "tfReplicaSpecs", {})
                for replica_key in replica_specs:
                    replica_spec = replica_specs.get(replica_key, {})
                    if replica_spec:
                        num_expected += replica_spec.get("replicas", 1)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                # TODO(jlewi): Starting with
                # https://github.com/kubeflow/tf-operator/pull/646 the number
                # of events no longer seems to match the expected; it looks
                # like maybe events are being combined? For now we just log a
                # warning rather than an error.
                logging.warning(creation_failures)

            # We don't wait for pods to be deleted in v1alpha2 because
            # CleanPodPolicy means completed pods won't be deleted.
            # TODO(jlewi): We should add a test to deal with deleted pods.
            #
            # The selector is only needed on this path, so it is only built
            # here. The previous code also called get_labels_v1alpha2(name)
            # for v1alpha2, with a different arity from the call above and an
            # unused result.
            if args.tfjob_version == "v1alpha1":
                pod_selector = to_selector(get_labels(name, runtime_id))
                wait_for_pods_to_be_deleted(api_client, namespace,
                                            pod_selector)

            tf_job_client.delete_tf_job(api_client, namespace, name,
                                        version=args.tfjob_version)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client, namespace, name, args.tfjob_version,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various
        # resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.exception(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to try to nail down this exception we print
        # out more information about the exception.
        logging.exception(
            "There was a problem running the job; Exception %s", e)
        # We want to catch all exceptions because we want to mark the test as
        # failed. The exception itself is formatted rather than e.message,
        # which does not exist on Python 3 and would raise from inside this
        # handler.
        t.failure = ("Exception occured; type {0} message {1}".format(
            e.__class__, e))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    """Trigger the Argo workflows relevant to this prow job and wait for them.

    The changed files are computed with ``git diff`` for presubmit and
    postsubmit jobs. Workflows scoped to another job type, or whose
    ``include_dirs`` patterns match none of the changed files, are skipped.
    Each remaining workflow gets its own ksonnet environment and is applied.
    After waiting, the build log and per-workflow status YAML are uploaded
    and the prow job is finalized.

    Args:
      args: Parsed command line arguments. This function reads ``repos_dir``,
        ``release``, ``config_file``, ``bucket``, ``project``, ``zone`` and
        ``cluster``.
      file_handler: Logging file handler; it is flushed and its
        ``baseFilename`` is uploaded to GCS as ``build-log.txt``.

    Returns:
      The value returned by ``prow_artifacts.finalize_prow_job``.

    Raises:
      util.ExceptionWithWorkflowResults: Re-raised after the artifacts have
        been uploaded and the job finalized.
    """
    # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
    # for a description of the injected environment variables.
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    base_branch_name = os.getenv("PULL_BASE_REF")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the base branch.
        cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

        util.run(["git", "fetch", "origin",
                  base_branch_name + ":refs/remotes/origin/" + base_branch_name],
                 cwd=cloned_repo_dir)

        diff_command = ["git", "diff", "--name-only"]
        diff_branch = "remotes/origin/{}".format(base_branch_name)
        try:
            common_ancestor = util.run(
                ["git", "merge-base", "HEAD", diff_branch],
                cwd=cloned_repo_dir)
            # Strip defensively: stray whitespace around the commit id would
            # make it an invalid revision argument for git diff.
            diff_command.append(common_ancestor.strip())
        except subprocess.CalledProcessError:
            logging.warning(
                "git merge-base failed; see "
                "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                "will be computed against the current master and "
                "therefore files not changed in the PR might be "
                "considered when determining which tests to trigger")
            diff_command.append(diff_branch)
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha.
        diff_command = ["git", "diff", "--name-only",
                        pull_base_sha + "^", pull_base_sha]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(
            diff_command,
            cwd=os.path.join(args.repos_dir, repo_owner, repo_name)
        ).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)

    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    # Create an initial version of the file with no urls.
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow.
        # We truncate sha numbers to prevent the workflow name from being too
        # large. Workflow name should not be more than 63 characters because
        # it's used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

        # Print ksonnet version
        util.run([ks_cmd, "version"])

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info("Skipping workflow %s because job type %s is not one of "
                         "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if
        # any files modified match the specified patterns (fnmatch globs).
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info("Triggering workflow %s because %s in dir %s is modified.",
                                     w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit,
        # and if the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info("Skipping workflow %s because no code modified in %s.",
                         w.name, w.include_dirs)
            continue

        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        # Append the last 4 digits of the build number.
        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow
        # should vend unique build numbers for each job.
        salt = uuid.uuid4().hex[0:4]
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)

        # Create a new environment for this run.
        env = workflow_name

        util.run([ks_cmd, "env", "add", env,
                  "--namespace=" + get_namespace(args)], cwd=w.app_dir)

        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "name", workflow_name], cwd=w.app_dir)

        # Set the prow environment variables; unset or empty ones are skipped.
        names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                 "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA",
                 "REPO_OWNER", "REPO_NAME"]
        prow_env = ["{0}={1}".format(v, os.getenv(v))
                    for v in sorted(names) if os.getenv(v)]

        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "prow_env", ",".join(prow_env)], cwd=w.app_dir)
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "namespace", get_namespace(args)], cwd=w.app_dir)
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "bucket", args.bucket], cwd=w.app_dir)
        if args.release:
            util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                      "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it
        # easier to verify in the unittest. sorted() is used instead of
        # keys().sort() because dict views have no sort() on Python 3.
        for k in sorted(w.params):
            util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                      "{0}".format(w.params[k])], cwd=w.app_dir)

        # For debugging print out the manifest.
        util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
        util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                  "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs.
    create_started_file(args.bucket, ui_urls)

    workflow_success = False
    workflow_phase = {}
    workflow_status_yamls = {}
    results = []
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args), workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status
        )
        workflow_success = True
    except util.ExceptionWithWorkflowResults as e:
        # We explicitly log any exceptions so that they will be captured in
        # the build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        # Keep the partial results so their status is still uploaded below.
        results = e.workflow_results
        raise
    finally:
        prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
        # Upload logs to GCS. No logs after this point will appear in the
        # file in GCS.
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts_dir, "build-log.txt"))

        # Upload workflow status to GCS.
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            workflow_status_yamls[name] = yaml.safe_dump(
                r, default_flow_style=False)
            if phase != "Succeeded":
                workflow_success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)

        for wf_name, wf_status in workflow_status_yamls.items():
            util.upload_to_gcs(
                wf_status,
                os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

        all_tests_success = prow_artifacts.finalize_prow_job(
            args.bucket, workflow_success, workflow_phase, ui_urls)

    return all_tests_success
def setup_kubeflow(args):
    """Setup Kubeflow.

    Configures kubectl for the cluster, deploys the Kubeflow ``core``
    component, waits for the TfJob operator deployment and records the
    outcome as a junit test case.

    Args:
      args: Command line arguments that control the setup process. This
        function reads ``project``, ``cluster``, ``zone``, ``image``,
        ``namespace``, ``tf_job_version``, ``test_app_dir`` and
        ``junit_path``.

    Raises:
      ValueError: If ``args.tf_job_version`` is not a recognized version.
      subprocess.CalledProcessError: Re-raised after being recorded on the
        test case.
      util.TimeoutError: Re-raised after being recorded on the test case.
    """
    project = args.project
    cluster_name = args.cluster
    zone = args.zone

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    # Bound before the try so the finally clause can always compute a
    # duration.
    start = time.time()
    try:
        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        elif args.tf_job_version == "v1beta1":
            tf_job_deployment_name = "tf-job-operator-v1beta1"
        else:
            raise ValueError("Unrecognized value for tf_job_version %s" %
                             args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the
        # correct one.
        try:
            util.wait_for_deployment(api_client, args.namespace,
                                     tf_job_deployment_name)
        finally:
            # Run kubectl describe to get useful information about the
            # deployment. This will help troubleshoot any errors.
            util.run([
                "kubectl", "-n", args.namespace, "describe", "deploy",
                tf_job_deployment_name
            ])
            util.run([
                "kubectl", "-n", args.namespace, "describe", "pods", "-l",
                "name=tf-job-operator"
            ])

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        # str(e) instead of e.message: the attribute was removed in Python 3.
        t.failure = str(e)
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
    """Submit every applicable E2E workflow and wait for them to finish.

    For each workflow parsed from args.config_file a ksonnet environment is
    created, its parameters are set, and the component is applied. The
    function then waits on all submitted workflows through
    argo_client.wait_for_workflows, finalizes the prow job and uploads the
    build log.

    Args:
      args: Parsed command line arguments. The attributes read are repos_dir,
        release, config_file, bucket, project, zone and cluster (plus
        whatever get_namespace and generate_env_from_head read).
      file_handler: Logging file handler. It is flushed and the file named by
        its baseFilename is uploaded as build-log.txt.

    Returns:
      The value returned by prow_artifacts.finalize_prow_job.
    """
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        if job_type == "presubmit":
            diff_command = ["git", "diff", "--name-only", "master"]
        else:
            diff_command = ["git", "diff", "--name-only",
                            pull_base_sha + "^", pull_base_sha]
        changed_files = util.run(
            diff_command,
            cwd=os.path.join(args.repos_dir, repo_owner,
                             repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)

    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too
        # large. Workflow name should not be more than 63 characters because
        # its used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        ks_cmd = get_ksonnet_cmd(w)

        # Print ksonnet version
        util.run([ks_cmd, "version"])

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info("Skipping workflow %s because job type %s is not one of "
                         "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if
        # any files modified match the specified patterns. Only the first
        # matching (file, pattern) pair is needed.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info("Triggering workflow %s because %s in dir %s is modified.",
                                     w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit,
        # and if the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info("Skipping workflow %s because no code modified in %s.",
                         w.name, w.include_dirs)
            continue

        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging. Since the prow
        # should vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)
        workflow_names.append(workflow_name)

        # Create a new environment for this run
        env = workflow_name

        util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "name", workflow_name], cwd=w.app_dir)

        # Set the prow environment variables; unset or empty ones are omitted.
        names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                 "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA",
                 "REPO_OWNER", "REPO_NAME"]
        prow_env = ["{0}={1}".format(v, os.getenv(v))
                    for v in sorted(names) if os.getenv(v)]

        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "prow_env", ",".join(prow_env)], cwd=w.app_dir)
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "namespace", get_namespace(args)], cwd=w.app_dir)
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "bucket", args.bucket], cwd=w.app_dir)
        if args.release:
            util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                      "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it
        # easier to verify in the unittest. sorted() is used because dict
        # views have no sort() method on Python 3.
        for k in sorted(w.params):
            util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                      "{0}".format(w.params[k])], cwd=w.app_dir)

        # For debugging print out the manifest
        util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
        util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                  "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args), workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.exception("Time out waiting for Workflows %s to finish",
                          ",".join(workflow_names))
    except Exception as e:
        # An unexpected error must not be finalized as a successful job.
        success = False
        # We explicitly log any exceptions so that they will be captured in
        # the build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs. This runs inside finally so the log is uploaded even
        # when the exception above is re-raised.
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
def run(args, file_handler):
    """Submit a single E2E workflow with ksonnet and wait for it to finish.

    Args:
      args: Parsed command line arguments. The attributes read are bucket,
        project, zone, cluster, app_dir and component.
      file_handler: Logging file handler. It is flushed and the file named by
        its baseFilename is uploaded as build-log.txt.

    Returns:
      True if the workflow finished in phase "Succeeded", False otherwise
      (including when waiting for the workflow timed out).
    """
    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    def set_param(key, value):
        """Set one ksonnet parameter on the component for this run's env."""
        util.run(
            ["ks", "param", "set", "--env=" + env, args.component, key, value],
            cwd=args.app_dir)

    # Build the workflow name from the prow job metadata.
    # Sha numbers are truncated to keep the name short: it should not exceed
    # 63 characters because it is used as a label on the pods.
    job_type = os.getenv("JOB_TYPE")
    suffixes = []
    if job_type == "presubmit":
        suffixes.append(os.getenv("PULL_NUMBER"))
        suffixes.append(os.getenv("PULL_PULL_SHA")[0:7])
    elif job_type == "postsubmit":
        suffixes.append(os.getenv("PULL_BASE_SHA")[0:7])
    suffixes.append(os.getenv("BUILD_NUMBER"))
    # A little salt helps when jobs are submitted by hand for
    # testing/debugging; prow itself should vend unique build numbers.
    suffixes.append(uuid.uuid4().hex[0:4])

    workflow_name = os.getenv("JOB_NAME") + "".join(
        "-{0}".format(part) for part in suffixes)

    # Each run gets its own ksonnet environment, named after the workflow.
    env = workflow_name
    util.run(["ks", "env", "add", env], cwd=args.app_dir)
    set_param("name", workflow_name)

    util.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Forward the prow environment variables that are set, in sorted order.
    prow_vars = sorted([
        "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
        "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
    ])
    prow_env = [
        "{0}={1}".format(var, os.getenv(var))
        for var in prow_vars
        if os.getenv(var)
    ]

    set_param("prow_env", ",".join(prow_env))
    set_param("namespace", NAMESPACE)
    set_param("bucket", args.bucket)

    # Print the manifest for debugging, then submit it.
    util.run(["ks", "show", env, "-c", args.component], cwd=args.app_dir)
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    ui_url = (
        "http://testing-argo.kubeflow.io/timeline/kubeflow-test-infra/{0}"
        ";tab=workflow".format(workflow_name))
    logging.info("URL for workflow: %s", ui_url)

    success = False
    try:
        results = argo_client.wait_for_workflow(
            api_client,
            NAMESPACE,
            workflow_name,
            status_callback=argo_client.log_status)
        final_phase = results["status"]["phase"]
        success = final_phase == "Succeeded"
        logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                     workflow_name, final_phase)
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflow %s/%s to finish",
                      NAMESPACE, workflow_name)
    finally:
        create_finished_file(args.bucket, success)

        # NOTE(review): the log upload is kept inside the finally clause so
        # it also runs when an unexpected error propagates; confirm this
        # matches the intended extent of the finally block.
        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success