def check_statefulsets_ready(record_xml_attribute, namespace, name, stateful_sets):
    """Verify that each of the given stateful sets becomes ready.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace to check.
      name: Name recorded for this check in the junit output.
      stateful_sets: Iterable of stateful set names to wait for.

    Raises:
      Exception: If waiting for a stateful set fails. ``kubectl describe`` is
        run on the stateful set first to collect debug information.
    """
    set_logging()
    # TODO(jlewi): Should we do this in the calling function)?
    util.set_pytest_junit(record_xml_attribute, name)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

    api_client = deploy_utils.create_k8s_client()
    util.load_kube_config()

    for set_name in stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     namespace, set_name)
        try:
            util.wait_for_statefulset(api_client, namespace, set_name)
        except Exception as e:
            # Collect debug information by running describe
            util.run(["kubectl", "-n", namespace, "describe", "statefulsets",
                      set_name])
            # Report the stateful set that failed (not the junit test name)
            # and keep the original failure as the cause.
            raise Exception(
                f"Stateful set {namespace}.{set_name} is not ready") from e
def test_build_kfctl_go(record_xml_attribute, app_path, project, use_basic_auth,
                        use_istio, config_path, build_and_apply,
                        kfctl_repo_path, cluster_name, values):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use. Not used by this function body.
      use_basic_auth: Whether to use basic_auth. Not used by this function body.
      use_istio: Whether to use Istio or not. Not used by this function body.
      config_path: Path to the KFDef spec file.
      cluster_name: Name of EKS cluster
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      values: Comma separated list of ``key=value`` variables to substitute
        into config_path

    Raises:
      ValueError: If an entry of ``values`` contains no "=".
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy_kubeflow")

    if values:
        path_vars = {}
        for pair in values.split(","):
            # Split on the first "=" only so a value may itself contain "=".
            key, value = pair.split("=", 1)
            path_vars[key] = value

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_path = os.path.join(kfctl_repo_path, "bin", "kfctl")
    app_path = kfctl_util.kfctl_deploy_kubeflow(
        app_path, config_path, kfctl_path, build_and_apply, cluster_name)
    logging.info("kubeflow app path: %s", app_path)
def test_endpoint_is_ready(record_xml_attribute, project, app_path, app_name,
                           use_basic_auth):
    """Check that the deployment's endpoint is ready.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      project: The gcp project that we deployed kubeflow
      app_path: Directory containing ``login.json``; only read when
        use_basic_auth is set.
      app_name: The name of the kubeflow deployment
      use_basic_auth: Selects the basic-auth readiness check instead of the
        IAP one.

    Raises:
      Exception: If the selected readiness check reports not ready.
    """
    util.set_pytest_junit(record_xml_attribute, "test_endpoint_is_ready")

    url = "https://{}.endpoints.{}.cloud.goog".format(app_name, project)

    if not use_basic_auth:
        # Owned by project kubeflow-ci-deployment.
        os.environ["CLIENT_ID"] = (
            "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g.apps.googleusercontent.com")
        if gcp_util.iap_is_ready(url):
            return
        raise Exception("IAP endpoint is not ready")

    login_file = os.path.join(app_path, "login.json")
    with open(login_file, "r") as fh:
        credentials = json.load(fh)

    # Let it fail if login info cannot be found.
    user = credentials["KUBEFLOW_USERNAME"]
    secret = credentials["KUBEFLOW_PASSWORD"]
    if gcp_util.basic_auth_is_ready(url, user, secret):
        return
    raise Exception("Basic auth endpoint is not ready")
def test_run_notebook(record_xml_attribute, namespace,  # pylint: disable=too-many-branches,too-many-statements
                      image_file, notebook_path, test_target_name,
                      artifacts_gcs):
    """Run a notebook through ``nb_test_util.run_papermill_job``.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: Namespace passed to the papermill job.
      image_file: Path to a YAML file whose ``image`` field holds the URI of
        the docker image to run the notebook in.
      notebook_path: Path of the notebook; its basename is used to derive the
        junit name and the job name.
      test_target_name: Passed through to ``util.set_pytest_junit``.
      artifacts_gcs: Passed through to ``run_papermill_job``.

    Raises:
      ValueError: If image_file is empty, or the file does not contain a
        mapping with an ``image`` field.
    """
    if not image_file:
        raise ValueError("image_file must provided")

    notebook_name = os.path.basename(notebook_path).replace(
        ".ipynb", "").replace("_", "-")
    junit_name = "_".join(["test", notebook_name])
    util.set_pytest_junit(record_xml_attribute, junit_name, test_target_name)

    # Time stamp plus a short random suffix, joined onto the notebook name.
    name = "-".join([notebook_name,
                     datetime.datetime.now().strftime("%H%M%S"),
                     uuid.uuid4().hex[0:3]])

    logging.info(f"Reading file {image_file}")
    contents = util.read_file(image_file)
    # safe_load: plain yaml.load() without an explicit Loader is deprecated
    # and rejected by newer PyYAML releases; this file only needs plain data.
    image_data = yaml.safe_load(contents)

    # An empty file loads as None and a scalar as a str; treat both the same
    # as a missing "image" field instead of failing with a TypeError.
    if not isinstance(image_data, dict) or "image" not in image_data:
        raise ValueError(f"File {image_file} is missing field image containing "
                         f"the URI of the docker image to run the notebook in")

    image = image_data["image"]
    logging.info(f"Using image {image}")
    nb_test_util.run_papermill_job(notebook_path, name, namespace, image,
                                   artifacts_gcs)
def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path, cluster_name):
    """Run ``kfctl delete`` against ``<app_path>/tmp.yaml`` with retries.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Directory containing ``tmp.yaml``; also the working directory
        of the delete command.
      cluster_name: Passed to ``kfctl_aws_util.aws_auth_load_kubeconfig``.

    Raises:
      ValueError: If kfctl_path or app_path is empty.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(PatrickXYS): do we need to load kubeconfig again?

    if not kfctl_path:
        raise ValueError("kfctl_path is required")
    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_file = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_file)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    def _delete_once():
        util.run([kfctl_path, "delete", "-V", "-f", kfdef_file], cwd=app_path)

    # We see failures because delete operation will delete cert-manager and
    # knative-serving, and encounter timeout. To deal with this we do retries.
    # This has a potential downside of hiding errors that are fixed by retrying.
    delete_with_retries = retry(stop_max_delay=60 * 3 * 1000)(_delete_once)
    delete_with_retries()
def test_katib_is_ready(record_xml_attribute, namespace):
    """Wait for the Katib deployments to start.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    api_client = deploy_utils.create_k8s_client()
    util.load_kube_config()

    katib_deployments = (
        "katib-controller",
        "katib-mysql",
        "katib-db-manager",
        "katib-ui",
    )
    for deployment in katib_deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(api_client, namespace, deployment, 10)
def test_deploy(record_xml_attribute, deploy_name, namespace, model_dir,
                export_dir):
    """Deploy the ``../serving/GCS`` kustomize app and wait for its deployment.

    Downloads pinned kustomize and kubectl binaries into /usr/local/bin, sets
    the namespace and configmap literals with kustomize, applies the generated
    manifest and waits for the deployment named deploy_name.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      deploy_name: Value of the ``name`` literal; also the deployment waited on.
      namespace: Namespace set in the kustomization and used for the wait.
      model_dir: Value of the ``modelBasePath`` literal.
      export_dir: Value of the ``exportDir`` literal.
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy")
    util.maybe_activate_service_account()

    here = os.path.dirname(__file__)
    app_dir = os.path.abspath(os.path.join(here, "../serving/GCS"))
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    def run_in_app_dir(command):
        # Every command in this test runs from the kustomize app directory.
        util.run(command, cwd=app_dir)

    # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kustomize_url = ('https://github.com/kubernetes-sigs/kustomize/'
                     'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64')
    run_in_app_dir(['wget', '-q', '-O', '/usr/local/bin/kustomize',
                    kustomize_url])
    run_in_app_dir(['chmod', 'a+x', '/usr/local/bin/kustomize'])

    # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
    # Invalid object doesn't have additional properties ...
    kubectl_url = ('https://storage.googleapis.com/kubernetes-release/'
                   'release/v1.14.0/bin/linux/amd64/kubectl')
    run_in_app_dir(['wget', '-q', '-O', '/usr/local/bin/kubectl', kubectl_url])
    run_in_app_dir(['chmod', 'a+x', '/usr/local/bin/kubectl'])

    # Configure custom parameters using kustomize
    configmap = 'mnist-map-serving'

    def add_literal(literal):
        run_in_app_dir(['kustomize', 'edit', 'add', 'configmap', configmap,
                        literal])

    run_in_app_dir(['kustomize', 'edit', 'set', 'namespace', namespace])
    add_literal('--from-literal=name' + '=' + deploy_name)
    add_literal('--from-literal=modelBasePath=' + model_dir)
    add_literal('--from-literal=exportDir=' + export_dir)

    # Apply the components
    run_in_app_dir(['kustomize', 'build', app_dir, '-o', 'generated.yaml'])
    run_in_app_dir(['kubectl', 'apply', '-f', 'generated.yaml'])

    kube_config.load_kube_config()
    client = k8s_client.ApiClient()
    util.wait_for_deployment(client, namespace, deploy_name, timeout_minutes=4)
def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path, project,
                      cluster_deletion_script):
    """Delete the Kubeflow deployment and check its endpoint service is gone.

    When cluster_deletion_script is given it is run through bash and nothing
    else happens. Otherwise ``<app_path>/tmp.yaml`` is rewritten so every
    ``KfGcpPlugin`` has ``deleteStorage`` set, ``kfctl delete`` is run with
    retries, and the endpoint list of the project is checked.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Directory containing ``tmp.yaml``; its basename is used as the
        deployment name of the endpoint.
      project: Project used to build and look up the endpoint name.
      cluster_deletion_script: Optional shell snippet that replaces the kfctl
        based deletion.

    Raises:
      ValueError: If kfctl_path or app_path is empty, or a KfGcpPlugin entry
        has no ``spec``.
      AssertionError: If the endpoint service still exists after deletion.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_deletion_script:
        logging.info("cluster_deletion_script specified: %s",
                     cluster_deletion_script)
        util.run(["/bin/bash", "-c", cluster_deletion_script])
        return

    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_path)

    with open(kfdef_path) as f:
        # safe_load: yaml.load() without an explicit Loader is deprecated and
        # rejected by newer PyYAML releases. An empty file loads as None.
        kfdef = yaml.safe_load(f) or {}

    # "spec" / "plugins" may be present but null in the YAML.
    plugins = (kfdef.get("spec") or {}).get("plugins") or []
    for plugin in plugins:
        if plugin.get("kind", "") == "KfGcpPlugin":
            if "spec" not in plugin:
                # Format the message; passing the plugin as a second argument
                # to ValueError left the "%s" placeholder unexpanded.
                raise ValueError("Invalid GCP plugin spec %s" % str(plugin))
            plugin["spec"]["deleteStorage"] = True

    with open(kfdef_path, "w") as f:
        yaml.dump(kfdef, f)

    # We see failures because delete will try to update the IAM policy which only allows
    # 1 update at a time. To deal with this we do retries.
    # This has a potential downside of hiding errors that are fixed by retrying.
    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        util.run([kfctl_path, "delete", "-V", "-f", kfdef_path], cwd=app_path)

    run_delete()

    # Use services.list instead of services.get because error returned is not
    # 404, it's 403 which is confusing.
    name = os.path.basename(app_path)
    endpoint_name = "{deployment}.endpoints.{project}.cloud.goog".format(
        deployment=name, project=project)
    logging.info("Verify endpoint service is deleted: %s", endpoint_name)
    if endpoint_name in get_endpoints_list(project):
        msg = "Endpoint is not deleted: " + endpoint_name
        logging.error(msg)
        raise AssertionError(msg)

    logging.info("Verified endpoint service is deleted.")
def test_gcp_kf_admin_wi(record_xml_attribute, namespace, app_name, platform,
                         project):
    """Test that the kubeflow admin SA has proper workload identity binding.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      app_name: Deployment name; the admin service account is
        ``<app_name>-admin@<project>.iam.gserviceaccount.com``.
      platform: The test is skipped unless this is "gcp".
      project: GCP project that owns the service account.

    Raises:
      Exception: If the service account's IAM policy has no
        roles/iam.workloadIdentityUser binding.
      AssertionError: If an expected workload identity member is missing.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_gcp_kf_admin_wi")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

    # NOTE(review): the returned client is not used below; the call is kept
    # in case create_k8s_client has side effects - confirm and drop if not.
    deploy_utils.create_k8s_client()

    if platform != "gcp":
        # pytest.skip raises, so nothing after it runs.
        pytest.skip("Not running on GCP")

    cred = GoogleCredentials.get_application_default()
    # Create the Cloud IAM service object
    service = googleapiclient.discovery.build('iam', 'v1', credentials=cred)

    adminGcpSa = ('projects/%s/serviceAccounts/'
                  '%s-admin@%s.iam.gserviceaccount.com') % (
                      project, app_name, project)

    request = service.projects().serviceAccounts().getIamPolicy(
        resource=adminGcpSa)
    response = request.execute()

    # A policy without any binding has no "bindings" key; fall through to the
    # descriptive "missing" error below rather than failing with a KeyError.
    roleToMembers = {}
    for binding in response.get('bindings', []):
        roleToMembers[binding['role']] = set(binding['members'])

    workloadIdentityRole = 'roles/iam.workloadIdentityUser'
    if workloadIdentityRole not in roleToMembers:
        raise Exception("roles/iam.workloadIdentityUser missing in iam-policy of "
                        "service account %s" % adminGcpSa)

    account_str = "{project}.svc.id.goog[{namespace}/{account}]"
    # Expected workload identity users of the admin service account
    expected_wi_sa = [(namespace, "kf-admin"),
                      (namespace, "profiles-controller-service-account"),
                      ("istio-system", "kf-admin")]

    for sa_namespace, sa_account in expected_wi_sa:
        gcp_sa = account_str.format(project=project, namespace=sa_namespace,
                                    account=sa_account)
        error_message = ("GCP SA {0} missing workload identity binding for "
                         "{1}").format(adminGcpSa, gcp_sa)
        binding = "serviceAccount:" + gcp_sa
        assert binding in roleToMembers[workloadIdentityRole], error_message
def test_lint(record_xml_attribute, src_dir, rcfile):  # pylint: disable=redefined-outer-name
    """Run pylint on every ``*.py`` file under src_dir and fail on any error.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      src_dir: Root of the tree to lint; may be relative or absolute.
      rcfile: pylint rcfile. Defaults to ``.pylintrc`` in src_dir when empty.

    Raises:
      AssertionError: If pylint exits non-zero for at least one file.
    """
    # Override the classname attribute in the junit file.
    # This makes it easy to group related tests in test grid.
    # http://doc.pytest.org/en/latest/usage.html#record-xml-attribute
    util.set_pytest_junit(record_xml_attribute, "test_py_lint")

    logging.info('Running test_lint')
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    # Resolve once: the walk yields absolute paths and pylint runs with
    # cwd=src_dir, so relative paths built from src_dir would be wrong.
    abs_src_dir = os.path.abspath(src_dir)

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): We should make this an argument.
    dir_excludes = [
        "dashboard/frontend/node_modules",
        "kubeflow_testing",
        "dev-kubeflow-org/ks-app/vendor",
        "release-infra",
    ]
    full_dir_excludes = [os.path.join(abs_src_dir, f) for f in dir_excludes]

    # TODO(jlewi): Use pathlib once we switch to python3.
    includes = ["*.py"]
    failed_files = []
    if not rcfile:
        # Absolute, so it still resolves when pylint runs with cwd=src_dir.
        rcfile = os.path.join(abs_src_dir, ".pylintrc")

    for root, dirs, files in os.walk(abs_src_dir, topdown=True):
        # Exclude vendor directories and all sub files.
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        if ("vendor" in root.split(os.sep)
                or should_exclude(root, full_dir_excludes)):
            # With topdown=True, emptying dirs in place stops os.walk from
            # descending into the excluded tree at all.
            dirs[:] = []
            continue

        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rcfile, full_path],
                             cwd=src_dir)
                except subprocess.CalledProcessError:
                    # Report the path relative to the source root.
                    failed_files.append(
                        os.path.relpath(full_path, abs_src_dir))

    if failed_files:
        failed_files.sort()
        logging.error("%s files had lint errors:\n%s", len(failed_files),
                      "\n".join(failed_files))
    else:
        logging.info("No lint issues.")

    assert not failed_files
def test_jupyter(record_xml_attribute, env, namespace):
    """Test the jupyter notebook.

    Applies the ``jupyter`` ksonnet component, waits for the custom resource
    to report ``Running`` and requests the lab page through the API server
    proxy.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      env: ksonnet environment.
      namespace: namespace to run in.

    Raises:
      RuntimeError: If the proxied request does not return status OK.
    """
    util.set_pytest_junit(record_xml_attribute, "jupyter_test")

    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        logging.info("Activate service account")
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()
    host = api_client.configuration.host
    logging.info("Kubernetes master: %s", host)
    # Keep only what follows the last "/" of the configured host.
    master = host.rsplit("/", 1)[-1]

    app_dir = os.path.join(os.path.dirname(__file__), "test_app")
    ks_cmd = ks_util.get_ksonnet_cmd(app_dir)

    resource_name = "jupyter-test"
    service_name = "jupyter-test"
    component = "jupyter"
    ks_util.setup_ks_app(app_dir, env, namespace, component, "")

    util.run([ks_cmd, "apply", env, "-c", component], cwd=app_dir)

    results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
                                         namespace, resource_name, ["Running"])
    logging.info("Result of CRD:\n%s", results)

    # We proxy the request through the APIServer so that we can connect
    # from outside the cluster.
    url_template = (
        "https://{master}/api/v1/namespaces/{namespace}/services/{service}:80"
        "/proxy/default/jupyter/lab?")
    url = url_template.format(master=master, namespace=namespace,
                              service=service_name)
    logging.info("Request: %s", url)

    response = send_request(url, verify=False)
    if response.status_code == requests.codes.OK:
        return

    msg = "Request to {0} exited with status code: {1} and content: {2}".format(
        url, response.status_code, response.content)
    logging.error(msg)
    raise RuntimeError(msg)
def test_kfctl_delete(record_xml_attribute, cluster_deletion_script,
                      cluster_name):
    """Run the cluster deletion script, if one was given.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      cluster_deletion_script: Shell snippet run through ``/bin/bash -c``;
        nothing is run when it is empty.
      cluster_name: Exported to the script as the CLUSTER_NAME environment
        variable.
    """
    util.set_pytest_junit(record_xml_attribute, "test_cluster_delete")

    if not cluster_deletion_script:
        return

    logging.info("cluster_deletion_script specified: %s",
                 cluster_deletion_script)
    os.environ["CLUSTER_NAME"] = cluster_name
    util.run(["/bin/bash", "-c", cluster_deletion_script])
def test_gcp_access(record_xml_attribute, namespace, app_path, project):
    """Test that Kubeflow gcp was configured with workload_identity and GCP
    service account credentials.

    Nothing beyond the setup steps is checked unless
    ``get_platform_app_name(app_path)`` reports the "gcp" platform.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      app_path: Passed to ``get_platform_app_name`` to obtain the platform and
        the app name.
      project: GCP project that owns the service accounts.

    Raises:
      Exception: If the user service account's IAM policy lacks roles/owner,
        the admin account is not an owner, or
        roles/iam.workloadIdentityUser is missing.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_gcp_access")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

    api_client = deploy_utils.create_k8s_client()

    platform, app_name = get_platform_app_name(app_path)
    if platform != "gcp":
        return

    # check secret
    util.check_secret(api_client, namespace, "user-gcp-sa")

    cred = GoogleCredentials.get_application_default()
    # Create the Cloud IAM service object
    service = googleapiclient.discovery.build('iam', 'v1', credentials=cred)

    userSa = 'projects/%s/serviceAccounts/%s-user@%s.iam.gserviceaccount.com' % (
        project, app_name, project)
    adminSa = 'serviceAccount:%s-admin@%s.iam.gserviceaccount.com' % (
        app_name, project)

    request = service.projects().serviceAccounts().getIamPolicy(
        resource=userSa)
    response = request.execute()

    # A policy without any binding has no "bindings" key; fall through to the
    # descriptive errors below rather than failing with a KeyError.
    roleToMembers = {}
    for binding in response.get('bindings', []):
        roleToMembers[binding['role']] = set(binding['members'])

    if 'roles/owner' not in roleToMembers:
        raise Exception("roles/owner missing in iam-policy of %s" % userSa)

    if adminSa not in roleToMembers['roles/owner']:
        # "%v" is not a Python conversion; it made the formatting itself
        # raise ValueError instead of producing this message.
        raise Exception("Admin %s should be owner of user %s" %
                        (adminSa, userSa))

    workloadIdentityRole = 'roles/iam.workloadIdentityUser'
    if workloadIdentityRole not in roleToMembers:
        raise Exception(
            "roles/iam.workloadIdentityUser missing in iam-policy of %s" %
            userSa)
def test_mnist_gcp(record_xml_attribute, name, namespace,  # pylint: disable=too-many-branches,too-many-statements
                   repos, image):
    '''Generate a papermill job for the MPI operator notebook and submit it.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      name: Job name; when empty a name is generated from the current time and
        a short random suffix.
      namespace: Passed through to ``run_papermill_job``.
      repos: Passed through to ``run_papermill_job``.
      image: Passed through to ``run_papermill_job``.
    '''
    util.set_pytest_junit(record_xml_attribute, "test_mpioperator")

    if not name:
        timestamp = datetime.datetime.now().strftime("%H%M%S")
        suffix = uuid.uuid4().hex[0:3]
        name = "mpioperator_notebook-" + timestamp + "-" + suffix

    # The second call supplies the junit name after the first one above.
    util.set_pytest_junit(record_xml_attribute, "test_mpioperator_notebook")

    notebook = "kubeflow/mpi-operator/examples/v1alpha2/mpi_notebook.ipynb"
    nb_test_util.run_papermill_job(notebook, name, namespace, repos, image)
def test_build_kfctl_go(record_xml_attribute):
    """Test building of kfctl go.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        activate_cmd = ["gcloud", "auth", "activate-service-account",
                        "--key-file=" + key_file]
        util.run(activate_cmd)

    binary_path = kfctl_util.build_kfctl_go()
    logging.info("kfctl go binary path %s", binary_path)
def test_endpoint_is_ready(record_xml_attribute, project, app_name):
    """Check that the deployment's endpoint becomes ready.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      project: The gcp project that we deployed kubeflow
      app_name: The name of the kubeflow deployment

    Raises:
      Exception: If ``gcp_util.endpoint_is_ready`` reports not ready.
    """
    util.set_pytest_junit(record_xml_attribute, "test_endpoint_is_ready")

    # Owned by project kubeflow-ci-deployment.
    os.environ["CLIENT_ID"] = (
        "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g.apps.googleusercontent.com")

    endpoint_url = "https://{}.endpoints.{}.cloud.goog".format(app_name,
                                                               project)
    ready = gcp_util.endpoint_is_ready(endpoint_url, wait_min=25)
    if not ready:
        raise Exception("Endpoint not ready")
def test_kfctl_delete_wrong_cluster(record_xml_attribute, kfctl_path, app_path,
                                    project, cluster_deletion_script):
    """Check that ``kfctl delete`` refuses a KfDef naming a different cluster.

    ``<app_path>/tmp.yaml`` is temporarily rewritten with the cluster name
    "dummy", ``kfctl delete`` is run, and the original cluster name is always
    written back afterwards.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Directory containing ``tmp.yaml``; also the working directory
        of the delete command.
      project: Not used by this function body.
      cluster_deletion_script: Not used by this function body.

    Raises:
      ValueError: If kfctl_path or app_path is empty, or the KfDef has no
        ``metadata.clusterName``.
      subprocess.CalledProcessError: If kfctl fails for a reason other than
        the cluster name mismatch (after retries).
    """
    util.set_pytest_junit(record_xml_attribute,
                          "test_kfctl_delete_wrong_cluster")
    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    with open(kfdef_path, "r") as f:
        # An empty file loads as None; treat it as a KfDef without metadata.
        kfdef = yaml.safe_load(f) or {}

    # Strings are immutable, so keeping the reference is enough to restore it.
    cluster = (kfdef.get("metadata") or {}).get("clusterName", "")
    if not cluster:
        raise ValueError("cluster is not written to kfdef")

    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        try:
            # Put an obvious wrong cluster into KfDef
            kfdef["metadata"]["clusterName"] = "dummy"
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)
            util.run([
                kfctl_path, "delete", "--delete_storage", "-V", "-f",
                kfdef_path
            ], cwd=app_path)
            # NOTE(review): a delete that succeeds against the wrong cluster
            # currently passes this test silently - confirm that is intended.
        except subprocess.CalledProcessError as e:
            # CalledProcessError.output is None when nothing was captured and
            # bytes when captured in binary mode; normalize before searching.
            output = e.output or ""
            if isinstance(output, bytes):
                output = output.decode("utf-8", errors="replace")
            if "cluster name doesn't match" in output:
                return
            # Re-throw error if it's not expected.
            raise
        finally:
            # Restore the correct host info.
            kfdef["metadata"]["clusterName"] = cluster
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)

    run_delete()
def check_deployments_ready(record_xml_attribute, namespace, name, deployments,
                            cluster_name):
    """Wait for each of the given deployments to start.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      name: Name recorded for this check in the junit output.
      deployments: Iterable of deployment names to wait for.
      cluster_name: Passed to ``kfctl_aws_util.aws_auth_load_kubeconfig``.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, name)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    client = deploy_utils.create_k8s_client()

    for deployment in deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(client, namespace, deployment, 10)
def test_kfam(record_xml_attribute):
    """Create a profile through the kfam service and verify it exists.

    The request is sent with ``curl`` from inside the jupyter-web-app pod in
    the ``kubeflow`` namespace.

    Args:
      record_xml_attribute: Test fixture provided by pytest.

    Raises:
      AssertionError: If ``verify_profile_creation`` returns a falsy value.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
    util.load_kube_config()
    util.load_kube_credentials()

    getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
    lookup_output = util.run(getcmd.split(' '))
    # Drop the first and last character of the output; presumably the single
    # quotes of the template, which reach kubectl literally because no shell
    # is involved - verify against util.run.
    jupyterpod = lookup_output[1:-1]

    logging.info("accessing kfam svc from jupyter pod %s", jupyterpod)

    sleep(10)
    # Profile Creation
    profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
    payload = '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}' % profile_name
    create_cmd = ['kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--',
                  'curl', '--silent', '-X', 'POST', '-d', payload,
                  'profiles-kfam.kubeflow:8081/kfam/v1/profiles']
    util.run(create_cmd)

    assert verify_profile_creation(jupyterpod, profile_name)
def test_profiles(record_xml_attribute,
                  profileFile="profile_v1beta1_profile.yaml"):
    """Create a profile, verify what it created, then delete it.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      profileFile: Profile YAML file passed to ``createProfile``.
    """
    util.set_pytest_junit(record_xml_attribute, "test_profile_e2e")
    util.maybe_activate_service_account()

    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Profile Creation
    group, version, name = createProfile(api_client, profileFile)
    verifyProfileCreation(api_client, group, version, name)
    verifyNamespaceCreation(api_client, name)
    verifyServiceAccounts(api_client, name)
    verifyRolebindings(api_client, name)

    # Profile deletion
    deleteProfile(api_client, group, version, name)
    verifyProfileDeletion(api_client, group, version, name)
def test_run_notebook(
        record_xml_attribute, namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos, image, notebook_path):
    """Run a notebook through ``nb_test_util.run_papermill_job``.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: Passed through to ``run_papermill_job``.
      repos: Passed through to ``run_papermill_job``.
      image: Passed through to ``run_papermill_job``.
      notebook_path: Path of the notebook; its basename is used to derive the
        junit name and the job name.
    """
    notebook_name = os.path.basename(notebook_path).replace(
        ".ipynb", "").replace("_", "-")
    junit_name = "_".join(["test", notebook_name])
    # Set the junit name once; the original repeated this identical call
    # right before starting the job.
    util.set_pytest_junit(record_xml_attribute, junit_name)

    # Time stamp plus a short random suffix, joined onto the notebook name.
    name = "-".join([
        notebook_name,
        datetime.datetime.now().strftime("%H%M%S"),
        uuid.uuid4().hex[0:3],
    ])

    nb_test_util.run_papermill_job(notebook_path, name, namespace, repos,
                                   image)
def test_deploy_kfctl_go(record_xml_attribute, app_path, project,
                         use_basic_auth, use_istio, config_path, kfctl_path):
    """Test deploying Kubeflow.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Passed through to ``kfctl_deploy_kubeflow``.
      use_istio: Passed through to ``kfctl_deploy_kubeflow``.
      config_path: Passed through to ``kfctl_deploy_kubeflow``.
      kfctl_path: Passed through to ``kfctl_deploy_kubeflow``.
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    kfctl_util.kfctl_deploy_kubeflow(app_path, project, use_basic_auth,
                                     use_istio, config_path, kfctl_path)

    kfctl_util.verify_kubeconfig(app_path)
def test_create_cluster(record_xml_attribute, cluster_name, eks_cluster_version,
                        cluster_creation_script, values):
    """Test Create Cluster For E2E Test.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      cluster_name: Name of EKS cluster
      eks_cluster_version: Version of EKS cluster
      cluster_creation_script: script invoked to create a new cluster
      values: Comma separated list of ``key=value`` variables. They are parsed
        but not used afterwards in this function.
    """
    util.set_pytest_junit(record_xml_attribute, "test_create_cluster")

    if values:
        # NOTE(review): the parsed mapping is never read; parsing still
        # rejects malformed entries, so it is kept as is.
        path_vars = {}
        for entry in values.split(","):
            key, val = entry.split("=")
            path_vars[key] = val

    # Create EKS Cluster
    logging.info("Creating EKS Cluster")
    os.environ["CLUSTER_NAME"] = cluster_name
    os.environ["EKS_CLUSTER_VERSION"] = eks_cluster_version
    create_cmd = ["/bin/bash", "-c", cluster_creation_script]
    util.run(create_cmd)
def test_build_kfctl_go(record_xml_attribute, app_path, project, use_basic_auth,
                        use_istio, config_path, build_and_apply,
                        kfctl_repo_path, cluster_creation_script):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Whether to use basic_auth.
      use_istio: Whether to use Istio or not
      config_path: Path to the KFDef spec file.
      cluster_creation_script: script invoked to create a new cluster
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    # TODO(yanniszark): split this into a separate workflow step
    created_cluster = bool(cluster_creation_script)
    if created_cluster:
        logging.info("Cluster creation script specified: %s",
                     cluster_creation_script)
        util.run(["/bin/bash", "-c", cluster_creation_script])

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    kfctl_binary = kfctl_util.build_kfctl_go(kfctl_repo_path)
    app_path = kfctl_util.kfctl_deploy_kubeflow(
        app_path, project, use_basic_auth, use_istio, config_path,
        kfctl_binary, build_and_apply)

    # The kubeconfig check only runs when no creation script was given.
    if not created_cluster:
        kfctl_util.verify_kubeconfig(app_path)
def check_deployments_ready(record_xml_attribute, namespace, name, deployments):
    """Wait for each of the given deployments to start.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      name: Name recorded for this check in the junit output.
      deployments: Iterable of deployment names to wait for.
    """
    set_logging()
    # TODO(jlewi): Should we do this in the calling function)?
    util.set_pytest_junit(record_xml_attribute, name)

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    client = deploy_utils.create_k8s_client()
    util.load_kube_config()

    for deployment in deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(client, namespace, deployment, 10)
def test_build_kfctl_go(record_xml_attribute, config_path, kfctl_repo_path,
                        values):
    """Test building kfctl from the given repo.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      config_path: Path to the KFDef spec file. It is only substituted and
        logged here; the build step does not receive it.
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      values: Comma separated list of ``key=value`` variables to substitute
        into config_path

    Raises:
      ValueError: If an entry of ``values`` contains no "=".
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    if values:
        path_vars = {}
        for pair in values.split(","):
            # Split on the first "=" only so a value may itself contain "=".
            key, value = pair.split("=", 1)
            path_vars[key] = value

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_util.build_kfctl_go(kfctl_repo_path)
def test_training(
        record_xml_attribute, tfjob_name, namespace, trainer_image, num_ps,  # pylint: disable=too-many-arguments
        num_workers, train_steps, batch_size, learning_rate, model_dir,
        export_dir):
    """Create the training TFJob with kustomize and wait for it to finish.

    Downloads pinned kustomize and kubectl binaries into /usr/local/bin,
    configures the ``../training/GCS`` kustomize app, applies the generated
    manifest and waits for the job. When the job did not succeed, the status
    and the logs of its pods are logged.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      tfjob_name: Value of the ``name`` literal and the job that is waited on.
      namespace: Namespace set in the kustomization and used for lookups.
      trainer_image: Image set as ``training-image``.
      num_ps: Passed to ``../base/definition.sh --numPs``.
      num_workers: Passed to ``../base/definition.sh --numWorkers``.
      train_steps: Value of the ``trainSteps`` literal.
      batch_size: Value of the ``batchSize`` literal.
      learning_rate: Value of the ``learningRate`` literal.
      model_dir: Value of the ``modelDir`` literal.
      export_dir: Value of the ``exportDir`` literal.
    """
    util.set_pytest_junit(record_xml_attribute, "test_mnist")

    util.maybe_activate_service_account()

    app_dir = os.path.join(os.path.dirname(__file__), "../training/GCS")
    app_dir = os.path.abspath(app_dir)
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kustomize_url = ('https://github.com/kubernetes-sigs/kustomize/'
                     'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64')
    util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kustomize_url],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

    # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
    # Invalid object doesn't have additional properties ...
    kubectl_url = ('https://storage.googleapis.com/kubernetes-release/'
                   'release/v1.14.0/bin/linux/amd64/kubectl')
    util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kubectl_url],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

    # Configurate custom parameters using kustomize
    util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
    util.run(
        ['kustomize', 'edit', 'set', 'image',
         'training-image=' + trainer_image],
        cwd=app_dir)

    # str(): command arguments must be strings even when the caller supplies
    # numeric values.
    util.run(['../base/definition.sh', '--numPs', str(num_ps)], cwd=app_dir)
    util.run(['../base/definition.sh', '--numWorkers', str(num_workers)],
             cwd=app_dir)

    trainning_config = {
        "name": tfjob_name,
        "trainSteps": train_steps,
        "batchSize": batch_size,
        "learningRate": learning_rate,
        "modelDir": model_dir,
        "exportDir": export_dir,
    }

    configmap = 'mnist-map-training'
    for key, value in trainning_config.items():
        # str(): concatenating a non-string value would raise TypeError.
        util.run(
            ['kustomize', 'edit', 'add', 'configmap', configmap,
             '--from-literal=' + key + '=' + str(value)],
            cwd=app_dir)

    # Created the TFJobs.
    util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
             cwd=app_dir)
    util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
    logging.info("Created job %s in namespaces %s", tfjob_name, namespace)

    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
        api_client,
        namespace,
        tfjob_name,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, namespace, results)
    if creation_failures:
        logging.warning(creation_failures)

    if tf_job_client.job_succeeded(results):
        return

    failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
        tfjob_name, namespace, results.get("status", {}))
    logging.error(failure)

    # if the TFJob failed, print out the pod logs for debugging.
    pod_names = tf_job_client.get_pod_names(api_client, namespace, tfjob_name)
    logging.info("The Pods name:\n %s", pod_names)

    core_api = k8s_client.CoreV1Api(api_client)

    for pod in pod_names:
        logging.info("Getting logs of Pod %s.", pod)
        try:
            pod_logs = core_api.read_namespaced_pod_log(pod, namespace)
            logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
        except k8s_client.rest.ApiException as e:
            logging.info(
                "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
                e)
    # NOTE(review): a failed job is only logged here, so the test still
    # passes; behaviour kept from the original - confirm this is intended.
def test_build_kfctl_go(record_xml_attribute, app_name, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script, self_signed_cert, values):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit together with
        the test name "test_build_kfctl_go".
      app_name: kubeflow deployment name.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Whether to use basic_auth.
      use_istio: Whether to use Istio or not
      config_path: Path to the KFDef spec file. May contain ``{key}``
        placeholders that are filled in from ``values``.
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      cluster_creation_script: script invoked to create a new cluster
      self_signed_cert: whether to use self-signed cert for ingress.
      values: Comma separated list of ``key=value`` variables to substitute
        into config_path. Only the first ``=`` of each item separates key
        from value, so values may themselves contain ``=``.

    Raises:
      ValueError: If an item in ``values`` contains no ``=``.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"],
        ])

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_creation_script:
        logging.info("Cluster creation script specified: %s",
                     cluster_creation_script)
        util.run(["/bin/bash", "-c", cluster_creation_script])

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    if values:
        # maxsplit=1 keeps any further "=" characters as part of the value.
        path_vars = dict(pair.split("=", 1) for pair in values.split(","))
        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
    app_path = kfctl_util.kfctl_deploy_kubeflow(
        app_path, project, use_basic_auth, use_istio, config_path,
        kfctl_path, build_and_apply)

    if not cluster_creation_script:
        kfctl_util.verify_kubeconfig(app_path)

    # Use self-signed cert for testing to prevent quota limiting.
    if self_signed_cert:
        logging.info("Configuring self signed certificate")
        util.load_kube_credentials()
        api_client = k8s_client.ApiClient()
        ingress_namespace = "istio-system"
        ingress_name = "envoy-ingress"
        tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, project)
        logging.info("Configuring self signed cert for %s", tls_endpoint)
        util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                         tls_endpoint, api_client)
def test_xgboost_synthetic(  # pylint: disable=too-many-branches,too-many-statements
        record_xml_attribute, name, namespace, repos, image,
        notebook_artifacts_dir):
    """Create the Kubernetes Job described by job.yaml and wait for it.

    The job spec is read from ``job.yaml`` in the current working directory,
    patched with the checkout command, environment, image, name and namespace,
    and submitted through the batch API. After the job finishes, the
    ``notebook.html`` object is downloaded from the ``kubeflow-ci-deployment``
    bucket into ``notebook_artifacts_dir``.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit together with
        the test name "test_xgboost_synthetic".
      name: Job name; when empty a name of the form
        ``xgboost-test-<HHMMSS>-<3 hex chars>`` is generated.
      namespace: Namespace the job is created in.
      repos: Value for the ``--repos`` flag of checkout_repos.sh; when empty
        it is taken from argo_build_util.get_repo_from_prow_env().
      image: Image for the job's first container.
      notebook_artifacts_dir: Local directory (created if missing) that
        receives ``notebook.html``.

    Raises:
      RuntimeError: If the job has no conditions after waiting, or its last
        condition is not "Complete".
    """
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
    util.maybe_activate_service_account()

    # safe_load: yaml.load() without an explicit Loader is deprecated and is
    # an error on PyYAML 6; a plain manifest needs no custom tags.
    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    pod_spec = job["spec"]["template"]["spec"]

    # We need to checkout the correct version of the code in presubmits and
    # postsubmits, so fall back to the prow environment variables. See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    # kubeflow/testing is always checked out: PYTHONPATH below points into it.
    repos += ",kubeflow/testing@HEAD"
    logging.info("Repos set to %s", repos)
    pod_spec["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    nb_bucket = "kubeflow-ci-deployment"
    nb_path = os.path.join("xgboost_synthetic_testing",
                           os.getenv("JOB_TYPE"),
                           os.getenv("HOSTNAME"),
                           "notebook.html")
    output_gcs = util.to_gcs_uri(nb_bucket, nb_path)
    logging.info("Tested notebook will be outputed to: %s", output_gcs)

    pod_spec["containers"][0]["env"] = [
        {"name": "PYTHONPATH", "value": "/src/kubeflow/testing/py"},
        {"name": "OUTPUT_GCS", "value": output_gcs},
    ]
    pod_spec["containers"][0]["image"] = image

    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S")
                                   + "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client, namespace, name,
                                  timeout=datetime.timedelta(minutes=30))
    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    # Download notebook html to artifacts. This happens before the status
    # check so the notebook is fetched for failed jobs too.
    notebook_artifacts_path = os.path.join(notebook_artifacts_dir,
                                           "notebook.html")
    logging.info("Writing notebook artifact to: %s", notebook_artifacts_path)
    os.makedirs(notebook_artifacts_dir, exist_ok=True)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(nb_bucket)
    blob = bucket.get_blob(nb_path)
    blob.download_to_filename(notebook_artifacts_path)

    if last_condition.type != "Complete":
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

    Waits, in order, for the core deployments, the ingress-related
    deployments, the stateful sets and finally the knative deployments.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit together with
        the test name "test_kf_is_ready".
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: On the "gcp" platform, selects waiting for
        basic-auth-login instead of iap-enabler.
      use_istio: When truthy, ingress-related workloads are looked up in
        "istio-system" instead of ``namespace``.
      app_path: Passed to get_platform_app_name to determine the platform.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    # Need to activate account for scopes.
    credentials_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if credentials_file:
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + credentials_file,
        ])

    api_client = deploy_utils.create_k8s_client()
    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of
    # components.
    core_deployments = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]
    core_stateful_sets = []

    platform, _ = get_platform_app_name(app_path)

    ingress_deployments = [
        "istio-egressgateway",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "prometheus",
    ]
    ingress_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_deployments = ["activator", "autoscaler", "controller"]

    if platform == "gcp":
        core_deployments.append("cloud-endpoints-controller")
        core_stateful_sets.append("kfserving-controller-manager")
        if use_basic_auth:
            core_deployments.append("basic-auth-login")
        else:
            ingress_deployments.append("iap-enabler")
        # Expected with either auth mode.
        ingress_stateful_sets.append("backend-updater")
    elif platform == "existing_arrikto":
        core_deployments.append("dex")
        ingress_deployments.append("authservice")
        # No knative deployments are checked on this platform.
        knative_deployments = []

    ingress_namespace = "istio-system" if use_istio else namespace

    # TODO(jlewi): Might want to parallelize this.
    # NOTE(review): the trailing 10 is presumably a timeout in minutes;
    # confirm against util.wait_for_deployment.
    deployment_groups = (
        (namespace, core_deployments),
        (ingress_namespace, ingress_deployments),
    )
    for target_namespace, names in deployment_groups:
        for deployment in names:
            logging.info("Verifying that deployment %s started...",
                         deployment)
            util.wait_for_deployment(api_client, target_namespace,
                                     deployment, 10)

    stateful_set_targets = [(namespace, ss) for ss in core_stateful_sets]
    stateful_set_targets += [(ingress_namespace, ss)
                             for ss in ingress_stateful_sets]
    for ss_namespace, ss_name in stateful_set_targets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, ss_name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, ss_name)
        except BaseException:
            # Collect debug information by running describe, then let the
            # original error propagate.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets",
                ss_name,
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.
    for deployment in knative_deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(api_client, knative_namespace, deployment,
                                 10)