def test_deploy_pytorchjob(record_xml_attribute, kfctl_repo_path, namespace):
  """Deploy a PyTorchJob and verify its master and worker pods exist.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    kfctl_repo_path: Path to the kubeflow/kfctl repo checkout.
    namespace: Namespace in which the PyTorchJob pods are expected.

  Raises:
    ValueError: If any of the expected pods is not found.
  """
  # Consistency fix: other tests in this file (e.g. test_kfam) register the
  # junit attribute; this one previously ignored the fixture entirely.
  util.set_pytest_junit(record_xml_attribute, "test_deploy_pytorchjob")
  util.load_kube_config()
  util.load_kube_credentials()
  logging.info("using kfctl repo: %s", kfctl_repo_path)
  util.run([
      "kubectl", "apply", "-f",
      os.path.join(
          kfctl_repo_path,
          "py/kubeflow/kfctl/testing/pytests/testdata/pytorch_job.yaml")
  ])
  api_client = k8s_client.ApiClient()
  api = k8s_client.CoreV1Api(api_client)

  # If the call throws exception, let it emit as an error case.
  resp = api.list_namespaced_pod(namespace)

  expected = {
      "pytorch-mnist-ddp-cpu-master-0",
      "pytorch-mnist-ddp-cpu-worker-0",
  }
  # Set arithmetic replaces the manual found-flags dict.
  found = {pod.metadata.name for pod in resp.items} & expected
  missing = expected - found
  if missing:
    raise ValueError("; ".join(
        "pod %s is not found" % n for n in sorted(missing)))
def test_kfam(record_xml_attribute):
  """End to end test of the kfam service: create a profile via the kfam API.

  The test execs a curl from inside the jupyter-web-app pod so the request
  originates inside the cluster.
  """
  util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
  util.load_kube_config()
  util.load_kube_credentials()

  # Look up the name of the jupyter-web-app pod; strip the surrounding quotes
  # emitted by the go template.
  pod_query = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
  pod_name = util.run(pod_query.split(' '))[1:-1]

  logging.info("accessing kfam svc from jupyter pod %s", pod_name)

  # Give the kfam service a moment to become reachable.
  sleep(10)

  # Profile Creation
  profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
  create_cmd = [
      'kubectl', 'exec', pod_name, '-n', 'kubeflow', '--',
      'curl', '--silent', '-X', 'POST', '-d',
      '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}' % profile_name,
      'profiles-kfam.kubeflow:8081/kfam/v1/profiles',
  ]
  util.run(create_cmd)

  assert verify_profile_creation(pod_name, profile_name)
def test_jupyter(record_xml_attribute, kfctl_repo_path, namespace):
  """Test the jupyter notebook.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    kfctl_repo_path: Path to the kubeflow/kfctl repo checkout.
    namespace: Namespace to run in.

  Raises:
    ValueError: If the jupyter-test service is not found in the namespace.
  """
  util.load_kube_config()
  util.load_kube_credentials()
  logging.info("using kfctl repo: %s", kfctl_repo_path)
  util.run([
      "kubectl", "apply", "-f",
      os.path.join(
          kfctl_repo_path,
          "py/kubeflow/kfctl/testing/pytests/testdata/jupyter_test.yaml")
  ])
  api_client = k8s_client.ApiClient()
  api = k8s_client.CoreV1Api(api_client)
  resp = api.list_namespaced_service(namespace)
  names = [service.metadata.name for service in resp.items]
  # Idiom fix: "x not in y" rather than "not x in y".
  if "jupyter-test" not in names:
    raise ValueError("not able to find jupyter-test service.")
def test_build_kfctl_go(record_xml_attribute, app_name, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script, self_signed_cert, values):
  """Test building and deploying Kubeflow.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    app_name: kubeflow deployment name.
    app_path: The path to the Kubeflow app.
    project: The GCP project to use.
    use_basic_auth: Whether to use basic_auth.
    use_istio: Whether to use Istio or not.
    config_path: Path to the KFDef spec file.
    build_and_apply: whether to build and apply or apply.
    kfctl_repo_path: path to the kubeflow/kfctl repo.
    cluster_creation_script: script invoked to create a new cluster.
    self_signed_cert: whether to use self-signed cert for ingress.
    values: Comma separated list of variables to substitute into config_path.
  """
  util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

  # Need to activate account for scopes.
  sa_key = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if sa_key:
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + sa_key
    ])

  # TODO(yanniszark): split this into a separate workflow step
  if cluster_creation_script:
    logging.info("Cluster creation script specified: %s",
                 cluster_creation_script)
    util.run(["/bin/bash", "-c", cluster_creation_script])

  logging.info("using kfctl repo: %s", kfctl_repo_path)

  if values:
    # Substitute "key=value" pairs from `values` into the config path.
    substitutions = {}
    for pair in values.split(","):
      key, value = pair.split("=")
      substitutions[key] = value
    config_path = config_path.format(**substitutions)
    logging.info("config_path after substitution: %s", config_path)

  kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
  app_path = kfctl_util.kfctl_deploy_kubeflow(app_path, project,
                                              use_basic_auth, use_istio,
                                              config_path, kfctl_path,
                                              build_and_apply)
  if not cluster_creation_script:
    kfctl_util.verify_kubeconfig(app_path)

  # Use self-signed cert for testing to prevent quota limiting.
  if self_signed_cert:
    logging.info("Configuring self signed certificate")
    util.load_kube_credentials()
    api_client = k8s_client.ApiClient()
    ingress_namespace = "istio-system"
    ingress_name = "envoy-ingress"
    tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, project)
    logging.info("Configuring self signed cert for %s", tls_endpoint)
    util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                     tls_endpoint, api_client)
def check_if_kfapp_exists(project, name, zone): # pylint: disable=too-many-branches
  """Check if a deployment with the specified name already exists.

  Args:
    project: GCP project containing the deployment.
    name: Deployment manager deployment name (the kfapp name).
    zone: Zone passed to `gcloud container clusters get-credentials`.

  Returns:
    True if the kfapp exists and appears to have finished setup (its
    ingress is present); False otherwise.

  Raises:
    ApiNotEnabledError: If the deployment manager API had to be enabled;
      the caller is expected to retry.
  """
  credentials = GoogleCredentials.get_application_default()
  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

  deployments_client = dm.deployments()
  enable_api = False
  try:
    deployments_client.get(project=project, deployment=name).execute()
  except errors.HttpError as e:
    if not e.content:
      raise
    error_content = json.loads(e.content)
    if error_content.get("error", {}).get("code", 0) == 404: # pylint: disable=no-else-return
      return False
    elif error_content.get("error", {}).get("code", 0) == 403:
      # We get a 403 if the deployment manager API isn't enabled
      logging.info("Fetching deployment %s in project %s returned error:\n%s",
                   name, project, error_content)
      enable_api = True
    else:
      raise

  if enable_api:
    logging.info("Enabling the deployment manager api.")
    util.run(["gcloud", "--project=" + project, "services", "enable",
              "deploymentmanager.googleapis.com"])
    logging.info("Api enabled; raising ApiNotEnabledError to force retry")
    raise ApiNotEnabledError

  # TODO(jlewi): It would be better to get the actual zone of the deployment
  util.run(["gcloud", "--project=" + project, "container", "clusters",
            "get-credentials", "--zone=" + zone, name])

  logging.info("Checking if project %s kfapp %s finished setup.",
               project, name)
  util.load_kube_credentials()

  # TODO(jlewi): This is a bit of a hack for v0.6. For v0.6 we check if the
  # ingress already exists and if it does we report it as true and otherwise
  # false. The reasoning is if the ingress doesn't exist we want to see
  # if we can fix/resume the deployment by running reapply
  # With v0.7 kfctl apply should be an idempotent operation so we can always
  # rerun apply; but with v0.6 rerunning apply if the ingress exists results
  # in an error.
  api_client = k8s_client.ApiClient()
  v1 = k8s_client.CoreV1Api(api_client)

  ingress_namespace = "istio-system"
  ingress_name = "envoy-ingress"
  extensions = k8s_client.ExtensionsV1beta1Api(api_client)

  missing_ingress = True
  try:
    logging.info("Trying to read ingress %s.%s", ingress_name,
                 ingress_namespace)
    extensions.read_namespaced_ingress(ingress_name, ingress_namespace)
    missing_ingress = False
    logging.info("Ingress %s.%s exists", ingress_name, ingress_namespace)
  except rest.ApiException as e:
    if e.status == 404:
      # missing_ingress is already True; just log it.
      logging.info("Project: %s, KFApp: %s is missing ingress %s.%s",
                   project, name, ingress_namespace, ingress_name)
    else:
      raise

  if missing_ingress:
    # Check if the service istio-ingressgateway already exists
    # if it does we need to delete it before rerunning apply.
    service_name = "istio-ingressgateway"
    # Bug fix: this branch runs only when the ingress is MISSING, but the old
    # message claimed the ingress "exists".
    logging.info("ingress %s.%s is missing; checking if service %s.%s exists",
                 ingress_namespace, ingress_name, ingress_namespace,
                 service_name)

    has_service = False
    try:
      v1.read_namespaced_service(service_name, ingress_namespace)
      has_service = True
    except rest.ApiException as e:
      if e.status == 404:
        logging.info("Project: %s, KFApp: %s is missing service %s.%s",
                     project, name, ingress_namespace, service_name)
      else:
        raise

    if has_service:
      logging.info("Deleting service: %s.%s", ingress_namespace, service_name)
      v1.delete_namespaced_service(service_name, ingress_namespace,
                                   body=k8s_client.V1DeleteOptions())
      logging.info("Deleted service: %s.%s", ingress_namespace, service_name)

    return False

  return True
def deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=None): # pylint: disable=too-many-branches
  """Deploy Kubeflow using kfctl go binary.

  Args:
    kfctl_path: Path to the kfctl binary.
    args: Parsed command line arguments; must provide kfctl_config, email,
      project, zone, setup_project and use_self_cert.
    app_dir: Directory to write the Kubeflow app to.
    env: Environment variables to run kfctl with.
    labels: Optional dict of labels to add to the KFDef metadata.

  Raises:
    ValueError: If no GCP account/email can be determined.
  """
  # username and password are passed as env vars and won't appear in the logs
  #
  # We need to edit and rewrite the config file to the app dir because
  # kfctl uses the path of the config file as the app dir.
  logging.warning("Loading configs %s.", args.kfctl_config)

  if args.kfctl_config.startswith("http"):
    response = requests.get(args.kfctl_config)
    raw_config = response.content
  else:
    with open(args.kfctl_config) as hf:
      raw_config = hf.read()

  # Bug fix: yaml.load without an explicit Loader is deprecated and can
  # construct arbitrary Python objects; the KFDef config is plain YAML so
  # safe_load is sufficient.
  config_spec = yaml.safe_load(raw_config)

  # We need to specify a valid email because
  # 1. We need to create appropriate RBAC rules to allow the current user
  #    to create the required K8s resources.
  # 2. Setting the IAM policy will fail if the email is invalid.
  email = args.email

  if not email:
    logging.info("email not set trying to get default from gcloud")
    email = util.run(["gcloud", "auth", "list", "--filter", "status:ACTIVE",
                      "--format", "value(account)"])

  if not email:
    raise ValueError("Could not determine GCP account being used.")

  kfdef_version = config_spec["apiVersion"].strip().lower()

  if kfdef_version == KFDEF_V1ALPHA1:
    config_spec = build_v06_spec(config_spec, args.project, email, args.zone,
                                 args.setup_project)
  else:
    config_spec = build_v07_spec(config_spec, args.project, email, args.zone,
                                 args.setup_project)

  config_spec["spec"] = util.filter_spartakus(config_spec["spec"])

  # Remove name because we will auto infer from directory.
  if "name" in config_spec["metadata"]:
    logging.info("Deleting name in kfdef spec.")
    del config_spec["metadata"]["name"]

  app_name = os.path.basename(app_dir)
  if "labels" not in config_spec["metadata"]:
    config_spec["metadata"]["labels"] = {}

  if labels:
    config_spec["metadata"]["labels"].update(labels)

  logging.info("KFDefSpec:\n%s", yaml.safe_dump(config_spec))

  if kfdef_version == KFDEF_V1ALPHA1:
    logging.info("Deploying using v06 syntax")
    # Bug fix: the format arguments were swapped; the message is
    # "deployment <name> ... in project <project>".
    logging.info("Checking if deployment %s already exists in project %s",
                 app_name, args.project)

    if check_if_kfapp_exists(args.project, app_name, args.zone):
      # With v0.6 kfctl can't successfully run apply a 2nd time so if
      # the deployment already exists we can't redeploy.
      logging.info("Deployment %s already exists in project %s; not "
                   "redeploying", app_name, args.project)
      return

    # Bug fix: NamedTemporaryFile defaults to binary mode; yaml.dump writes
    # str and would raise TypeError, so open the file in text mode.
    with tempfile.NamedTemporaryFile(mode="w", prefix="tmpkf_config",
                                     suffix=".yaml", delete=False) as hf:
      config_file = hf.name
      logging.info("Writing file %s", config_file)
      yaml.dump(config_spec, hf)

    util.run([kfctl_path, "init", app_dir, "-V", "--config=" + config_file],
             env=env)
    util.run([kfctl_path, "generate", "-V", "all"], env=env, cwd=app_dir)
    util.run([kfctl_path, "apply", "-V", "all"], env=env, cwd=app_dir)
  else:
    logging.info("Deploying using v07 syntax")

    if not os.path.exists(app_dir):
      logging.info("Creating app dir %s", app_dir)
      os.makedirs(app_dir)

    config_file = os.path.join(app_dir, "kf_config.yaml")
    with open(config_file, "w") as hf:
      logging.info("Writing file %s", config_file)
      yaml.dump(config_spec, hf)

    util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)

  # We will hit lets encrypt rate limiting with the managed certificates
  # So create a self signed certificate and update the ingress to use it.
  if args.use_self_cert:
    logging.info("Configuring self signed certificate")
    util.load_kube_credentials()
    api_client = k8s_client.ApiClient()
    ingress_namespace = "istio-system"
    ingress_name = "envoy-ingress"
    tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name,
                                                         args.project)
    logging.info("Configuring self signed cert for %s", tls_endpoint)
    util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                     tls_endpoint, api_client)
def delete(
    self, project_base_name, start_index, end_index, kfname, # pylint: disable=too-many-arguments
    job_file=None, output_dir=None, namespace=DEFAULT_NAMESPACE):
  """Fire off a bunch of K8s jobs to delete many Kubeflow instances.

  Args:
    project_base_name: The base name for the projects. Should end with "-"
    start_index: The start index
    end_index: The index non inclusive
    kfname: The name of the of the kubeflow app
    job_file: The path to the YAML file containing a K8s Job that serves
      as the template for the jobs to be launched.
    output_dir: Directory to write the job specs to.
    namespace: Namespace to create the jobs in.
  """
  util.load_kube_credentials()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  job_template = job_file or self._default_job_file()
  if not os.path.exists(job_template):
    raise ValueError("job file {0} does not exist".format(job_template))
  logging.info("Job file: %s", job_template)

  spec_dir = output_dir or tempfile.mkdtemp()
  logging.info("output_dir: %s", spec_dir)

  # Generate a common label for all the jobs. This way we can potentially
  # wait for all the jobs based on the label.
  batch_label = (datetime.datetime.now().strftime("%Y%m%d-%H%M%S-") +
                 uuid.uuid4().hex[0:4])

  for idx in range(start_index, end_index):
    target_project = "{0}{1}".format(project_base_name, idx)
    logging.info("Processing project=%s", target_project)

    job_spec = self._create_delete_job_spec(job_template, batch_label,
                                            target_project, kfname, namespace)

    spec_path = os.path.join(spec_dir,
                             "delete-{0}.yaml".format(target_project))
    logging.info("Writing job spec to %s", spec_path)
    with open(spec_path, "w") as hf:
      yaml.safe_dump(job_spec, hf)

    # submit the job
    logging.info("Creating job")
    created = batch_api.create_namespaced_job(
        job_spec["metadata"]["namespace"], job_spec)
    logging.info("Created job %s.%s:\n%s", created.metadata.namespace,
                 created.metadata.name, yaml.safe_dump(created.to_dict()))

  self.wait_for_jobs(namespace, "group=" + batch_label)