Example #1
def test_katib_is_ready(record_xml_attribute, namespace):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    deployment_names = [
        "katib-controller",
        "katib-mysql",
        "katib-db-manager",
        "katib-ui",
    ]
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)
Example #2
def deploy_model(args):
    """Deploy a TF model using the TF serving component."""
    api_client = create_k8s_client(args)
    app_dir = setup_kubeflow_ks_app(args, api_client)

    component = "modelServer"
    logging.info("Deploying tf-serving.")
    generate_command = ["ks", "generate", "tf-serving", component]

    util.run(generate_command, cwd=app_dir)

    params = {}
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        params[k] = v

    if "namespace" not in params:
        raise ValueError("namespace must be supplied via --params.")
    namespace = params["namespace"]

    ks_deploy(app_dir, component, params, env=None, account=None)

    core_api = k8s_client.CoreV1Api(api_client)
    deploy = core_api.read_namespaced_service(args.deploy_name, args.namespace)
    cluster_ip = deploy.spec.cluster_ip

    if not cluster_ip:
        raise ValueError("inception service wasn't assigned a cluster ip.")
    util.wait_for_deployment(api_client,
                             namespace,
                             args.deploy_name + "-v1",
                             timeout_minutes=10)
    logging.info("Verified TF serving started.")
Example #3
def setup(args):
  """Test deploying Kubeflow."""
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)

  namespace = args.namespace
  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)
Example #4
def deploy_kubeflow(test_case):
    """Deploy Kubeflow."""
    args = parse_args()
    test_dir = test_case.test_suite.test_dir
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()
    app_dir = deploy_utils.setup_kubeflow_ks_app(test_dir, namespace,
                                                 args.github_token, api_client)

    # TODO(jlewi): We don't need to generate a core component if we are
    # just deploying TFServing. Might be better to refactor this code.
    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace
    ],
             cwd=app_dir)

    util.run([
        "ks", "generate", "pytorch-operator", "pytorch-operator",
        "--name=pytorch-operator", "--namespace=" + namespace
    ],
             cwd=app_dir)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
        "-c",
        "pytorch-operator",
    ]

    if args.as_gcloud_user:
        account = deploy_utils.get_gcp_identity()
        logging.info("Impersonate %s", account)

        # If we don't use --as to impersonate the service account then we
        # observe RBAC errors when doing certain operations. The problem appears
        # to be that we end up using the in cluster config (e.g. pod service account)
        # and not the GCP service account which has more privileges.
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyterhub_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace, jupyterhub_name)

    # Verify that PyTorch Operator actually deployed
    pytorch_operator_deployment_name = "pytorch-operator"
    logging.info("Verifying PyTorchJob controller started.")
    util.wait_for_deployment(api_client, namespace,
                             pytorch_operator_deployment_name)
Example #5
def test_kf_is_ready(namespace, use_basic_auth):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cert-manager",
        "cloud-endpoints-controller",
        "jupyter-web-app",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebooks-controller",
        "tf-job-operator",
        "profiles",
        "pytorch-operator",
        "studyjob-controller",
        "workflow-controller",
    ]

    stateful_sets = [
        "backend-updater",
    ]

    if use_basic_auth:
        deployment_names.extend(["basic-auth"])
    else:
        deployment_names.extend(["iap-enabler"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)

    for name in stateful_sets:
        logging.info("Verifying that statefulset %s started...", name)
        util.wait_for_statefulset(api_client, namespace, name)
Example #6
def test_deploy(record_xml_attribute, deploy_name, namespace, model_dir,
                export_dir):

    util.set_pytest_junit(record_xml_attribute, "test_deploy")

    util.maybe_activate_service_account()

    app_dir = os.path.join(os.path.dirname(__file__), "../serving/GCS")
    app_dir = os.path.abspath(app_dir)
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    # TODO (@jinchihe) Using kustomize 2.0.3 to work around the issue below:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
             'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kusUrl],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

    # TODO (@jinchihe): kubectl needs to be upgraded to 1.14.0 due to the issue below:
    # Invalid object doesn't have additional properties ...
    kusUrl = 'https://storage.googleapis.com/kubernetes-release/' \
             'release/v1.14.0/bin/linux/amd64/kubectl'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kusUrl],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

    # Configure custom parameters using kustomize
    configmap = 'mnist-map-serving'
    util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
    util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=name' + '=' + deploy_name
    ],
             cwd=app_dir)

    util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=modelBasePath=' + model_dir
    ],
             cwd=app_dir)
    util.run([
        'kustomize', 'edit', 'add', 'configmap', configmap,
        '--from-literal=exportDir=' + export_dir
    ],
             cwd=app_dir)

    # Apply the components
    util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
             cwd=app_dir)
    util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)

    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()
    util.wait_for_deployment(api_client,
                             namespace,
                             deploy_name,
                             timeout_minutes=4)
Example #7
def deploy_model(args):
    """Deploy a TF model using the TF serving component."""
    api_client = create_k8s_client(args)
    app_dir = setup_kubeflow_ks_app(args, api_client)

    logging.info("Deploying tf-serving.")
    params = {}
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k != "namespace":
            params[k] = v
        else:
            namespace = v

    if namespace is None:
        raise ValueError("namespace must be supplied in args.")

    # deployment component
    deployComponent = "modelServer"
    generate_command = [
        "ks", "generate", "tf-serving-deployment-gcp", deployComponent
    ]
    util.run(generate_command, cwd=app_dir)
    ks_deploy(app_dir,
              deployComponent,
              params,
              env=None,
              account=None,
              namespace=namespace)

    # service component
    serviceComponent = "modelServer-service"
    generate_command = [
        "ks", "generate", "tf-serving-service", serviceComponent
    ]
    util.run(generate_command, cwd=app_dir)
    ks_deploy(app_dir,
              serviceComponent,
              params,
              env=None,
              account=None,
              namespace=namespace)

    core_api = k8s_client.CoreV1Api(api_client)
    deploy = core_api.read_namespaced_service(args.deploy_name, args.namespace)
    cluster_ip = deploy.spec.cluster_ip

    if not cluster_ip:
        raise ValueError("inception service wasn't assigned a cluster ip.")
    util.wait_for_deployment(api_client,
                             namespace,
                             args.deploy_name,
                             timeout_minutes=10)
    logging.info("Verified TF serving started.")
Example #8
def deploy_kubeflow(test_case):
    """Deploy Kubeflow."""
    args = parse_args()
    test_dir = test_case.test_suite.test_dir
    src_root_dir = args.src_root_dir
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()
    app_dir = deploy_utils.setup_ks_app(test_dir, src_root_dir, namespace,
                                        args.github_token, api_client)

    # Deploy Kubeflow
    util.run(["ks", "generate", "tf-job-operator", "tf-job-operator"],
             cwd=app_dir)
    util.run(
        ["ks", "generate", "argo", "kubeflow-argo", "--name=kubeflow-argo"],
        cwd=app_dir)
    cmd = "ks param set tf-job-operator namespace " + namespace
    util.run(cmd.split(), cwd=app_dir)
    # cmd = "ks param set tf-job-operator tfJobImage \
    #         gcr.io/kubeflow-images-public/tf_operator:v20180522-77375baf"
    # util.run(cmd.split(), cwd=app_dir)
    cmd = "ks param set tf-job-operator tfJobVersion v1beta1"
    util.run(cmd.split(), cwd=app_dir)
    cmd = "ks param set kubeflow-argo namespace " + namespace
    util.run(cmd.split(), cwd=app_dir)
    apply_command = [
        "ks", "apply", "default", "-c", "tf-job-operator", "-c",
        "kubeflow-argo"
    ]
    if args.as_gcloud_user:
        account = deploy_utils.get_gcp_identity()
        logging.info("Impersonate %s", account)
        # If we don't use --as to impersonate the service account then we
        # observe RBAC errors when doing certain operations. The problem appears
        # to be that we end up using the in cluster config (e.g. pod service account)
        # and not the GCP service account which has more privileges.
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator-v1beta1"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

    # Verify that the Argo operator is deployed.
    argo_deployment_name = "workflow-controller"
    logging.info("Verifying Argo controller started.")
    util.wait_for_deployment(api_client, namespace, argo_deployment_name)

    # change the namespace to default to set up nfs-volume and nfs-server
    namespace = "default"

    deploy_utils.set_clusterrole(namespace)
Example #9
def deploy_kubeflow(test_case):  # pylint: disable=unused-argument
    """Deploy Kubeflow."""
    args = parse_args()
    src_root_dir = args.src_root_dir
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()

    manifest_repo_dir = path.join(src_root_dir, "kubeflow", "manifests")
    argo_manifest_dir = path.join(manifest_repo_dir, "argo", "base")
    tfoperator_manifest_dir = path.join(manifest_repo_dir, "tf-training",
                                        "tf-job-operator", "base")

    deploy_utils.setup_test(api_client, namespace)

    apply_args = "-f -"
    if args.as_gcloud_user:
        account = deploy_utils.get_gcp_identity()
        logging.info("Impersonate %s", account)
        # If we don't use --as to impersonate the service account then we
        # observe RBAC errors when doing certain operations. The problem appears
        # to be that we end up using the in cluster config (e.g. pod service account)
        # and not the GCP service account which has more privileges.
        apply_args = " ".join(["--as=" + account, apply_args])

    # Deploy argo
    logging.info("Deploying argo")
    util.run(["kustomize", "edit", "set", "namespace", namespace],
             cwd=argo_manifest_dir)
    util.run(["sh", "-c", "kustomize build | kubectl apply " + apply_args],
             cwd=argo_manifest_dir)

    # Deploy tf-job-operator
    logging.info("Deploying tf-job-operator")
    util.run(["kustomize", "edit", "set", "namespace", namespace],
             cwd=tfoperator_manifest_dir)
    util.run(["sh", "-c", "kustomize build | kubectl apply " + apply_args],
             cwd=tfoperator_manifest_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

    # Verify that the Argo operator is deployed.
    argo_deployment_name = "workflow-controller"
    logging.info("Verifying Argo controller started.")
    util.wait_for_deployment(api_client, namespace, argo_deployment_name)

    deploy_utils.set_clusterrole(namespace)
Example #10
def install_kubebench_nfs(api_client, app_dir, namespace):
    """Deploy required kubeflow packages to run benchmark"""
    util.run(["ks", "pkg", "install", "kubebench/kubebench-quickstarter"],
             cwd=app_dir)
    util.run([
        "ks", "generate", "kubebench-quickstarter-service",
        "kubebench-quickstarter-service"
    ],
             cwd=app_dir)
    util.run([
        "ks", "generate", "kubebench-quickstarter-volume",
        "kubebench-quickstarter-volume"
    ],
             cwd=app_dir)

    util.run([
        "ks", "param", "set", "kubebench-quickstarter-service", "namespace",
        namespace
    ],
             cwd=app_dir)
    util.run([
        "ks", "param", "set", "kubebench-quickstarter-volume", "namespace",
        namespace
    ],
             cwd=app_dir)

    apply_command = [
        "ks", "apply", "default", "-c", "kubebench-quickstarter-service"
    ]
    util.run(apply_command, cwd=app_dir)

    kubebench_nfs_deployment_name = "kubebench-nfs-deploy"
    kubebench_nfs_service_name = "kubebench-nfs-svc"
    logging.info("Verifying NFS deployment started")
    util.wait_for_deployment(api_client, namespace,
                             kubebench_nfs_deployment_name)

    service = get_k8s_service(api_client, namespace,
                              kubebench_nfs_service_name)
    util.run([
        "ks", "param", "set", "kubebench-quickstarter-volume", "nfsServiceIP",
        service.spec.cluster_ip
    ],
             cwd=app_dir)
    apply_command = [
        "ks", "apply", "default", "-c", "kubebench-quickstarter-volume"
    ]
    util.run(apply_command, cwd=app_dir)
Example #11
def check_deployments_ready(record_xml_attribute, namespace, name, deployments,
                            cluster_name):
    """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, name)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    for deployment_name in deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)
Example #12
    def test_wait_for_deployment(self):
        api_client = mock.MagicMock(spec=k8s_client.ApiClient)

        response = k8s_client.ExtensionsV1beta1Deployment()
        response.status = k8s_client.ExtensionsV1beta1DeploymentStatus()
        response.status.ready_replicas = 1
        api_client.call_api.return_value = response
        result = util.wait_for_deployment(api_client, "some-namespace",
                                          "some-deployment")
        self.assertIsNotNone(result)
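All of the snippets in this listing block on the same util.wait_for_deployment helper from kubeflow/testing, which Example #12 exercises with a mocked ApiClient. For context, a minimal sketch of how such a poller could look is shown below; the signature and the use of the ExtensionsV1beta1 API are assumptions inferred from how the examples call it and from the mock in Example #12, not the actual implementation.

import datetime
import logging
import time

from kubernetes import client as k8s_client


def wait_for_deployment(api_client, namespace, name, timeout_minutes=2,
                        replicas=1):
    """Poll a Deployment until enough replicas are ready or the timeout expires.

    Sketch only: the real kubeflow/testing helper may differ.
    """
    apps = k8s_client.ExtensionsV1beta1Api(api_client)
    end_time = (datetime.datetime.now() +
                datetime.timedelta(minutes=timeout_minutes))
    while datetime.datetime.now() < end_time:
        deploy = apps.read_namespaced_deployment(name, namespace)
        if deploy.status.ready_replicas and \
           deploy.status.ready_replicas >= replicas:
            logging.info("Deployment %s in namespace %s is ready", name,
                         namespace)
            return deploy
        logging.info("Waiting for deployment %s in namespace %s...", name,
                     namespace)
        time.sleep(10)
    raise RuntimeError("Timed out waiting for deployment %s in namespace %s" %
                       (name, namespace))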
Example #13
  def test_serve(self):
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    api_client = k8s_client.ApiClient()

    # Apply the components
    for component in ["mnist-deploy-gcp", "mnist-service"]:
      # Setup the ksonnet app
      ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                           self.params)

      util.run([self.ks_cmd, "apply", self.env, "-c", component],
               cwd=self.app_dir)

      logging.info("Created deployment %s in namespaces %s", self.name, self.namespace)

    util.wait_for_deployment(api_client, self.namespace, self.name,
                             timeout_minutes=4)
Example #14
def deploy_kubeflow(_):
    """Deploy Kubeflow."""
    args = parse_args()
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that Jupyter is actually deployed.
    jupyter_name = "jupyter"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace, jupyter_name)

    # Verify that core components are actually deployed.
    deployment_names = [
        "tf-job-operator-v1beta1", "pytorch-operator", "studyjob-controller"
    ]
    for deployment_name in deployment_names:
        logging.info("Verifying that %s started...", deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)
Example #15
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()
  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator-v1beta1"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that Jupyter is actually deployed.
  jupyter_name = "jupyter"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)

  # Verify that PyTorch Operator actually deployed
  pytorch_operator_deployment_name = "pytorch-operator"
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace, pytorch_operator_deployment_name)
Example #16
def check_deployments_ready(record_xml_attribute, namespace, name, deployments):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for deployment_name in deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)
Example #17
    def test_serve(self):
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        api_client = k8s_client.ApiClient()

        # Apply the components
        configmap = "mnist-map-serving"  # assumed: same configmap as the other mnist serving examples
        for pair in self.params.split(","):
            k, v = pair.split("=", 1)
            if k == "namespace":
                util.run(["kustomize", "edit", "set", k, v], cwd=self.app_dir)
            else:
                util.run([
                    "kustomize", "edit", "add", "configmap", configmap,
                    "--from-literal=" + k + "=" + v
                ],
                         cwd=self.app_dir)

        util.wait_for_deployment(api_client,
                                 self.namespace,
                                 self.name,
                                 timeout_minutes=4)
Example #18
    def test_serve(self):
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.
        api_client = k8s_client.ApiClient()

        # TODO (jinchihe) below code will be removed once new test-worker image
        # is published in https://github.com/kubeflow/testing/issues/373.
        kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
             'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
        util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl],
                 cwd=self.app_dir)
        util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'],
                 cwd=self.app_dir)

        # Apply the components
        configmap = 'mnist-map-serving'
        for pair in self.params.split(","):
            k, v = pair.split("=", 1)
            if k == "namespace":
                util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
            else:
                util.run([
                    'kustomize', 'edit', 'add', 'configmap', configmap,
                    '--from-literal=' + k + '=' + v
                ],
                         cwd=self.app_dir)

        # Seems the util.run cannot handle pipes case, using check_call.
        subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
        subprocess.check_call(subCmd, shell=True)

        util.wait_for_deployment(api_client,
                                 self.namespace,
                                 self.name,
                                 timeout_minutes=4)
Example #19
def wait_for_kubeflow_install(api_client, namespace):
  """Wait until kubeflow components are up."""
  # Verify that the Argo operator is deployed.
  argo_deployment_name = "workflow-controller"
  logging.info("Verifying Argo controller started.")
  util.wait_for_deployment(api_client, namespace, argo_deployment_name)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that the MPI operator is deployed.
  mpi_job_deployment_name = "mpi-operator"
  logging.info("Verifying MPIJob controller started.")
  util.wait_for_deployment(api_client, namespace, mpi_job_deployment_name)
Example #20
def setup(args):
    """Test deploying Kubeflow."""
    api_client = create_k8s_client(args)

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if args.github_token:
        logging.info("Setting GITHUB_TOKEN to %s.", args.github_token)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warn("GITHUB_TOKEN not set; you will probably hit Github API "
                     "limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ],
             cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

    if args.deploy_tf_serving:
        logging.info("Deploying tf-serving.")
        util.run([
            "ks", "generate", "tf-serving", "modelServer", "--name=inception",
            "--namespace=" + namespace.metadata.name,
            "--model_path=gs://kubeflow-models/inception",
            "--model_server_image=" + args.model_server_image
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "modelServer",
        ]
        util.run(apply_command, cwd=app_dir)

        core_api = k8s_client.CoreV1Api(api_client)
        deploy = core_api.read_namespaced_service("inception",
                                                  namespace.metadata.name)
        cluster_ip = deploy.spec.cluster_ip

        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 "inception")
        logging.info("Verified TF serving started.")
Example #21
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cloud-endpoints-controller",
        "jupyter-web-app-deployment",
        "metadata-db",
        "metadata-deployment",
        "metadata-ui",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebook-controller-deployment",
        "tf-job-operator",
        "pytorch-operator",
        "katib-controller",
        "workflow-controller",
    ]

    stateful_set_names = [
        "kfserving-controller-manager",
    ]

    ingress_related_deployments = []
    ingress_related_stateful_sets = []

    if use_basic_auth:
        deployment_names.extend(["basic-auth-login"])
        ingress_related_stateful_sets.extend(["backend-updater"])
    else:
        ingress_related_deployments.extend(["iap-enabler"])
        ingress_related_stateful_sets.extend(["backend-updater"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]
    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Example #22
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]

    stateful_set_names = []

    with open(os.path.join(app_path, "app.yaml")) as f:
        kfdef = yaml.safe_load(f)
    platform = kfdef["spec"]["platform"]

    ingress_related_deployments = [
        "istio-citadel",
        "istio-egressgateway",
        "istio-galley",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "kiali",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Example #23
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount":
        accelerator_count,
        "acceleratorType":
        accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion":  args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user="******"v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct one.
    util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #24
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Example #25
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth, app_path,
                     cluster_name):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    # Verify that components are actually deployed.
    deployment_names = []

    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    # TODO(PatrickXYS): not sure why istio-galley can't be found
    ingress_related_deployments = [
        "cluster-local-gateway",
        "istio-citadel",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "autoscaler-hpa",
        "controller",
        "networking-istio",
        "webhook",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []
    elif platform == "aws":
        # TODO(PatrickXYS): Extend List with AWS Deployment
        deployment_names.extend(["alb-ingress-controller"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system"
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    ingress_names = ["istio-ingress"]
    # Check if Ingress is Ready and Healthy
    if platform in ["aws"]:
        for ingress_name in ingress_names:
            logging.info("Verifying that ingress %s started...", ingress_name)
            util.wait_for_ingress(api_client, ingress_namespace, ingress_name,
                                  10)

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)

    # Check if Dex is Ready and Healthy
    dex_deployment_names = ["dex"]
    dex_namespace = "auth"
    for dex_deployment_name in dex_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     dex_deployment_name)
        util.wait_for_deployment(api_client, dex_namespace,
                                 dex_deployment_name, 10)

    # Check if Cert-Manager is Ready and Healthy
    cert_manager_deployment_names = [
        "cert-manager",
        "cert-manager-cainjector",
        "cert-manager-webhook",
    ]
    cert_manager_namespace = "cert-manager"
    for cert_manager_deployment_name in cert_manager_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     cert_manager_deployment_name)
        util.wait_for_deployment(api_client, cert_manager_namespace,
                                 cert_manager_deployment_name, 10)
Example #26
def setup_kubeflow(args):
    """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
    project = args.project
    cluster_name = args.cluster
    zone = args.zone

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        elif args.tf_job_version == "v1beta1":
            tf_job_deployment_name = "tf-job-operator-v1beta1"
        else:
            raise ValueError("Unrecognized value for tf_job_version %s" %
                             args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the correct
        # one.
        try:
            util.wait_for_deployment(api_client, args.namespace,
                                     tf_job_deployment_name)
        finally:
            # Run kubectl describe to get useful information about the deployment.
            # This will help troubleshoot any errors.
            util.run([
                "kubectl", "-n", args.namespace, "describe", "deploy",
                tf_job_deployment_name
            ])
            util.run([
                "kubectl", "-n", args.namespace, "describe", "pods", "-l",
                "name=tf-job-operator"
            ])

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #27
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    deployment_names = [
        "workflow-controller",
    ]

    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    ingress_related_deployments = [
        "istio-egressgateway",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)