Ejemplo n.º 1
0
def setup(args):
  """Test deploying Kubeflow."""
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)

  namespace = args.namespace
  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Generate the core Kubeflow component.
  generate_command = [
      "ks", "generate", "core", "kubeflow-core",
      "--name=kubeflow-core", "--namespace=" + namespace,
  ]
  util.run(generate_command, cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, "tf-job-operator")

  # Verify that JupyterHub is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "tf-hub")
Ejemplo n.º 2
0
def check_statefulsets_ready(record_xml_attribute, namespace, name, stateful_sets):
  """Test that Kubeflow stateful sets are successfully deployed.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    namespace: The namespace to check.
    name: The name to report for this check in the junit output.
    stateful_sets: Iterable of stateful set names to wait for.

  Raises:
    Exception: If a stateful set does not become ready.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function)?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for set_name in stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", namespace,
                 set_name)
    try:
      util.wait_for_statefulset(api_client, namespace, set_name)
    except Exception as e:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
      # Collect debug information by running describe.
      util.run(["kubectl", "-n", namespace, "describe", "statefulsets",
                set_name])
      # Bug fix: the original message interpolated the junit `name` argument
      # instead of the stateful set that actually failed; also chain the
      # original exception so the root cause is preserved.
      raise Exception(
          f"Stateful set {namespace}.{set_name} is not ready") from e
Ejemplo n.º 3
0
def deploy_kubeflow(test_case):
    """Deploy Kubeflow."""
    args = parse_args()
    test_dir = test_case.test_suite.test_dir
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()
    app_dir = deploy_utils.setup_kubeflow_ks_app(test_dir, namespace,
                                                 args.github_token, api_client)

    # TODO(jlewi): We don't need to generate a core component if we are
    # just deploying TFServing. Might be better to refactor this code.
    # Generate the components to deploy.
    generate_commands = [
        ["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
         "--namespace=" + namespace],
        ["ks", "generate", "pytorch-operator", "pytorch-operator",
         "--name=pytorch-operator", "--namespace=" + namespace],
    ]
    for command in generate_commands:
        util.run(command, cwd=app_dir)

    apply_command = ["ks", "apply", "default",
                     "-c", "kubeflow-core",
                     "-c", "pytorch-operator"]

    if args.as_gcloud_user:
        account = deploy_utils.get_gcp_identity()
        logging.info("Impersonate %s", account)

        # If we don't use --as to impersonate the service account then we
        # observe RBAC errors when doing certain operations. The problem appears
        # to be that we end up using the in cluster config (e.g. pod service account)
        # and not the GCP service account which has more privileges.
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace, "tf-job-operator")

    # Verify that JupyterHub is actually deployed.
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace, "tf-hub")

    # Verify that PyTorch Operator actually deployed.
    logging.info("Verifying PyTorchJob controller started.")
    util.wait_for_deployment(api_client, namespace, "pytorch-operator")
Ejemplo n.º 4
0
def test_kf_is_ready(namespace, use_basic_auth):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cert-manager",
        "cloud-endpoints-controller",
        "jupyter-web-app",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebooks-controller",
        "tf-job-operator",
        "profiles",
        "pytorch-operator",
        "studyjob-controller",
        "workflow-controller",
    ]
    # The auth-specific deployment depends on whether basic auth or IAP is on.
    deployment_names.append("basic-auth" if use_basic_auth else "iap-enabler")

    stateful_sets = ["backend-updater"]

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)

    for name in stateful_sets:
        logging.info("Verifying that statefulset %s started...", name)
        util.wait_for_statefulset(api_client, namespace, name)
Ejemplo n.º 5
0
    def test_wait_for_statefulset(self):
        """wait_for_statefulset returns a result once replicas are ready."""
        api_client = mock.MagicMock(spec=k8s_client.ApiClient)

        # Simulate a stateful set whose single replica is ready.
        stateful_set = k8s_client.V1beta1StatefulSet()
        stateful_set.status = k8s_client.V1beta1StatefulSetStatus(
            ready_replicas=1, replicas=1)
        api_client.call_api.return_value = stateful_set

        result = util.wait_for_statefulset(api_client, "some-namespace",
                                           "some-set")
        self.assertIsNotNone(result)
Ejemplo n.º 6
0
def deploy_kubeflow(_):
    """Deploy Kubeflow.

    Waits for the Jupyter stateful set and the core controller deployments
    to become ready in the namespace from the parsed command line args.
    """
    args = parse_args()
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that Jupyter is actually deployed.
    jupyter_name = "jupyter"
    # Bug fix: the log message said "TfHub" although the check is for the
    # "jupyter" stateful set.
    logging.info("Verifying Jupyter started.")
    util.wait_for_statefulset(api_client, namespace, jupyter_name)

    # Verify that core components are actually deployed.
    deployment_names = [
        "tf-job-operator-v1beta1", "pytorch-operator", "studyjob-controller"
    ]
    for deployment_name in deployment_names:
        logging.info("Verifying that %s started...", deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)
Ejemplo n.º 7
0
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, "tf-job-operator-v1beta1")

  # Verify that Jupyter is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "jupyter")

  # Verify that PyTorch Operator actually deployed.
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace, "pytorch-operator")
Ejemplo n.º 8
0
def setup(args):
    """Test deploying Kubeflow.

    Creates a ksonnet app under args.test_dir, points its vendor directory
    at the checked-out Kubeflow source, deploys kubeflow-core and waits for
    the core workloads (TfJob operator, JupyterHub) to become ready.
    Optionally deploys a TF Serving model server.

    Args:
      args: Parsed command line arguments. Reads namespace, test_dir,
        github_token, cluster/project/zone, deploy_tf_serving and
        model_server_image.
    """
    api_client = create_k8s_client(args)

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if args.github_token:
        logging.info("Setting GITHUB_TOKEN to %s.", args.github_token)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        # logging.warn is deprecated; logging.warning is the supported API.
        logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                        "limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ],
             cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

    if args.deploy_tf_serving:
        logging.info("Deploying tf-serving.")
        util.run([
            "ks", "generate", "tf-serving", "modelServer", "--name=inception",
            "--namespace=" + namespace.metadata.name,
            "--model_path=gs://kubeflow-models/inception",
            "--model_server_image=" + args.model_server_image
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "modelServer",
        ]
        util.run(apply_command, cwd=app_dir)

        # Confirm the service exists; the read raises if it does not.
        # (The unused cluster_ip/deploy locals were removed.)
        core_api = k8s_client.CoreV1Api(api_client)
        core_api.read_namespaced_service("inception", namespace.metadata.name)

        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 "inception")
        logging.info("Verified TF serving started.")
Ejemplo n.º 9
0
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth instead of IAP.
    use_istio: Whether ingress components run in the istio-system namespace.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cloud-endpoints-controller",
        "jupyter-web-app-deployment",
        "metadata-db",
        "metadata-deployment",
        "metadata-ui",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebook-controller-deployment",
        "tf-job-operator",
        "pytorch-operator",
        "katib-controller",
        "workflow-controller",
    ]

    stateful_set_names = [
        "kfserving-controller-manager",
    ]

    ingress_related_deployments = []
    ingress_related_stateful_sets = []

    if use_basic_auth:
        deployment_names.extend(["basic-auth-login"])
        ingress_related_stateful_sets.extend(["backend-updater"])
    else:
        ingress_related_deployments.extend(["iap-enabler"])
        ingress_related_stateful_sets.extend(["backend-updater"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # propagate. Collect debug information by running describe,
            # then re-raise the original error.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]
    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Ejemplo n.º 10
0
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth instead of IAP.
    use_istio: Whether ingress components run in the istio-system namespace.
    app_path: Path to the kfctl app directory containing app.yaml.
  """
    set_logging()
    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]

    stateful_set_names = []

    # Determine the target platform from the KfDef spec in app.yaml.
    with open(os.path.join(app_path, "app.yaml")) as f:
        kfdef = yaml.safe_load(f)
    platform = kfdef["spec"]["platform"]

    ingress_related_deployments = [
        "istio-citadel",
        "istio-egressgateway",
        "istio-galley",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "kiali",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # propagate. Collect debug information by running describe,
            # then re-raise the original error.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Ejemplo n.º 11
0
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth instead of IAP.
    use_istio: Whether ingress components run in the istio-system namespace.
    app_path: Path to the kfctl app directory.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    deployment_names = [
        "workflow-controller",
    ]

    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    ingress_related_deployments = [
        "istio-egressgateway",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # propagate. Collect debug information by running describe,
            # then re-raise the original error.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Ejemplo n.º 12
0
  def run():
    """Set up a ksonnet app in args.test_dir and deploy kubeflow-core.

    Relies on `api_client`, `namespace_name` and `args` from the enclosing
    scope. Blocks until the TfJob operator deployment and the JupyterHub
    stateful set report ready.
    """
    # Create (or reuse) the test namespace.
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    # Register the Kubeflow ksonnet registry so packages can be installed.
    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Ejemplo n.º 13
0
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth, app_path,
                     cluster_name):
    """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth instead of IAP.
    app_path: Path to the kfctl app directory.
    cluster_name: Name of the EKS cluster used to load kubeconfig.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    # Verify that components are actually deployed.
    deployment_names = []

    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    # TODO(PatrickXYS): not sure why istio-galley can't found
    ingress_related_deployments = [
        "cluster-local-gateway",
        "istio-citadel",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "autoscaler-hpa",
        "controller",
        "networking-istio",
        "webhook",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []
    elif platform == "aws":
        # TODO(PatrickXYS): Extend List with AWS Deployment
        deployment_names.extend(["alb-ingress-controller"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system"
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # propagate. Collect debug information by running describe,
            # then re-raise the original error.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    ingress_names = ["istio-ingress"]
    # Check if Ingress is Ready and Healthy
    if platform in ["aws"]:
        for ingress_name in ingress_names:
            logging.info("Verifying that ingress %s started...", ingress_name)
            util.wait_for_ingress(api_client, ingress_namespace, ingress_name,
                                  10)

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)

    # Check if Dex is Ready and Healthy
    dex_deployment_names = ["dex"]
    dex_namespace = "auth"
    for dex_deployment_name in dex_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     dex_deployment_name)
        util.wait_for_deployment(api_client, dex_namespace,
                                 dex_deployment_name, 10)

    # Check if Cert-Manager is Ready and Healthy
    cert_manager_deployment_names = [
        "cert-manager",
        "cert-manager-cainjector",
        "cert-manager-webhook",
    ]
    cert_manager_namespace = "cert-manager"
    for cert_manager_deployment_name in cert_manager_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     cert_manager_deployment_name)
        util.wait_for_deployment(api_client, cert_manager_namespace,
                                 cert_manager_deployment_name, 10)
Ejemplo n.º 14
0
  def run():
    """Set up a ksonnet app in args.test_dir and deploy kubeflow-core.

    Relies on `api_client`, `namespace_name` and `args` from the enclosing
    scope. Blocks until the TfJob operator deployment and the JupyterHub
    stateful set report ready.
    """
    # Create (or reuse) the test namespace.
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    # Register the Kubeflow ksonnet registry so packages can be installed.
    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)