コード例 #1
0
def test(args):
    """Run the helm test suite for the tf-job chart and emit a junit report.

    Args:
      args: Parsed command line arguments; uses project, zone, cluster and
        junit_path.
    """
    gcs_client = storage.Client(project=args.project)
    util.configure_kubectl(args.project, args.zone, args.cluster)

    case = test_util.TestCase()
    start = time.time()
    try:
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        case.failure = "helm test failed;\n" + (e.output or "")
        # Reraise the exception so that the prow job will fail and the test
        # is marked as a failure.
        # TODO(jlewi): It would be better to this wholistically; e.g. by
        # processing all the junit xml files and checking for any failures. This
        # should be more tractable when we migrate off Airflow to Argo.
        raise
    finally:
        # Always record timing and upload results, even when the test failed.
        case.time = time.time() - start
        case.name = "e2e-test"
        case.class_name = "GKE"
        test_util.create_junit_xml_file([case], args.junit_path, gcs_client)
コード例 #2
0
def run_test(args):
    """Run a single TfJob test case defined by a jinja2 spec template.

    Renders the template at args.spec with args.image_tag, submits the
    resulting TfJob, waits for it to finish, and records the outcome as a
    junit XML file (when args.junit_path is set).

    Args:
      args: Parsed command line arguments; uses project, zone, cluster,
        spec, image_tag and junit_path.

    Raises:
      ValueError: If --image_tag was not provided.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(args.spec)

    loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

    if not args.image_tag:
        raise ValueError("--image_tag must be provided.")

    logging.info("Loading spec from %s with image_tag=%s", args.spec,
                 args.image_tag)
    spec_contents = jinja2.Environment(loader=loader).get_template(
        os.path.basename(args.spec)).render(image_tag=args.image_tag)

    # safe_load refuses arbitrary python-object tags and avoids the
    # yaml.load-without-Loader deprecation; the spec is plain YAML.
    spec = yaml.safe_load(spec_contents)

    # Make the job name unique so reruns don't collide with earlier jobs.
    spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]

    # Initialize before the try block so the except handler can't hit a
    # NameError if job creation fails before these are assigned.
    name = None
    namespace = None
    start = time.time()
    try:
        api_response = tf_job_client.create_tf_job(api_client, spec)
        namespace = api_response["metadata"]["namespace"]
        name = api_response["metadata"]["name"]

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results["status"]["state"] != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace, results["status"]["state"])

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if its part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    finally:
        # Always record timing and (optionally) upload the junit results.
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #3
0
def test(args):
    """Run the helm tests for the tf-job chart and emit a junit XML report.

    Args:
      args: Parsed command line arguments; uses project, zone, cluster and
        junit_path.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)

    t = test_util.TestCase()
    start = time.time()
    try:
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        # e.output is None when the child's output wasn't captured; guard it
        # so building the failure message can't raise a TypeError of its own.
        t.failure = "helm test failed;\n" + (e.output or "")
    finally:
        # Always record timing and upload results, even on failure.
        t.time = time.time() - start
        t.name = "e2e-test"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #4
0
ファイル: test_deploy.py プロジェクト: zomglings/kubeflow
    def run():
        """Deploy Kubeflow via ksonnet into the test namespace and verify it.

        NOTE(review): this is a closure — api_client, namespace_name and args
        come from the enclosing scope, which is outside this fragment; the
        step order below matters (init -> registry -> pkg -> generate -> apply).
        """
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        # Register the kubeflow package registry so packages can be installed.
        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        source = os.path.join(args.test_dir, "src", "kubeflow")
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)
コード例 #5
0
ファイル: test_deploy.py プロジェクト: zomglings/kubeflow
def setup(args):
    """Test deploying Kubeflow.

    Deploys Kubeflow with ksonnet into a uniquely named namespace (using a
    symlinked vendor dir so the local source is exercised), verifies the
    tf-job operator and JupyterHub come up, and always tears the namespace
    down again.  Results are written as junit XML to args.artifacts_dir.

    Args:
      args: Parsed command line arguments; uses cluster, project, zone,
        test_dir, github_token and artifacts_dir.
    """
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    # Label the run with a timestamp plus random salt so concurrent runs
    # get distinct namespaces.
    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        # Deploy Kubeflow into the test namespace and verify its components.
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        source = os.path.join(args.test_dir, "src", "kubeflow")
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            # Log the exception itself; e.message does not exist on Python 3
            # exceptions and raised an AttributeError in this handler.
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e)
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
コード例 #6
0
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a TfJob test via ksonnet, repeating it to check delete/recreate.

    Applies the ksonnet component in a fresh environment, waits for the job
    to succeed, validates pod/service creation events, deletes the job and
    repeats.  The outcome is recorded as a junit XML file.

    Args:
      args: Parsed command line arguments; uses project, zone, cluster,
        app_dir, component, params and junit_path.

    Raises:
      ValueError: If the params don't include a name and a namespace.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    # Extract name/namespace from the params while setting every param on
    # the new environment.
    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                status_callback=tf_job_client.log_status)

            if results.get("status", {}).get("state",
                                             {}).lower() != "succeeded":
                t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                    trial, name, namespace,
                    results.get("status", {}).get("state", None))
                logging.error(t.failure)
                break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            # Check that the operator emitted a create event for every
            # expected pod and service.
            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            for replica in results.get("spec", {}).get("replicaSpecs", []):
                num_expected += replica.get("replicas", 0)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
                    trial, name, namespace, ", ".join(creation_failures))
                logging.error(t.failure)
                break
            pod_labels = get_labels(name, runtime_id)
            pod_selector = to_selector(pod_labels)

            wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

            tf_job_client.delete_tf_job(api_client, namespace, name)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.error(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status"
        # in an effort to try to nail down this exception we print out more
        # information about the exception.
        logging.error("There was a problem running the job; Exception %s", e)
        # Use str(e); Python 3 exceptions have no .message attribute, so the
        # previous e.message raised an AttributeError inside this handler.
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", str(e))
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want the test as failed.
        t.failure = ("Exception occured; type {0} message {1}".format(
            e.__class__, str(e)))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #7
0
ファイル: test_runner.py プロジェクト: jose5918/k8s
def run_test(args):
    """Run a TfJob test by applying a ksonnet component and awaiting success.

    Args:
      args: Parsed command line arguments; uses project, zone, cluster,
        app_dir, component, params and junit_path.

    Raises:
      ValueError: If the params don't include a name and a namespace.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    # Extract name/namespace from the params while setting every param on
    # the new environment.
    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results.get("status", {}).get("state", {}).lower() != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace,
                results.get("status", {}).get("state", None))

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if its part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status"
        # in an effort to try to nail down this exception we print out more
        # information about the exception.
        logging.error("There was a problem running the job; Exception %s", e)
        # Use str(e); Python 3 exceptions have no .message attribute, so the
        # previous e.message raised an AttributeError inside this handler.
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", str(e))
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want the test as failed.
        t.failure = ("Exception occured; type {0} message {1}".format(
            e.__class__, str(e)))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #8
0
def setup(args):
    """Test deploying Kubeflow.

    Deploys Kubeflow with ksonnet into a uniquely named namespace, verifies
    the tf-job operator and JupyterHub start, then always tears the namespace
    down.  Results are written as junit XML to args.artifacts_dir.

    Args:
      args: Parsed command line arguments; uses cluster, project, zone,
        test_dir, github_token and artifacts_dir.
    """
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    # Label the run with a timestamp plus random salt so concurrent runs
    # get distinct namespaces.
    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        # Deploy Kubeflow into the test namespace and verify its components.
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        # TODO(jlewi): In presubmits we probably want to change this so we can
        # pull the changes on a branch. Its not clear whether that's well supported
        # in Ksonnet yet.
        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        # TODO(jlewi): For presubmits how do we pull the package from the desired
        # branch at the desired commit.
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        # When running with a service account, impersonate its email so RBAC
        # rules are evaluated against the right identity.
        if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
            with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
                key = json.load(hf)
                apply_command.append("--as=" + key["client_email"])
        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            # Log the exception itself; e.message does not exist on Python 3
            # exceptions and raised an AttributeError in this handler.
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e)
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
コード例 #9
0
def setup(args):
    """Setup a GKE cluster for TensorFlow jobs.

    Creates the cluster, configures kubectl, optionally downloads the helm
    chart from GCS, installs the tf-job chart, and writes a junit XML result.

    Args:
      args: Command line arguments that control the setup process.
    """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    chart = args.chart
    machine_type = "n1-standard-8"

    # Create the GCS client up front: it is needed both for downloading a
    # gs:// chart and, unconditionally, for uploading the junit results in
    # the finally block below.  (Previously it was only created inside the
    # gs:// branch, so local chart paths hit a NameError in finally.)
    gcs_client = storage.Client(project=project)

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
            # TODO(jlewi): Stop pinning GKE version once 1.8 becomes the default.
            "initialClusterVersion": "1.8.1-gke.1",
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split(
                "=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount":
                accelerator_count,
                "acceleratorType":
                accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    util.setup_cluster(api_client)

    # Charts stored on GCS are first downloaded to a local temp file.
    if chart.startswith("gs://"):
        remote = chart
        chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
        bucket_name, path = util.split_gcs_uri(remote)

        bucket = gcs_client.get_bucket(bucket_name)
        blob = bucket.blob(path)
        logging.info("Downloading %s to %s", remote, chart)
        blob.download_to_filename(chart)

    t = test_util.TestCase()
    start = time.time()
    try:
        util.run([
            "helm", "install", chart, "-n", "tf-job", "--wait", "--replace",
            "--set", "rbac.install=true,cloud=gke"
        ])
    except subprocess.CalledProcessError as e:
        # e.output is None when the child's output wasn't captured; guard it
        # so building the failure message can't raise a TypeError of its own.
        t.failure = "helm install failed;\n" + (e.output or "")
    finally:
        # Always record timing and upload results, even on failure.
        t.time = time.time() - start
        t.name = "helm-tfjob-install"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #10
0
def setup(args):
    """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster (optionally with GPU accelerators), configures
  kubectl, deploys the kubeflow core component and waits for the tf-job
  operator, recording the outcome as junit XML.

  Args:
    args: Command line arguments that control the setup process.
  """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    # Request body for the GKE clusters.create API call.
    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        # Each accelerator spec is "type=count".
        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split(
                "=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount":
                accelerator_count,
                "acceleratorType":
                accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        # Parameters for the kubeflow core ksonnet component.
        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)
        # NOTE(review): the line below appears corrupted/redacted — the
        # "******" looks like a scrubbed credential and leaves the list
        # unterminated (a syntax error); the original presumably passed
        # "--user=" + account, closed the call, and then assigned
        # tf_job_deployment_name = "tf-job-operator".  Restore from upstream.
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user="******"tf-job-operator"
        logging.info("Verifying TfJob controller started.")

        # TODO(jlewi): We should verify the image of the operator is the correct.
        util.wait_for_deployment(api_client, args.namespace,
                                 tf_job_deployment_name)

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        # NOTE(review): e.message does not exist on Python 3 exceptions —
        # confirm target runtime or switch to str(e).
        t.failure = e.message
        raise
    finally:
        # Always record timing and upload results, even on failure.
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
コード例 #11
0
def run(args, file_handler):
  """Submit the Argo test workflow and wait for it to finish.

  Configures gcloud/kubectl for the target cluster, derives a unique
  workflow name from the prow environment variables, sets the ksonnet
  parameters for the workflow component, applies it, and blocks until
  the Argo workflow reaches a terminal phase. Started/finished marker
  files are written to GCS and the build log is uploaded at the end.

  Args:
    args: Parsed command line arguments; uses bucket, project, zone and
      cluster.
    file_handler: logging.FileHandler whose underlying file is flushed
      and uploaded to GCS as build-log.txt when the run finishes.

  Returns:
    bool: True iff the workflow finished in phase "Succeeded".

  Raises:
    ValueError: If the environment variable JOB_NAME is not set.
  """
  src_dir = _get_src_dir()
  logging.info("Source directory: %s", src_dir)
  # The ksonnet app containing the workflow component lives in test-infra.
  app_dir = os.path.join(src_dir, "test-infra")

  create_started_file(args.bucket)

  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
              os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  # Create the name for the workflow.
  workflow_name = os.getenv("JOB_NAME")
  if not workflow_name:
    # Fail fast with a clear error; otherwise the += below raises an
    # opaque "unsupported operand type" TypeError because os.getenv
    # returns None when JOB_NAME is unset.
    raise ValueError("The JOB_NAME environment variable must be set.")
  job_type = os.getenv("JOB_TYPE")
  if job_type == "presubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
  elif job_type == "postsubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA"))

  workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

  # Add some salt. This is mostly a convenience for the case where you
  # are submitting jobs manually for testing/debugging. Since the prow should
  # vend unique build numbers for each job.
  workflow_name += "-{0}".format(uuid.uuid4().hex[0:4])

  util.run(["ks", "param", "set", "workflows", "name", workflow_name],
           cwd=app_dir)
  # NOTE(review): load_kube_config was already called above; presumably
  # idempotent -- confirm before removing either call.
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  # Forward the prow environment variables to the workflow; only
  # variables that are actually set are included.
  names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
           "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
           "REPO_NAME"]
  names.sort()
  prow_env = ["{0}={1}".format(v, os.getenv(v)) for v in names if os.getenv(v)]

  util.run(["ks", "param", "set", COMPONENT, "prow_env", ",".join(prow_env)],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "namespace", NAMESPACE],
           cwd=app_dir)
  util.run(["ks", "param", "set", COMPONENT, "bucket", args.bucket],
           cwd=app_dir)

  # For debugging print out the manifest
  util.run(["ks", "show", "prow", "-c", COMPONENT], cwd=app_dir)
  util.run(["ks", "apply", "prow", "-c", COMPONENT], cwd=app_dir)

  success = False
  try:
    results = argo_client.wait_for_workflow(
        api_client, NAMESPACE, workflow_name,
        status_callback=argo_client.log_status)
    if results["status"]["phase"] == "Succeeded":
      success = True
    logging.info("Workflow %s/%s finished phase: %s", NAMESPACE, workflow_name,
                 results["status"]["phase"] )
  except util.TimeoutError:
    success = False
    logging.error("Time out waiting for Workflow %s/%s to finish", NAMESPACE, workflow_name)
  finally:
    create_finished_file(args.bucket, success)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
コード例 #12
0
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a TFJob test.

    Creates a fresh ksonnet environment, applies the TFJob component,
    waits for the job to reach a terminal state, verifies that pods were
    created for it, then deletes the job and verifies its pods are
    garbage collected. The create/verify/delete cycle is repeated so
    that re-creating a job with the same name is exercised. The outcome
    is recorded as a junit TestCase and, if args.junit_path is set,
    written out via test_util (using the GCS client).

    Args:
        args: Parsed command line arguments; uses project, cluster, zone,
            app_dir, component, params and junit_path.

    Raises:
        ValueError: If the "name" or "namespace" parameter is missing
            from args.params.
    """
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # Point kubectl/kubeconfig at the target GKE cluster.
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    # Client object for talking to the K8s API server.
    api_client = k8s_client.ApiClient()

    # Short random suffix so concurrent runs get distinct environments.
    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    # args.params is a comma separated list of k=v pairs. Every pair is
    # applied to the ksonnet component; "name" and "namespace" are also
    # captured locally so the job can be monitored and deleted below.
    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            # Block until the TFJob reaches a terminal state.
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                status_callback=tf_job_client.log_status)

            if results.get("status", {}).get("state",
                                             {}).lower() != "succeeded":
                t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                    trial, name, namespace,
                    results.get("status", {}).get("state", None))
                logging.error(t.failure)
                break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            # Verify that pods matching the job's label selector exist.
            # TODO(jlewi): We should check that pods were created for each replica
            pod_labels = get_labels(name, runtime_id)
            pod_selector = to_selector(pod_labels)
            pods = list_pods(api_client, namespace, pod_selector)

            logging.info("Trial %s selector: %s matched %s pods", trial,
                         pod_selector, len(pods.items))

            if not pods.items:
                t.failure = (
                    "Trial {0} Job {1} in namespace {2} no pods found for "
                    " selector {3}").format(trial, name, namespace,
                                            pod_selector)
                logging.error(t.failure)
                break

            tf_job_client.delete_tf_job(api_client, namespace, name)

            # Block until the TFJob object itself is gone.
            wait_for_delete(api_client,
                            namespace,
                            name,
                            status_callback=tf_job_client.log_status)

            # Verify the pods have been deleted. tf_job_client uses foreground
            # deletion so there shouldn't be any resources for the job left
            # once the job is gone.
            pods = list_pods(api_client, namespace, pod_selector)

            logging.info("Trial %s selector: %s matched %s pods", trial,
                         pod_selector, len(pods.items))

            if pods.items:
                t.failure = (
                    "Trial {0} Job {1} in namespace {2} pods found for "
                    " selector {3}; pods\n{4}").format(trial, name, namespace,
                                                       pod_selector, pods)
                logging.error(t.failure)
                break

            logging.info("Trial %s all pods deleted.", trial)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.error(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status"
        # in an effort to try to nail down this exception we print out more
        # information about the exception.
        logging.error("There was a problem running the job; Exception %s", e)
        # NOTE(review): e.message is Python 2 only; on Python 3 this
        # attribute access itself raises AttributeError -- consider str(e).
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want the test as failed.
        t.failure = ("Exception occured; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        # Always record the elapsed time and emit the junit result.
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)