Example #1
def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.

    num_trials = args.num_trials
    logging.info("tfjob_version=%s", args.tfjob_version)

    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      test_func()

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should
    # run?
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    test_case.failure = "Timeout waiting for job to finish: " + spec
    logging.exception(test_case.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to nail down this exception we print out more information
    # about it.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We catch all exceptions so that the test is marked as failed.
    test_case.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, str(e)))
  finally:
    test_case.time = time.time() - start
    if args.artifacts_path:
      test_util.create_junit_xml_file(
        [test_case],
        args.artifacts_path + "/junit_" + test_func.__name__ + ".xml",
        gcs_client)
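The TODO near the top of this example suggests copying the kube config into the test directory and pointing KUBECONFIG at the copy, so that util.configure_kubectl doesn't have to be rerun on each step. A minimal sketch of that idea (pin_kubeconfig is a hypothetical helper, not part of the original code):

import os
import shutil

def pin_kubeconfig(test_dir):
  """Copy ~/.kube/config into test_dir and point KUBECONFIG at the copy."""
  src = os.path.expanduser("~/.kube/config")
  dst = os.path.join(test_dir, "kubeconfig")
  shutil.copyfile(src, dst)
  # Subsequent kubectl/client calls in this process pick up the pinned file.
  os.environ["KUBECONFIG"] = dst
  return dst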
Example #2
def setup(args):
  """Test deploying Kubeflow."""
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)

  namespace = args.namespace
  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)
Example #3
def get_gke_credentials(test_case):
    """Configure kubeconfig to talk to the supplied GKE cluster."""
    args = parse_args()
    util.maybe_activate_service_account()
    config_file = os.path.expanduser(kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
    logging.info("Using Kubernetes config file: %s", config_file)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s", cluster_name,
                 project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)

    # We want to modify the KUBECONFIG file to remove the gcloud commands
    # for any users that are authenticating using service accounts.
    # This will allow the script to be truly headless and not require gcloud.
    # More importantly, kubectl will properly attach auth.info scope so that
    # RBAC rules can be applied to the email and not the id.
    # See https://github.com/kubernetes/kubernetes/pull/58141
    #
    # TODO(jlewi): We might want to check GOOGLE_APPLICATION_CREDENTIALS
    # to see whether we are actually using a service account. If we aren't
    # using a service account then we might not want to delete the gcloud
    # commands.
    logging.info("Modifying kubeconfig %s", config_file)
    with open(config_file, "r") as hf:
        config = yaml.safe_load(hf)

    for user in config["users"]:
        auth_provider = user.get("user", {}).get("auth-provider", {})
        if auth_provider.get("name") != "gcp":
            continue
        logging.info("Modifying user %s which has gcp auth provider",
                     user["name"])
        if "config" in auth_provider:
            logging.info("Deleting config from user %s", user["name"])
            del auth_provider["config"]

            # This is a hack because the python client library will complain
            # about an invalid config if there is no config field.
            #
            # It looks like the code checks here but that doesn't seem to work
            # https://github.com/kubernetes-client/python-base/blob/master/config/kube_config.py#L209
            auth_provider["config"] = {
                "dummy": "dummy",
            }
    logging.info("Writing updated kubeconfig:\n %s", yaml.dump(config))
    with open(config_file, "w") as hf:
        yaml.dump(config, hf)
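The TODO in this example proposes checking whether a service account is actually in use before deleting the gcloud commands. A hedged sketch of that guard, assuming the standard GOOGLE_APPLICATION_CREDENTIALS convention (the helper name is ours, not part of the original code):

import os

def using_service_account():
    """True if GOOGLE_APPLICATION_CREDENTIALS points at an existing key file."""
    creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "")
    return bool(creds) and os.path.exists(creds)

The loop over config["users"] above would then only strip auth_provider["config"] when using_service_account() returns True.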
Example #4
def create_k8s_client(args):
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    return api_client
Example #5
    def run(self, tekton_cluster_info, current_cluster_info):
        """Kicks off all the Tekton pipelines.

        Args:
          tekton_cluster_info: ClusterInfo for the cluster to run pipelines on;
            Tekton currently runs on a different cluster.
          current_cluster_info: Info for the current cluster.

        Returns:
          A dict mapping workflow names to their UI urls.
        """
        urls = dict()
        try:
            # Currently only tekton tests run in kf-ci-v1.
            util.configure_kubectl(tekton_cluster_info.project,
                                   tekton_cluster_info.zone,
                                   tekton_cluster_info.cluster_name)
            # util.configure_kubectl(project, "us-east1-d", "kf-ci-v1")
            util.load_kube_config()

            for w in self.workflows:
                w.run()
                urls[w.name] = w.ui_url
                if w.teardown_runner:
                    urls[w.teardown_runner.name] = w.teardown_runner.ui_url
                logging.info("URL for workflow: %s", w.ui_url)
        except Exception as e:  # pylint: disable=broad-except
            logging.error(
                "Error when starting Tekton workflow: %s;\nstacktrace:\n%s", e,
                traceback.format_exc())
        finally:
            # Restore kubectl
            util.configure_kubectl(current_cluster_info.project,
                                   current_cluster_info.zone,
                                   current_cluster_info.cluster_name)
            util.load_kube_config()

        return urls
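The configure/run/restore pattern in this example (switch kubectl to the Tekton cluster, kick off the workflows, then restore the original cluster in the finally block) can be factored into a context manager. A minimal sketch, assuming the same util module and ClusterInfo objects used above (borrowed_cluster is hypothetical):

from contextlib import contextmanager

@contextmanager
def borrowed_cluster(target, restore):
    """Point kubectl at target; always switch back to restore on exit."""
    util.configure_kubectl(target.project, target.zone, target.cluster_name)
    util.load_kube_config()
    try:
        yield
    finally:
        util.configure_kubectl(restore.project, restore.zone,
                               restore.cluster_name)
        util.load_kube_config()

# Usage:
# with borrowed_cluster(tekton_cluster_info, current_cluster_info):
#     for w in self.workflows:
#         w.run()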
Example #6
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
    # for a description of the injected environment variables.
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    base_branch_name = os.getenv("PULL_BASE_REF")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the base branch
        cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

        _ = util.run([
            "git", "fetch", "origin",
            base_branch_name + ":refs/remotes/origin/" + base_branch_name
        ],
                     cwd=cloned_repo_dir)

        diff_command = ["git", "diff", "--name-only"]
        diff_branch = "remotes/origin/{}".format(base_branch_name)
        try:
            common_ancestor = util.run(
                ["git", "merge-base", "HEAD", diff_branch],
                cwd=cloned_repo_dir)
            diff_command.append(common_ancestor)
        except subprocess.CalledProcessError as e:
            logging.warning(
                "git merge-base failed; see "
                "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                "will be computed against the current master and "
                "therefore files not changed in the PR might be "
                "considered when determining which tests to trigger")
            diff_command.append(diff_branch)

    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(diff_command,
                                 cwd=os.path.join(args.repos_dir, repo_owner,
                                                  repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)
    workflows = []
    config = {}
    if args.config_file:
        config, new_workflows = parse_config_file(args.config_file,
                                                  args.repos_dir)
        workflows.extend(new_workflows)

    # Add any paths to the python path
    extra_py_paths = []
    for p in config.get("python_paths", []):
        # Assume that python_paths are in the format $REPO_OWNER/$REPO_NAME/path.
        # We need to ensure the repo is checked out if it differs from the
        # current one and is not kubeflow/testing (which is already checked
        # out).
        segments = p.split("/")
        if ((segments[0] != repo_owner or segments[1] != repo_name)
                and not p.startswith("kubeflow/testing")):
            logging.info("Need to clone %s/%s", segments[0], segments[1])
            util.clone_repo(
                os.path.join(args.repos_dir, segments[0], segments[1]),
                segments[0], segments[1])

        path = os.path.join(args.repos_dir, p)
        extra_py_paths.append(path)

    kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
    if kf_test_path not in extra_py_paths:
        logging.info("Adding %s to extra python paths", kf_test_path)
        extra_py_paths.append(kf_test_path)

    logging.info("Extra python paths: %s", ":".join(extra_py_paths))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:  # pylint: disable=too-many-nested-blocks
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too
        # long. The workflow name should not be more than 63 characters because
        # it's used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if any files
        # modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit, and if
        # the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.", w.name,
                w.include_dirs)
            continue

        if job_type == "presubmit":
            # When not running under prow we might not set all environment variables
            if os.getenv("PULL_NUMBER"):
                workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            if os.getenv("PULL_PULL_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            if os.getenv("PULL_BASE_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        # Append the last 4 digits of the build number
        if os.getenv("BUILD_NUMBER"):
            workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow should
        # vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)
        workflow_names.append(workflow_name)

        # Check whether this is a ksonnet workflow and, if so, run it.
        if w.app_dir:
            ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

            # Print ksonnet version
            util.run([ks_cmd, "version"])

            # Create a new environment for this run
            env = workflow_name

            util.run([
                ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)
            ],
                     cwd=w.app_dir)

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "name",
                workflow_name
            ],
                     cwd=w.app_dir)

            # Set the prow environment variables.
            prow_env = []

            names = [
                "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
                "REPO_NAME"
            ]
            names.sort()
            for v in names:
                if not os.getenv(v):
                    continue
                prow_env.append("{0}={1}".format(v, os.getenv(v)))

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "prow_env", ",".join(prow_env)
            ],
                     cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "namespace",
                get_namespace(args)
            ],
                     cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
                args.bucket
            ],
                     cwd=w.app_dir)
            if args.release:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component,
                    "versionTag",
                    os.getenv("VERSION_TAG")
                ],
                         cwd=w.app_dir)

            # Set any extra params. We do this in alphabetical order to make it
            # easier to verify in the unittest.
            param_names = sorted(w.params.keys())
            for k in param_names:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component, k,
                    "{0}".format(w.params[k])
                ],
                         cwd=w.app_dir)

            # For debugging print out the manifest
            util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
            util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)
        else:
            w.kwargs["name"] = workflow_name
            w.kwargs["namespace"] = get_namespace(args)

            if TEST_TARGET_ARG_NAME not in w.kwargs:
                w.kwargs[TEST_TARGET_ARG_NAME] = w.name
                logging.info(
                    "Workflow %s doesn't set arg %s; defaulting to %s", w.name,
                    TEST_TARGET_ARG_NAME, w.kwargs[TEST_TARGET_ARG_NAME])

            # TODO(https://github.com/kubeflow/testing/issues/467): We shell out
            # to e2e_tool in order to dump the Argo workflow to a file which we
            # then reimport. We do this because importing the py_func module
            # appears to break when we have to dynamically adjust sys.path to
            # insert new paths. Setting PYTHONPATH before launching python,
            # however, appears to work, which is why we shell out to e2e_tool.
            command = [
                "python", "-m", "kubeflow.testing.e2e_tool", "show", w.py_func
            ]
            for k, v in w.kwargs.items():
                # The fire module turns underscores in parameter names into
                # hyphens, so we apply the same conversion here.
                command.append("--{0}={1}".format(k.replace("_", "-"), v))

            with tempfile.NamedTemporaryFile(delete=False) as hf:
                workflow_file = hf.name

            command.append("--output=" + hf.name)
            env = os.environ.copy()
            env["PYTHONPATH"] = ":".join(extra_py_paths)
            util.run(command, env=env)

            with open(workflow_file) as hf:
                wf_result = yaml.safe_load(hf)

            group, version = wf_result['apiVersion'].split('/')
            k8s_co = k8s_client.CustomObjectsApi()
            workflow_name = wf_result["metadata"]["name"]
            py_func_result = k8s_co.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=wf_result["metadata"]["namespace"],
                plural='workflows',
                body=wf_result)
            logging.info("Created workflow:\n%s",
                         yaml.safe_dump(py_func_result))

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    workflow_success = False
    workflow_phase = {}
    workflow_status_yamls = {}
    results = []
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        workflow_success = True
    except util.ExceptionWithWorkflowResults as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        results = e.workflow_results
        raise
    finally:
        prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts_dir, "build-log.txt"))

        # Upload workflow status to GCS.
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            workflow_status_yamls[name] = yaml.safe_dump(
                r, default_flow_style=False)
            if phase != "Succeeded":
                workflow_success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)

        for wf_name, wf_status in workflow_status_yamls.items():
            util.upload_to_gcs(
                wf_status,
                os.path.join(prow_artifacts_dir,
                             '{}.yaml'.format(wf_name)))

        all_tests_success = prow_artifacts.finalize_prow_job(
            args.bucket, workflow_success, workflow_phase, ui_urls)

    return all_tests_success
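The comments in this example note that workflow names double as pod labels and therefore must stay at or under 63 characters, which is why the various sha and build-number suffixes are truncated. A hypothetical helper that assembles a name from optional suffixes and enforces the cap (not part of the original code):

def build_workflow_name(base, suffixes, max_len=63):
    """Join non-empty suffixes onto base and trim to the K8s label limit."""
    name = "-".join([base] + [s for s in suffixes if s])
    return name[:max_len].rstrip("-")

# e.g. build_workflow_name(os.getenv("JOB_NAME"),
#                          [w.name, os.getenv("PULL_NUMBER"), salt])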
Example #7
def setup_cluster(args):
    """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split(
                "=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount":
                accelerator_count,
                "acceleratorType":
                accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user=" + account
        ])
    except subprocess.CalledProcessError as e:
        t.failure = "setup-cluster failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "setup-cluster"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
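Each entry in args.accelerators above is expected in type=count form (for example nvidia-tesla-k80=2). A small hypothetical helper with the validation that the loop above omits:

def parse_accelerator_spec(spec):
    """Split e.g. 'nvidia-tesla-k80=2' into ('nvidia-tesla-k80', 2)."""
    accelerator_type, _, count = spec.partition("=")
    if not accelerator_type or not count.isdigit():
        raise ValueError("Expected type=count, got %r" % spec)
    return accelerator_type, int(count)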
Example #8
def setup(args):
    """Test deploying Kubeflow."""
    api_client = create_k8s_client(args)

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if args.github_token:
        logging.info("Setting GITHUB_TOKEN to %s.", args.github_token)
        # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning("GITHUB_TOKEN not set; you will probably hit GitHub "
                        "API limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ],
             cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

    if args.deploy_tf_serving:
        logging.info("Deploying tf-serving.")
        util.run([
            "ks", "generate", "tf-serving", "modelServer", "--name=inception",
            "--namespace=" + namespace.metadata.name,
            "--model_path=gs://kubeflow-models/inception",
            "--model_server_image=" + args.model_server_image
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "modelServer",
        ]
        util.run(apply_command, cwd=app_dir)

        core_api = k8s_client.CoreV1Api(api_client)
        deploy = core_api.read_namespaced_service("inception",
                                                  namespace.metadata.name)
        cluster_ip = deploy.spec.cluster_ip

        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 "inception")
        logging.info("Verified TF serving started.")
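This example and several of the later ones repeat the same vendor-swap trick: delete the ksonnet vendor copy and symlink it to the checked-out source, so that the test runs against the commit under test. A condensed sketch of just that step, using the same names as the examples (the helper itself is ours):

import os
import shutil

def swap_vendor_for_src(app_dir, test_dir,
                        org="kubeflow", repo="kubeflow", registry="kubeflow"):
    """Replace app_dir/vendor/kubeflow with a symlink into the src tree."""
    target = os.path.join(app_dir, "vendor", "kubeflow")
    source = os.path.join(test_dir, "src", org, repo, registry)
    shutil.rmtree(target)       # drop the vendored copy
    os.symlink(source, target)  # use the code at the desired commit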
Example #9
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")
    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, str(e))
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #10
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Example #11
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    # Print ksonnet version
    util.run(["ks", "version"])
    if args.release:
        generate_env_from_head(args)
    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    if args.app_dir and args.component:
        # TODO(jlewi): We can get rid of this branch once all repos are using a prow_config.xml file.
        workflows.append(
            WorkflowComponent("legacy", args.app_dir, args.component, {}))
    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too
        # long. The workflow name should not be more than 63 characters because
        # it's used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        job_type = os.getenv("JOB_TYPE")
        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow should
        # vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)
        # Create a new environment for this run
        env = workflow_name

        util.run(["ks", "env", "add", env], cwd=w.app_dir)

        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "name",
            workflow_name
        ],
                 cwd=w.app_dir)

        # Set the prow environment variables.
        prow_env = []

        names = [
            "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
            "REPO_NAME"
        ]
        names.sort()
        for v in names:
            if not os.getenv(v):
                continue
            prow_env.append("{0}={1}".format(v, os.getenv(v)))

        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "prow_env",
            ",".join(prow_env)
        ],
                 cwd=w.app_dir)
        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "namespace",
            get_namespace(args)
        ],
                 cwd=w.app_dir)
        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "bucket",
            args.bucket
        ],
                 cwd=w.app_dir)
        if args.release:
            util.run([
                "ks", "param", "set", "--env=" + env, w.component,
                "versionTag",
                os.getenv("VERSION_TAG")
            ],
                     cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it
        # easier to verify in the unittest.
        param_names = sorted(w.params.keys())
        for k in param_names:
            util.run([
                "ks", "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])
            ],
                     cwd=w.app_dir)

        # For debugging print out the manifest
        util.run(["ks", "show", env, "-c", w.component], cwd=w.app_dir)
        util.run(["ks", "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            api_client,
            get_namespace(args),
            workflow_names,
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.error("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
Example #12
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Example #13
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")
    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, str(e))
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #14
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount":
        accelerator_count,
        "acceleratorType":
        accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion":  args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user=" + account])

    if args.tf_job_version == "v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify that the image of the operator is correct.
    util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #15
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    masterHost = api_client.configuration.host

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = _setup_ks_app(args)
    t.name = os.path.basename(name)

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespace %s", name, namespace)
            logging.info("tfjob_version=%s", args.tfjob_version)
            # Wait for the job to either be in Running state or a terminal state
            if args.tfjob_version == "v1alpha1":
                logging.info("Wait for Phase Running, Done, or Failed")
                results = tf_job_client.wait_for_phase(
                    api_client,
                    namespace,
                    name, ["Running", "Done", "Failed"],
                    status_callback=tf_job_client.log_status)
            else:
                logging.info(
                    "Wait for conditions Running, Succeeded, or Failed")
                results = tf_job_client.wait_for_condition(
                    api_client,
                    namespace,
                    name, ["Running", "Succeeded", "Failed"],
                    status_callback=tf_job_client.log_status)

            logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

            # The job is now either running or done.
            if args.shutdown_policy:
                logging.info("Enforcing shutdownPolicy %s",
                             args.shutdown_policy)
                if args.shutdown_policy in ["master", "chief"]:
                    if args.tfjob_version == "v1alpha1":
                        replica = "master"
                    else:
                        replica = "chief"
                elif args.shutdown_policy in ["worker", "all_workers"]:
                    replica = "worker"
                else:
                    raise ValueError("Unrecognized shutdown_policy "
                                     "%s" % args.shutdown_policy)

                # Number of targets.
                num_targets = 1
                if args.shutdown_policy in ["all_workers"]:
                    # Assume v1alpha2
                    num_targets = results.get("spec", {}).get(
                        "tfReplicaSpecs", {}).get("Worker",
                                                  {}).get("replicas", 0)
                    logging.info("There are %s worker replicas", num_targets)

                if args.tfjob_version == "v1alpha1":
                    runtime_id = results.get("spec", {}).get("RuntimeId")
                    target = "{name}-{replica}-{runtime}".format(
                        name=name, replica=replica, runtime=runtime_id)
                    pod_labels = get_labels(name, runtime_id)
                    pod_selector = to_selector(pod_labels)
                else:
                    target = "{name}-{replica}".format(name=name,
                                                       replica=replica)
                    pod_labels = get_labels_v1alpha2(namespace, name)
                    pod_selector = to_selector(pod_labels)

                # Wait for the pods to be ready before we shutdown
                # TODO(jlewi): We get pods using a label selector, so there is
                # a risk that the pod we actually care about isn't present.
                logging.info(
                    "Waiting for pods to be running before shutting down.")
                wait_for_pods_to_be_in_phases(
                    api_client,
                    namespace,
                    pod_selector, ["Running"],
                    timeout=datetime.timedelta(minutes=4))
                logging.info("Pods are ready")
                logging.info("Issuing the terminate request")
                for num in range(num_targets):
                    full_target = target + "-{0}".format(num)
                    terminateReplica(masterHost, namespace, full_target)

            logging.info("Waiting for job to finish.")
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                args.tfjob_version,
                status_callback=tf_job_client.log_status)

            if args.tfjob_version == "v1alpha1":
                if results.get("status", {}).get("state",
                                                 {}).lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                        trial, name, namespace,
                        results.get("status", {}).get("state", None))
                    logging.error(t.failure)
                    break
            else:
                # For v1alpha2 check that the last condition is Succeeded.
                last_condition = results.get("status",
                                             {}).get("conditions", [])[-1]
                if last_condition.get("type", "").lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
                        trial, name, namespace, results.get("status", {}))
                    logging.error(t.failure)
                    break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            # Print out the K8s events because they can be useful for debugging.
            for e in events:
                logging.info("Received K8s Event:\n%s", e)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            if args.tfjob_version == "v1alpha1":
                for replica in results.get("spec", {}).get("replicaSpecs", []):
                    num_expected += replica.get("replicas", 0)
            else:
                for replicakey in results.get("spec",
                                              {}).get("tfReplicaSpecs", {}):
                    replica_spec = results.get("spec",
                                               {}).get("tfReplicaSpecs",
                                                       {}).get(replicakey, {})
                    if replica_spec:
                        num_expected += replica_spec.get("replicas", 1)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                # TODO(jlewi): Starting with
                # https://github.com/kubeflow/tf-operator/pull/646 the number of events
                # no longer seems to match the expected count; it looks like maybe
                # events are being combined? For now we just log a warning rather
                # than an error.
                logging.warning("\n".join(creation_failures))
            if args.tfjob_version == "v1alpha1":
                pod_labels = get_labels(name, runtime_id)
                pod_selector = to_selector(pod_labels)
            else:
                pod_labels = get_labels_v1alpha2(namespace, name)
                pod_selector = to_selector(pod_labels)

            # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
            # means completed pods won't be deleted.
            # TODO(jlewi): We should add a test to deal with deleted pods.
            if args.tfjob_version == "v1alpha1":
                wait_for_pods_to_be_deleted(api_client, namespace,
                                            pod_selector)

            tf_job_client.delete_tf_job(api_client,
                                        namespace,
                                        name,
                                        version=args.tfjob_version)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            args.tfjob_version,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run?
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.exception(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message "status".
        # In an effort to nail down this exception we print out more
        # information about it.
        logging.exception("There was a problem running the job; Exception %s",
                          e)
        # We want to catch all exceptions because we want to mark the test as failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
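
For context, the get_labels/to_selector helpers used above to build pod selectors are small utilities. A minimal sketch, assuming the label scheme the operator applies to v1alpha1 pods (the real helpers live in the test utilities), might look like this:

def get_labels(name, runtime_id):
    # Assumed label scheme: v1alpha1 pods carry the TFJob name and runtime ID.
    return {"tf_job_name": name, "runtime_id": runtime_id}

def to_selector(labels):
    # Render a label dict as a comma-separated K8s label selector string.
    return ",".join("{0}={1}".format(k, v) for k, v in sorted(labels.items()))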
Example #16
0
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
  # for a description of the injected environment variables.
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  base_branch_name = os.getenv("PULL_BASE_REF")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    # We need to get a common ancestor for the PR and the base branch
    cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

    _ = util.run(["git", "fetch", "origin", base_branch_name + ":refs/remotes/origin/" +
                  base_branch_name], cwd=cloned_repo_dir)

    diff_command = ["git", "diff", "--name-only"]
    diff_branch = "remotes/origin/{}".format(base_branch_name)
    try:
      common_ancestor = util.run(["git", "merge-base", "HEAD", diff_branch],
                                 cwd=cloned_repo_dir)
      diff_command.append(common_ancestor)
    except subprocess.CalledProcessError as e:
      logging.warning("git merge-base failed; see "
                      "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                      "will be computed against the current master and "
                      "therefore files not changed in the PR might be "
                      "considered when determining which tests to trigger")
      diff_command.append(diff_branch)

  elif job_type == "postsubmit":
    # See: https://git-scm.com/docs/git-diff
    # This syntax compares the commit before pull_base_sha with the commit
    # at pull_base_sha
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha]

  changed_files = []
  if job_type in ("presubmit", "postsubmit"):
    changed_files = util.run(diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  # Create an initial version of started.json with no URLs.
  create_started_file(args.bucket, {})

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate SHAs to prevent the workflow name from being too large.
    # The workflow name should not be more than 63 characters because it's
    # used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and job_type not in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check whether any
    # modified files match the specified glob patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is modified.",
                         w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    # Append the last 4 digits of the build number
    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since Prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run([ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
              cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
              "name", workflow_name],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
             ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
             get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
             args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
                os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier to verify in
    # the unittest.
    param_names = sorted(w.params.keys())
    for k in param_names:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
               "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  # We delay creating started.json until we know the Argo workflow URLs
  create_started_file(args.bucket, ui_urls)

  workflow_success = False
  workflow_phase = {}
  workflow_status_yamls = {}
  results = []
  try:
    results = argo_client.wait_for_workflows(
      get_namespace(args), workflow_names,
      timeout=datetime.timedelta(minutes=180),
      status_callback=argo_client.log_status
    )
    workflow_success = True
  except util.ExceptionWithWorkflowResults as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    results = e.workflow_results
    raise
  finally:
    prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
    # Upload logs to GCS. No logs emitted after this point will appear in the
    # file in GCS.
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts_dir, "build-log.txt"))

    # Upload workflow status to GCS.
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      workflow_status_yamls[name] = yaml.safe_dump(r, default_flow_style=False)
      if phase != "Succeeded":
        workflow_success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase)

    # Upload the workflow status YAMLs after recording all results.
    for wf_name, wf_status in workflow_status_yamls.items():
      util.upload_to_gcs(
        wf_status,
        os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

    all_tests_success = prow_artifacts.finalize_prow_job(
      args.bucket, workflow_success, workflow_phase, ui_urls)

  return all_tests_success
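
As a quick illustration of the naming scheme above, here is how a presubmit workflow name is assembled; all values below are hypothetical, since in a real run they come from Prow's injected environment variables:

import uuid

job_name = "kubeflow-presubmit"     # JOB_NAME (hypothetical)
w_name = "unittests"                # workflow config name (hypothetical)
pull_number = "1234"                # PULL_NUMBER (hypothetical)
pull_pull_sha = "abcdef0123456789"  # PULL_PULL_SHA (hypothetical)
build_number = "987654"             # BUILD_NUMBER (hypothetical)

workflow_name = "-".join([
    job_name + "-" + w_name,
    pull_number,
    pull_pull_sha[0:7],     # truncated SHA
    build_number[-4:],      # last 4 digits of the build number
    uuid.uuid4().hex[0:4],  # salt for manual resubmissions
])
# e.g. "kubeflow-presubmit-unittests-1234-abcdef0-7654-1a2b"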
Example #17
0
def setup_kubeflow(args):
    """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
    project = args.project
    cluster_name = args.cluster
    zone = args.zone

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        elif args.tf_job_version == "v1beta1":
            tf_job_deployment_name = "tf-job-operator-v1beta1"
        else:
            raise ValueError("Unrecognized value for tf_job_version %s" %
                             args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the correct
        # one.
        try:
            util.wait_for_deployment(api_client, args.namespace,
                                     tf_job_deployment_name)
        finally:
            # Run kubectl describe to get useful information about the deployment.
            # This will help troubleshoot any errors.
            util.run([
                "kubectl", "-n", args.namespace, "describe", "deploy",
                tf_job_deployment_name
            ])
            util.run([
                "kubectl", "-n", args.namespace, "describe", "pods", "-l",
                "name=tf-job-operator"
            ])

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
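
The ks_deploy helper invoked above is defined elsewhere in the test utilities. A minimal sketch, assuming it generates the component, sets the supplied params in sorted order, and applies the default ksonnet environment (account handling elided), might be:

def ks_deploy(app_dir, component, params, account=None):
    # Generate the component from the prototype of the same name.
    util.run(["ks", "generate", component, component], cwd=app_dir)
    # Set params in sorted order so runs are reproducible.
    for k in sorted(params.keys()):
        util.run(["ks", "param", "set", component, k, str(params[k])],
                 cwd=app_dir)
    util.run(["ks", "apply", "default", "-c", component], cwd=app_dir)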
Example #18
0
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    diff_command = ["git", "diff", "--name-only", "master"]
  elif job_type == "postsubmit":
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha]

  changed_files = []
  if job_type == "presubmit" or job_type == "postsubmit":
    changed_files = util.run(diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate SHAs to prevent the workflow name from being too large.
    # The workflow name should not be more than 63 characters because it's
    # used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = get_ksonnet_cmd(w)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and job_type not in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check whether any
    # modified files match the specified glob patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is modified.",
                         w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since Prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
              "name", workflow_name],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
             ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
             get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
             args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
                os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier to verify in
    # the unittest.
    param_names = sorted(w.params.keys())
    for k in param_names:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
               "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(get_namespace(args),
                                             workflow_names,
                                             timeout=datetime.timedelta(minutes=180),
                                             status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish", ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success, workflow_phase, ui_urls)

    # Upload logs to GCS. No logs emitted after this point will appear in the
    # file in GCS.
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
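
The include_dirs check in the workflow loop above reduces to a small predicate. A minimal sketch (the helper name any_dir_modified is hypothetical), assuming glob-style patterns, is:

import fnmatch

def any_dir_modified(changed_files, include_dirs):
    # True if any changed file matches any of the glob patterns.
    return any(fnmatch.fnmatch(f, d)
               for f in changed_files for d in include_dirs)

# e.g. any_dir_modified(["testing/run.py"], ["testing/*"]) -> True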
Example #19
0
def run(args, file_handler):
    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    # Create the name for the workflow
    # We truncate SHAs to prevent the workflow name from being too large.
    # The workflow name should not be more than 63 characters because it's
    # used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME")
    job_type = os.getenv("JOB_TYPE")
    if job_type == "presubmit":
        workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
        workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
        workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since Prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    # Create a new environment for this run
    env = workflow_name

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "name",
        workflow_name
    ],
             cwd=args.app_dir)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    # Set the prow environment variables.
    prow_env = []

    names = [
        "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
        "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
    ]
    names.sort()
    for v in names:
        if not os.getenv(v):
            continue
        prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "prow_env",
        ",".join(prow_env)
    ],
             cwd=args.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "namespace",
        NAMESPACE
    ],
             cwd=args.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "bucket",
        args.bucket
    ],
             cwd=args.app_dir)

    # For debugging print out the manifest
    util.run(["ks", "show", env, "-c", args.component], cwd=args.app_dir)
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    ui_url = (
        "http://testing-argo.kubeflow.io/timeline/kubeflow-test-infra/{0}"
        ";tab=workflow".format(workflow_name))
    logging.info("URL for workflow: %s", ui_url)
    success = False
    try:
        results = argo_client.wait_for_workflow(
            api_client,
            NAMESPACE,
            workflow_name,
            status_callback=argo_client.log_status)
        if results["status"]["phase"] == "Succeeded":
            success = True
        logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                     workflow_name, results["status"]["phase"])
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflow %s/%s to finish",
                      NAMESPACE, workflow_name)
    finally:
        create_finished_file(args.bucket, success)

        # Upload logs to GCS. No logs emitted after this point will appear in
        # the file in GCS.
        file_handler.flush()
        upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
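
The create_started_file/create_finished_file helpers write the marker files Gubernator reads from GCS. A minimal sketch of the finished marker's contents, assuming Prow's standard finished.json schema (the helper name is hypothetical), could be:

import json
import time

def make_finished_contents(success):
    # Gubernator reads finished.json to display the overall job result.
    return json.dumps({
        "timestamp": int(time.time()),
        "result": "SUCCESS" if success else "FAILURE",
        "metadata": {},
    })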