Example #1
def main():  # pylint: disable=too-many-locals
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

    parser.add_argument(
        "--test_dir",
        default="",
        type=str,
        help="Directory to use for all the test files. If not set a temporary "
        "directory is created.")

    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")

    parser.add_argument("--project",
                        default=None,
                        type=str,
                        help="The project to use.")

    parser.add_argument(
        "--cluster",
        default=None,
        type=str,
        help=("The name of the cluster. If not set assumes the "
              "script is running in a cluster and uses that cluster."))

    parser.add_argument("--namespace",
                        required=True,
                        type=str,
                        help=("The namespace to use."))

    parser.add_argument("--zone",
                        default="us-east1-d",
                        type=str,
                        help="The zone for the cluster.")

    parser.add_argument(
        "--github_token",
        default=None,
        type=str,
        help=
        ("The GitHub API token to use. This is needed since ksonnet uses the "
         "GitHub API and without it we get rate limited. For more info see: "
         "https://github.com/ksonnet/ksonnet/blob/master/docs"
         "/troubleshooting.md. Can also be set using environment variable "
         "GITHUB_TOKEN."))

    subparsers = parser.add_subparsers()

    parser_setup = subparsers.add_parser("setup",
                                         help="setup the test infrastructure.")

    parser_setup.set_defaults(func=setup)

    parser_teardown = subparsers.add_parser(
        "teardown", help="teardown the test infrastructure.")

    parser_teardown.set_defaults(func=teardown)

    parser_setup.add_argument(
        "--deploy_tf_serving",
        default=False,
        type=bool,
        help=("If True, deploy the tf-serving component."))

    parser_setup.add_argument("--model_server_image",
                              default="gcr.io/kubeflow/model-server:1.0",
                              type=str,
                              help=("The TF serving image to use."))

    args = parser.parse_args()

    if not args.test_dir:
        logging.info("--test_dir not set; using a temporary directory.")

        now = datetime.datetime.now()
        label = "test_deploy-" + now.strftime(
            "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

        # Create a temporary directory for this test run
        args.test_dir = os.path.join(tempfile.gettempdir(), label)

    if not args.artifacts_dir:
        args.artifacts_dir = args.test_dir

    test_log = os.path.join(args.artifacts_dir, "logs",
                            "test_deploy." + args.func.__name__ + ".log.txt")
    if not os.path.exists(os.path.dirname(test_log)):
        os.makedirs(os.path.dirname(test_log))

    # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
    # Setup a logging file handler. This way we can upload the log outputs
    # to gubernator.
    root_logger = logging.getLogger()

    file_handler = logging.FileHandler(test_log)
    root_logger.addHandler(file_handler)
    # We need to explicitly set the formatter because it will not pick up
    # the BasicConfig.
    formatter = logging.Formatter(
        fmt=("%(levelname)s|%(asctime)s"
             "|%(pathname)s|%(lineno)d| %(message)s"),
        datefmt="%Y-%m-%dT%H:%M:%S")
    file_handler.setFormatter(formatter)
    logging.info("Logging to %s", test_log)

    util.maybe_activate_service_account()

    wrap_test(args)
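Example #1 registers a handler on each subparser with set_defaults(func=...) and later dispatches on args.func (indirectly, via wrap_test). A minimal, self-contained sketch of the same dispatch pattern, with hypothetical setup/teardown stubs:

import argparse


def setup(args):
    # Hypothetical stand-in for the real setup handler.
    print("setting up in namespace", args.namespace)


def teardown(args):
    # Hypothetical stand-in for the real teardown handler.
    print("tearing down in namespace", args.namespace)


def main():
    parser = argparse.ArgumentParser(description="Subcommand dispatch demo.")
    parser.add_argument("--namespace", required=True, type=str)
    subparsers = parser.add_subparsers(dest="command", required=True)

    parser_setup = subparsers.add_parser("setup", help="setup the test infrastructure.")
    parser_setup.set_defaults(func=setup)

    parser_teardown = subparsers.add_parser("teardown", help="teardown the test infrastructure.")
    parser_teardown.set_defaults(func=teardown)

    args = parser.parse_args()
    # args.func is whichever handler the selected subcommand registered.
    args.func(args)


if __name__ == "__main__":
    main()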
Example #2
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the master branch
        common_ancestor = util.run(["git", "merge-base", "HEAD", "master"],
                                   cwd=os.path.join(args.repos_dir, repo_owner,
                                                    repo_name))
        diff_command = ["git", "diff", "--name-only", common_ancestor]
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type == "presubmit" or job_type == "postsubmit":
        changed_files = util.run(diff_command,
                                 cwd=os.path.join(args.repos_dir, repo_owner,
                                                  repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)
    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too large.
        # Workflow name should not be more than 63 characters because it's used
        # as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

        # Print ksonnet version
        util.run([ks_cmd, "version"])

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and not job_type in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if any files
        # modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit, and if
        # the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.", w.name,
                w.include_dirs)
            continue

        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow should
        # vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)
        # Create a new environment for this run
        env = workflow_name

        util.run(
            [ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
            cwd=w.app_dir)

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "name",
            workflow_name
        ],
                 cwd=w.app_dir)

        # Set the prow environment variables.
        prow_env = []

        names = [
            "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
            "REPO_NAME"
        ]
        names.sort()
        for v in names:
            if not os.getenv(v):
                continue
            prow_env.append("{0}={1}".format(v, os.getenv(v)))

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
            ",".join(prow_env)
        ],
                 cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
            get_namespace(args)
        ],
                 cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
            args.bucket
        ],
                 cwd=w.app_dir)
        if args.release:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "versionTag",
                os.getenv("VERSION_TAG")
            ],
                     cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it easier to verify in
        # the unittest.
        param_names = sorted(w.params.keys())
        for k in param_names:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])
            ],
                     cwd=w.app_dir)

        # For debugging print out the manifest
        util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
        util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.exception("Time out waiting for Workflows %s to finish",
                          ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
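For presubmit jobs, the diff above is taken against the merge base of HEAD and master; a standalone sketch of that changed-files computation using subprocess directly (util.run in the example is assumed to behave like check_output):

import subprocess


def changed_files(repo_dir, base_branch="master"):
    # Find the common ancestor of the PR HEAD and the base branch, then list
    # the files that differ from it.
    ancestor = subprocess.check_output(
        ["git", "merge-base", "HEAD", base_branch],
        cwd=repo_dir).decode("utf-8").strip()
    diff = subprocess.check_output(
        ["git", "diff", "--name-only", ancestor],
        cwd=repo_dir).decode("utf-8")
    return [f for f in diff.splitlines() if f]


# Example: print(changed_files("/path/to/checkout"))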
Example #3
def main():  # pylint: disable=too-many-locals
  logging.getLogger().setLevel(logging.INFO)

  util.maybe_activate_service_account()

  now = datetime.datetime.now()

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Setup clusters for testing.")
  subparsers = parser.add_subparsers()

  #############################################################################
  # setup
  #
  parser_setup = subparsers.add_parser(
    "setup_cluster", help="Setup a cluster for testing.")

  parser_setup.add_argument(
    "--accelerator",
    dest="accelerators",
    action="append",
    help="Accelerator to add to the cluster. Should be of the form type=count.")

  parser_setup.add_argument(
    "--namespace",
    default="kubeflow-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4],
    help="The directory containing the ksonnet app used for testing.",
  )
  parser_setup.set_defaults(func=setup_cluster)
  add_common_args(parser_setup)

  parser_kubeflow = subparsers.add_parser(
    "setup_kubeflow", help="Deploy Kubeflow for testing.")

  parser_kubeflow.add_argument(
    "--tf_job_version",
    dest="tf_job_version",
    help="Which version of the TFJobOperator to deploy.")

  parser_kubeflow.set_defaults(func=setup_kubeflow)

  parser_kubeflow.add_argument(
    "--namespace",
    default="kubeflow-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4],
    help="The directory containing the ksonnet app used for testing.",
  )

  parser_kubeflow.add_argument(
    "--image",
    help="The image to use",
  )

  add_common_args(parser_kubeflow)

  parser_kubeflow.add_argument(
    "--test_app_dir",
    help="The directory containing the ksonnet app used for testing.",
  )

  #############################################################################
  # teardown
  #
  parser_teardown = subparsers.add_parser(
    "teardown", help="Teardown the cluster.")
  parser_teardown.set_defaults(func=teardown)
  add_common_args(parser_teardown)

  # parse the args and call whatever function was selected
  args = parser.parse_args()
  args.func(args)
Example #4
def main():  # pylint: disable=too-many-locals,too-many-statements
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

    parser.add_argument(
        "--test_dir",
        default="",
        type=str,
        help="Directory to use for all the test files. If not set a temporary "
        "directory is created.")

    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")

    parser.add_argument(
        "--as_gcloud_user",
        dest="as_gcloud_user",
        action="store_true",
        help=("Impersonate the user corresponding to the gcloud "
              "command with kubectl and ks."))
    parser.add_argument("--no-as_gcloud_user",
                        dest="as_gcloud_user",
                        action="store_false")
    parser.set_defaults(as_gcloud_user=False)

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--project",
                        default=None,
                        type=str,
                        help="The project to use.")

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--namespace",
                        default=None,
                        type=str,
                        help=("The namespace to use."))

    parser.add_argument(
        "--github_token",
        default=None,
        type=str,
        help=
        ("The GitHub API token to use. This is needed since ksonnet uses the "
         "GitHub API and without it we get rate limited. For more info see: "
         "https://github.com/ksonnet/ksonnet/blob/master/docs"
         "/troubleshooting.md. Can also be set using environment variable "
         "GITHUB_TOKEN."))

    parser.add_argument("--deploy_name",
                        default="",
                        type=str,
                        help="The name of the deployment.")

    subparsers = parser.add_subparsers()

    parser_teardown = subparsers.add_parser(
        "teardown", help="teardown the test infrastructure.")

    parser_teardown.set_defaults(func=teardown)

    parser_tf_serving = subparsers.add_parser(
        "deploy_model", help="Deploy a TF serving model.")

    parser_tf_serving.set_defaults(func=deploy_model)

    parser_tf_serving.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_pytorch_job = subparsers.add_parser("deploy_pytorchjob",
                                               help="Deploy a pytorch-job")

    parser_pytorch_job.set_defaults(func=deploy_pytorchjob)

    parser_pytorch_job.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")

    parser_argo_job.set_defaults(func=deploy_argo)

    parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")

    parser_katib_test.set_defaults(func=test_katib)

    parser_minikube = subparsers.add_parser(
        "deploy_minikube", help="Setup a K8s cluster on minikube.")

    parser_minikube.set_defaults(func=deploy_minikube)

    parser_minikube.add_argument("--vm_name",
                                 required=True,
                                 type=str,
                                 help="The name of the VM to use.")

    parser_minikube.add_argument("--zone",
                                 default="us-east1-d",
                                 type=str,
                                 help="The zone for the cluster.")

    parser_teardown_minikube = subparsers.add_parser(
        "teardown_minikube", help="Delete the VM running minikube.")

    parser_teardown_minikube.set_defaults(func=teardown_minikube)

    parser_teardown_minikube.add_argument("--zone",
                                          default="us-east1-d",
                                          type=str,
                                          help="The zone for the cluster.")

    parser_teardown_minikube.add_argument("--vm_name",
                                          required=True,
                                          type=str,
                                          help="The name of the VM to use.")

    args = parser.parse_args()

    if not args.test_dir:
        logging.info("--test_dir not set; using a temporary directory.")

        now = datetime.datetime.now()
        label = "test_deploy-" + now.strftime(
            "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

        # Create a temporary directory for this test run
        args.test_dir = os.path.join(tempfile.gettempdir(), label)

    if not args.artifacts_dir:
        args.artifacts_dir = args.test_dir

    test_log = os.path.join(
        args.artifacts_dir, "logs",
        "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")
    if not os.path.exists(os.path.dirname(test_log)):
        os.makedirs(os.path.dirname(test_log))

    # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
    # Setup a logging file handler. This way we can upload the log outputs
    # to gubernator.
    root_logger = logging.getLogger()

    file_handler = logging.FileHandler(test_log)
    root_logger.addHandler(file_handler)
    # We need to explicitly set the formatter because it will not pick up
    # the BasicConfig.
    formatter = logging.Formatter(
        fmt=("%(levelname)s|%(asctime)s"
             "|%(pathname)s|%(lineno)d| %(message)s"),
        datefmt="%Y-%m-%dT%H:%M:%S")
    file_handler.setFormatter(formatter)
    logging.info("Logging to %s", test_log)
    util.run(["ks", "version"])

    util.maybe_activate_service_account()
    config_file = os.path.expanduser(kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

    # Print out the config to help debugging.
    output = util.run_and_output(["gcloud", "config", "config-helper"])
    logging.info("gcloud config: \n%s", output)
    wrap_test(args)
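Examples #1 and #4 both attach a logging FileHandler with an explicit Formatter so the per-test log file matches the console format; a minimal sketch of that setup extracted into a helper (the log path is a placeholder):

import logging
import os


def add_file_logging(test_log):
    # Make sure the log directory exists before creating the handler.
    os.makedirs(os.path.dirname(test_log), exist_ok=True)
    file_handler = logging.FileHandler(test_log)
    # The handler does not inherit basicConfig's format, so set it explicitly.
    file_handler.setFormatter(logging.Formatter(
        fmt="%(levelname)s|%(asctime)s|%(pathname)s|%(lineno)d| %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S"))
    logging.getLogger().addHandler(file_handler)
    return file_handler


# Example: handler = add_file_logging("/tmp/artifacts/logs/test_deploy.log.txt")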
Example #5
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
  # for a description of the injected environment variables.
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  base_branch_name = os.getenv("PULL_BASE_REF")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    # We need to get a common ancestor for the PR and the base branch
    cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

    _ = util.run(["git", "fetch", "origin", base_branch_name + ":refs/remotes/origin/" +
                  base_branch_name], cwd=cloned_repo_dir)

    diff_command = ["git", "diff", "--name-only"]
    diff_branch = "remotes/origin/{}".format(base_branch_name)
    try:
      common_ancestor = util.run(["git", "merge-base", "HEAD", diff_branch],
                                 cwd=cloned_repo_dir)
      diff_command.append(common_ancestor)
    except subprocess.CalledProcessError as e:
      logging.warning("git merge-base failed; see "
                      "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                      "will be computed against the current master and "
                      "therefore files not changed in the PR might be "
                      "considered when determining which tests to trigger")
      diff_command.append(diff_branch)

  elif job_type == "postsubmit":
    # See: https://git-scm.com/docs/git-diff
    # This syntax compares the commit before pull_base_sha with the commit
    # at pull_base_sha
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha]

  changed_files = []
  if job_type in ("presubmit", "postsubmit"):
    changed_files = util.run(diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  config = {}
  if args.config_file:
    config, new_workflows = parse_config_file(args.config_file, args.repos_dir)
    workflows.extend(new_workflows)

  # Add any paths to the python path
  extra_py_paths = []
  for p in config.get("python_paths", []):
    # Assume that python_paths are in the format $REPO_OWNER/$REPO_NAME/path,
    # we need to ensure that the repo is checked out if it is different from
    # the current one, and if the repo is not kubeflow/testing (which is already
    # checked out).
    segments = p.split("/")
    if ((segments[0] != repo_owner or segments[1] != repo_name)
        and not p.startswith("kubeflow/testing")):
      logging.info("Need to clone %s/%s", segments[0], segments[1])
      util.clone_repo(os.path.join(args.repos_dir, segments[0], segments[1]),
                      segments[0], segments[1])

    path = os.path.join(args.repos_dir, p)
    extra_py_paths.append(path)

  kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
  if kf_test_path not in extra_py_paths:
    logging.info("Adding %s to extra python paths", kf_test_path)
    extra_py_paths.append(kf_test_path)

  logging.info("Extra python paths: %s", ":".join(extra_py_paths))

  # Create an initial version of the file with no urls
  create_started_file(args.bucket, {})

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  tekton_runner = tekton_client.TektonRunner()
  workflow_names = []
  tkn_names = []
  tkn_cleanup_args = []
  ui_urls = {}

  for w in workflows: # pylint: disable=too-many-nested-blocks
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because it's used
    # as a label on the pods.
    #
    # TODO(jlewi):This should no longer be used with Tekton. For tekton
    # name should be based on generateName; we should use labels to
    # provide additional metadata info like PR number.
    workflow_name = os.getenv("JOB_NAME", "") + "-" + w.name

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and not job_type in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any files
    # modified match the specified regex patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is modified.",
                         w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      # When not running under prow we might not set all environment variables
      if os.getenv("PULL_NUMBER"):
        workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      if os.getenv("PULL_PULL_SHA"):
        workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      if os.getenv("PULL_BASE_SHA"):
        workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    # Append the last 4 digits of the build number
    if os.getenv("BUILD_NUMBER"):
      workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)
    if w.tekton_run:
      tkn_names.append(workflow_name)
    else:
      workflow_names.append(workflow_name)

    # check if ks workflow and run
    if w.app_dir:
      ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

      # Print ksonnet version
      util.run([ks_cmd, "version"])

      # Create a new environment for this run
      env = workflow_name

      util.run([ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
                cwd=w.app_dir)

      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "name", workflow_name],
               cwd=w.app_dir)

      # Set the prow environment variables.
      prow_env = []

      names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
               "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
               "REPO_NAME"]
      names.sort()
      for v in names:
        if not os.getenv(v):
          continue
        prow_env.append("{0}={1}".format(v, os.getenv(v)))

      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
               ",".join(prow_env)], cwd=w.app_dir)
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
               get_namespace(args)], cwd=w.app_dir)
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
               args.bucket], cwd=w.app_dir)
      if args.release:
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
                  os.getenv("VERSION_TAG")], cwd=w.app_dir)

      # Set any extra params. We do this in alphabetical order to make it easier to verify in
      # the unittest.
      param_names = sorted(w.params.keys())
      for k in param_names:
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                 "{0}".format(w.params[k])], cwd=w.app_dir)

      # For debugging print out the manifest
      util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
      util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

      ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
      ui_urls[workflow_name] = ui_url
      logging.info("URL for workflow: %s", ui_url)
    elif w.tekton_run:
      pull_revision = None
      if os.getenv("PULL_NUMBER"):
        pull_revision = "refs/pull/{pull_num}/head".format(
            pull_num=os.getenv("PULL_NUMBER"))
      elif os.getenv("PULL_BASE_SHA"):
        pull_revision = os.getenv("PULL_BASE_SHA")
      else:
        pull_revision = "master"
      logging.info("Adding Tekton pipeline %s", w.name)
      try:
        pipeline_runner = tekton_client.PipelineRunner(
            w.tekton_params,
            w.kwargs.get(TEST_TARGET_ARG_NAME, w.name),
            w.tekton_run,
            args.bucket,
            repo_owner,
            repo_name,
            pull_revision)
      except (FileNotFoundError, ValueError) as e:
        logging.error("Error when starting Tekton workflow:%s\n Exception %s;\n"
                      "stacktrace:\n%s",
                      w.tekton_run, e, traceback.format_exc())
        continue
      if w.tekton_teardown:
        logging.info("Appending teardown process for Tekton pipeline %s",
                     w.name)
        pipeline_runner.append_teardown(tekton_client.PipelineRunner(
          w.tekton_teardown_params,
          w.kwargs.get(TEST_TARGET_ARG_NAME, w.name),
          w.tekton_teardown,
          args.bucket,
          repo_owner,
          repo_name,
          pull_revision))
      tekton_runner.append(pipeline_runner)
    else:
      w.kwargs["name"] = workflow_name
      w.kwargs["namespace"] = get_namespace(args)

      if TEST_TARGET_ARG_NAME not in w.kwargs:
        w.kwargs[TEST_TARGET_ARG_NAME] = w.name
        logging.info("Workflow %s doesn't set arg %s; defaulting to %s",
                     w.name, TEST_TARGET_ARG_NAME,
                     w.kwargs[TEST_TARGET_ARG_NAME])

      # TODO(https://github.com/kubeflow/testing/issues/467): We shell out
      # to e2e_tool in order to dump the Argo workflow to a file which we then
      # reimport. We do this because importing the py_func module appears
      # to break when we have to dynamically adjust sys.path to insert
      # new paths. Setting PYTHONPATH before launching python however appears
      # to work which is why we shell out to e2e_tool.
      command = ["python", "-m", "kubeflow.testing.e2e_tool", "show",
                 w.py_func]
      for k, v in w.kwargs.items():
        # The fire module turns underscores in parameter names into hyphens
        # so we convert underscores in parameter names to hyphens
        command.append("--{0}={1}".format(k.replace("_", "-"), v))

      with tempfile.NamedTemporaryFile(delete=False) as hf:
        workflow_file = hf.name

      command.append("--output=" + hf.name)
      env = os.environ.copy()
      env["PYTHONPATH"] = ":".join(extra_py_paths)
      util.run(command, env=env)

      with open(workflow_file) as hf:
        wf_result = yaml.safe_load(hf)

      group, version = wf_result['apiVersion'].split('/')
      k8s_co = k8s_client.CustomObjectsApi()
      workflow_name = wf_result["metadata"]["name"]
      py_func_result = k8s_co.create_namespaced_custom_object(
        group=group,
        version=version,
        namespace=wf_result["metadata"]["namespace"],
        plural='workflows',
        body=wf_result)
      logging.info("Created workflow:\n%s", yaml.safe_dump(py_func_result))

      ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
      ui_urls[workflow_name] = ui_url
      logging.info("URL for workflow: %s", ui_url)

  ui_urls.update(tekton_runner.run(
      tekton_client.ClusterInfo(args.project,
                                TEKTON_CLUSTER_ZONE,
                                TEKTON_CLUSTER_NAME),
      tekton_client.ClusterInfo(args.project, args.zone, args.cluster)))
  # We delay creating started.json until we know the Argo workflow URLs
  create_started_file(args.bucket, ui_urls)

  workflow_success = False
  workflow_phase = {}
  workflow_status_yamls = {}
  results = []
  tekton_results = []
  try:
    results = argo_client.wait_for_workflows(
      get_namespace(args), workflow_names,
      timeout=datetime.timedelta(minutes=180),
      status_callback=argo_client.log_status
    )
    util.configure_kubectl(args.project, "us-east1-d", "kf-ci-v1")
    util.load_kube_config()
    tekton_results = tekton_runner.join()
    workflow_success = True
  except util.ExceptionWithWorkflowResults as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    results = e.workflow_results
    raise
  except Exception as e:
    logging.exception("Other exception: %s", e)
    raise
  finally:
    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()
    prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)

    # Upload workflow status to GCS.
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      workflow_status_yamls[name] = yaml.safe_dump(r, default_flow_style=False)
      if phase != "Succeeded":
        workflow_success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase)

    for wf_name, wf_status in workflow_status_yamls.items():
      util.upload_to_gcs(
        wf_status,
        os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

    for r in tekton_results:
      condition = "Failed"
      name = r.get("metadata", {}).get("name")
      if r.get("status", {}).get("conditions", []):
        condition = r["status"]["conditions"][0].get("reason", "Failed")
      workflow_phase[name] = condition
      workflow_status_yamls[name] = yaml.safe_dump(r, default_flow_style=False)
      if condition != "Succeeded":
        workflow_success = False
      logging.info("Workflow %s/%s finished phase: %s",
                   args.tekton_namespace, name, condition)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts_dir, "build-log.txt"))

    all_tests_success = prow_artifacts.finalize_prow_job(
      args.bucket, workflow_success, workflow_phase, ui_urls)

  return all_tests_success
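The include_dirs check in Example #5 is a nested loop over changed files and glob patterns; the same triggering rule can be written as a small helper (a sketch using fnmatch, mirroring the logic above):

import fnmatch


def should_trigger(changed_files, include_dirs, job_type):
    # Periodic jobs and workflows without an include_dirs stanza always run.
    if job_type == "periodic" or not include_dirs:
        return True
    # Otherwise run only if some changed file matches one of the patterns.
    return any(
        fnmatch.fnmatch(f, d) for f in changed_files for d in include_dirs)


# Example: should_trigger(["kubeflow/foo/bar.py"], ["kubeflow/foo/*"], "presubmit")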
Example #6
def run_papermill_job(
        notebook_path,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image,
        artifacts_gcs="",
        test_target_name=""):
    """Generate a K8s job to run a notebook using papermill

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries
      to infer based on PROW environment variables
    image: The docker image to run the notebook in.
    artifacts_gcs: (Optional) GCS directory where prow artifacts for this test
      should be stored.
    test_target_name: (Optional) Name of the test target; used as a
      subdirectory for the artifacts.
  """

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    if notebook_path.startswith("/"):
        raise ValueError(
            "notebook_path={0} should not start with /".format(notebook_path))

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # We should probably also only do that if the
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()
        logging.info(f"Using repos {repos}")

    if not repos:
        raise ValueError("Could not get repos from prow environment variable "
                         "and --repos isn't explicitly set")

    repos += ",kubeflow/testing@HEAD"

    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    job["spec"]["template"]["spec"]["containers"][0]["image"] = image

    full_notebook_path = os.path.join("/src", notebook_path)
    job["spec"]["template"]["spec"]["containers"][0]["command"] = [
        "python3", "-m", "kubeflow.examples.notebook_tests.execute_notebook",
        "--notebook_path", full_notebook_path
    ]

    job["spec"]["template"]["spec"]["containers"][0][
        "workingDir"] = os.path.dirname(full_notebook_path)

    # The prow bucket to use for results/artifacts
    prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

    if artifacts_gcs:
        prow_dir = os.path.join(artifacts_gcs, "artifacts")
        if test_target_name:
            prow_dir = os.path.join(prow_dir, test_target_name)
        logging.info("Prow artifacts directory: %s", prow_dir)
        prow_bucket, prow_path = util.split_gcs_uri(prow_dir)
    elif os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
        # Running under prow
        prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
        logging.info("Prow artifacts dir: %s", prow_dir)
        prow_dir = os.path.join(prow_dir, "artifacts")

        if os.getenv("TEST_TARGET_NAME"):
            prow_dir = os.path.join(prow_dir,
                                    os.getenv("TEST_TARGET_NAME").lstrip("/"))
        prow_bucket, prow_path = util.split_gcs_uri(prow_dir)

    else:
        prow_path = "notebook-test" + datetime.datetime.now().strftime(
            "%H%M%S")
        prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
        prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

    prow_path = os.path.join(prow_path, name + ".html")
    output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"
        },
    ]

    logging.info("Notebook will be written to %s", output_gcs)
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("notebook-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
    name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    logging.info("*********************Job logs************************")
    logging.info(logs_for_job(PROJECT, name))
    logging.info("*****************************************************")
    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    logging.info("*********************Job logs************************")
    logging.info(logs_for_job(PROJECT, name))
    logging.info("*****************************************************")

    # Download notebook html to artifacts
    logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(NB_BUCKET)
    blob = bucket.get_blob(prow_path)

    destination_bucket = storage_client.get_bucket(prow_bucket)
    bucket.copy_blob(blob, destination_bucket)

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Example #7
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    # Print ksonnet version
    util.run(["ks", "version"])
    if args.release:
        generate_env_from_head(args)
    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    if args.app_dir and args.component:
        # TODO(jlewi): We can get rid of this branch once all repos are using a prow_config.xml file.
        workflows.append(
            WorkflowComponent("legacy", args.app_dir, args.component, {}))
    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too large.
        # Workflow name should not be more than 63 characters because it's used
        # as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        job_type = os.getenv("JOB_TYPE")
        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since prow should
        # vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)
        # Create a new environment for this run
        env = workflow_name

        util.run(["ks", "env", "add", env], cwd=w.app_dir)

        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "name",
            workflow_name
        ],
                 cwd=w.app_dir)

        # Set the prow environment variables.
        prow_env = []

        names = [
            "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
            "REPO_NAME"
        ]
        names.sort()
        for v in names:
            if not os.getenv(v):
                continue
            prow_env.append("{0}={1}".format(v, os.getenv(v)))

        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "prow_env",
            ",".join(prow_env)
        ],
                 cwd=w.app_dir)
        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "namespace",
            get_namespace(args)
        ],
                 cwd=w.app_dir)
        util.run([
            "ks", "param", "set", "--env=" + env, w.component, "bucket",
            args.bucket
        ],
                 cwd=w.app_dir)
        if args.release:
            util.run([
                "ks", "param", "set", "--env=" + env, w.component,
                "versionTag",
                os.getenv("VERSION_TAG")
            ],
                     cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it easier to verify in
        # the unittest.
        param_names = sorted(w.params.keys())
        for k in param_names:
            util.run([
                "ks", "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])
            ],
                     cwd=w.app_dir)

        # For debugging print out the manifest
        util.run(["ks", "show", env, "-c", w.component], cwd=w.app_dir)
        util.run(["ks", "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            api_client,
            get_namespace(args),
            workflow_names,
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.error("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
Example #8
def test_xgboost_synthetic(
        record_xml_attribute,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image):
    '''Generate Job and submit.'''
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # We should probably also only do that if the
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]
    job["spec"]["template"]["spec"]["containers"][0]["image"] = image
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Example #9
def get_credentials(project="kubeflow-ci-deployment",
                    pattern=DEFAULT_PATTERN,
                    location=None,
                    output="",
                    testing_label=None):
    """Get the latest deployment information and use it to get credentials.

  Args:
    project: string, Name of deployed GCP project.
    pattern: Regex pattern to look for.
    location: zone or region to search for clusters.
    output: (Optional) if supplied write information about matching
      cluster to this YAML file.
    testing_label: string, annotation used to identify testing clusters. Optional.
  """
    logging.info(
        "Calling get_credentials - this call needs the gcloud CLI.")
    util.maybe_activate_service_account()

    command = [
        "gcloud", "container", "clusters", "get-credentials",
        "--project=" + project
    ]

    info = {
        "project": project,
        "location": location,
    }

    if location:
        c = _get_latest_cluster(project, location, pattern)

        if not c:
            message = (
                "No clusters found matching: project: {0}, location: {1}, "
                "pattern: {2}").format(project, location, pattern)
            raise ValueError(message)

        if ZONE_PATTERN.match(location):
            command.append("--zone=" + location)
        else:
            command.append("--region=" + location)
        command.append(c["name"])

        info["cluster"] = c

    else:
        # This is the pre-blueprint path, which uses Deployment Manager.
        logging.warning(
            "Invoking deprecated path because location not set")
        dm = get_latest(project=project,
                        testing_label=testing_label,
                        base_name=pattern,
                        field="all")
        # list.append takes a single element; use extend for the two values.
        command.extend(["--zone=" + dm["zone"], dm["name"]])

        info["cluster"] = dm

    if output:
        logging.info(f"Writing cluster information to {output}")
        with open(output, "w") as hf:
            yaml.dump(info, hf)

    # This call may be flaky due to timeout.
    @retry(stop_max_attempt_number=10, wait_fixed=5000)
    def run_get_credentials():
        util.run(command)

    run_get_credentials()
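get_credentials wraps the flaky gcloud call in a retry decorator; the same pattern shown standalone with the retrying package (project, zone and cluster names here are placeholders):

import subprocess

from retrying import retry


# Retry up to 10 times, waiting 5 seconds between attempts, because the
# get-credentials call can time out transiently.
@retry(stop_max_attempt_number=10, wait_fixed=5000)
def run_get_credentials(command):
    subprocess.check_call(command)


if __name__ == "__main__":
    run_get_credentials([
        "gcloud", "container", "clusters", "get-credentials",
        "--project=my-project", "--zone=us-east1-d", "my-cluster"
    ])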
Example #10
def main():
    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("--project",
                        default="kubeflow-ci",
                        type=str,
                        help=("The project."))

    parser.add_argument("--max_age_hours",
                        default=3,
                        type=int,
                        help=("The age of deployments to gc."))

    subparsers = parser.add_subparsers()

    ######################################################
    # Parser for everything
    parser_all = subparsers.add_parser("all", help="Cleanup everything")

    add_deployments_args(parser_all)
    add_workflow_args(parser_all)

    parser_all.set_defaults(func=cleanup_all)

    ######################################################
    # Parser for argo_workflows
    parser_argo = subparsers.add_parser("workflows", help="Cleanup workflows")

    add_workflow_args(parser_argo)
    parser_argo.set_defaults(func=cleanup_workflows)

    ######################################################
    # Parser for endpoints
    parser_endpoints = subparsers.add_parser("endpoints",
                                             help="Cleanup endpoints")

    parser_endpoints.set_defaults(func=cleanup_endpoints)

    ######################################################
    # Parser for service accounts
    parser_service_account = subparsers.add_parser(
        "service_accounts", help="Cleanup service accounts")

    parser_service_account.set_defaults(func=cleanup_service_accounts)

    ######################################################
    # Parser for deployments
    parser_deployments = subparsers.add_parser("deployments",
                                               help="Cleanup deployments")

    add_deployments_args(parser_deployments)
    parser_deployments.set_defaults(func=cleanup_deployments)
    args = parser.parse_args()

    util.maybe_activate_service_account()
    args.func(args)
Example #11
def test_xgboost_synthetic(
        record_xml_attribute,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image,
        notebook_artifacts_dir):
    '''Generate Job and submit.'''
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # We should probably also only do that if the
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    repos += ",kubeflow/testing@HEAD"
    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    nb_bucket = "kubeflow-ci-deployment"
    nb_path = os.path.join("xgboost_synthetic_testing", os.getenv("JOB_TYPE"),
                           os.getenv("HOSTNAME"), "notebook.html")
    output_gcs = util.to_gcs_uri(nb_bucket, nb_path)
    logging.info("Tested notebook will be outputed to: %s", output_gcs)
    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py"
        },
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
    ]
    job["spec"]["template"]["spec"]["containers"][0]["image"] = image
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    # Download notebook html to artifacts
    notebook_artifacts_path = os.path.join(notebook_artifacts_dir,
                                           "notebook.html")
    logging.info("Writing notebook artifact to: %s", notebook_artifacts_path)
    os.makedirs(notebook_artifacts_dir, exist_ok=True)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(nb_bucket)
    blob = bucket.get_blob(nb_path)
    blob.download_to_filename(notebook_artifacts_path)

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Example #12
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()

    parser.add_argument("--project",
                        default="kubeflow-ci-deployment",
                        type=str,
                        help=("The project."))

    parser.add_argument("--zone",
                        default="us-east1-d",
                        type=str,
                        help=("The zone to deploy in."))
    parser.add_argument(
        "--oauth_file",
        default=("gs://kubeflow-ci-deployment_kf-data/"
                 "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
        type=str,
        help=("The file containing the OAuth client ID & secret"
              "for IAP."))

    # TODO(jlewi): Should rename this argument to something like kfctl_src
    # We should try to do it in a backwards compatible way.
    parser.add_argument(
        "--kubeflow_repo",
        default="/src/kubeflow/kubeflow",
        type=str,
        help=("Path to the source for kfctl. Should be the directory "
              "containing the Makefile to build kfctl"))

    parser.add_argument("--kfctl_path",
                        default="",
                        type=str,
                        help=("Path to kfctl; can be a URL."))

    parser.add_argument(
        "--kfctl_config",
        default=("https://raw.githubusercontent.com/kubeflow/manifests"
                 "/master/kfdef/kfctl_gcp_iap.yaml"),
        type=str,
        help=("Path to the kfctl config to use"))

    parser.add_argument("--apps_dir",
                        default=os.getcwd(),
                        type=str,
                        help=("Directory to store kubeflow apps."))

    parser.add_argument(
        "--name",
        type=str,
        default="kf-vmaster-{uid}",
        help=("Name for the deployment. This can be a python format string "
              "with the variable uid. Uid will automatically be substituted "
              "for a unique value based on the time."))

    parser.add_argument(
        "--email",
        type=str,
        default="",
        help=("(Optional). Email of the person to create the default profile"
              "for. If not specificied uses the gcloud config value."))

    parser.add_argument(
        "--extra_users",
        type=str,
        default="",
        help=("Comma separated list of additional users to grant access. "
              "Should be in the form user:[email protected] or"
              "serviceAccount:[email protected]"))

    parser.add_argument("--setup_project",
                        dest="setup_project",
                        action="store_true",
                        help="Setup the project")
    parser.add_argument("--no-setup_project",
                        dest="setup_project",
                        action="store_false",
                        help="Do not setup the project")
    parser.set_defaults(setup_project=True)

    parser.add_argument("--use_self_cert",
                        dest="use_self_cert",
                        action="store_true",
                        help="Use a self signed certificate")
    parser.add_argument("--no-use_self_cert",
                        dest="use_self_cert",
                        action="store_false",
                        help="Do not use a self signed certificate")
    parser.set_defaults(use_self_cert=True)

    args = parser.parse_args()

    util.maybe_activate_service_account()

    # For debugging purposes output the command
    util.run(["gcloud", "config", "list"])
    util.run(["gcloud", "auth", "list"])

    bucket, blob_path = util.split_gcs_uri(args.oauth_file)

    client = storage.Client(project=args.project)
    bucket = client.get_bucket(bucket)

    blob = bucket.get_blob(blob_path)
    contents = blob.download_as_string()

    oauth_info = yaml.safe_load(contents)

    if args.kubeflow_repo and args.kfctl_path:
        raise ValueError(
            "Exactly one of --kubeflow_repo and --kfctl_path needs "
            "to be set.")

    if not args.kubeflow_repo and not args.kfctl_path:
        raise ValueError(
            "Exactly one of --kubeflow_repo and --kfctl_path needs "
            "to be set.")

    git_describe = ""
    if args.kubeflow_repo:
        git_describe = util.run(
            ["git", "describe", "--tags", "--always", "--dirty"],
            cwd=args.kubeflow_repo).strip("'")

        kfctl_path = build_kfctl_go(args)
    else:
        if args.kfctl_path.startswith("http"):
            temp_dir = tempfile.mkdtemp()
            util.run(["curl", "-L", "-o", "kfctl.tar.gz", args.kfctl_path],
                     cwd=temp_dir)
            util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)
            kfctl_path = os.path.join(temp_dir, "kfctl")
            git_describe = util.run([kfctl_path, "version"])
        else:
            kfctl_path = args.kfctl_path

    logging.info("kfctl path set to %s", kfctl_path)

    # We need to keep the name short to avoid hitting limits with certificates.
    uid = datetime.datetime.now().strftime("%m%d") + "-"
    uid = uid + uuid.uuid4().hex[0:3]

    args.name = args.name.format(uid=uid)
    logging.info("Using name %s", args.name)

    app_dir = os.path.join(args.apps_dir, args.name)

    if not os.path.exists(args.apps_dir):
        os.makedirs(args.apps_dir)

    env = {}
    env.update(os.environ)
    env.update(oauth_info)

    # GCP labels can only take as input alphanumeric characters, hyphens, and
    # underscores. Replace invalid characters with hyphens.
    labels = {
        "git": git_describe,
        "purpose": "kf-test-cluster",
    }

    for k, v in labels.items():
        val = v.lower().replace("\"", "")
        val = re.sub(r"[^a-z0-9\-_]", "-", val)
        labels[k] = val

    deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
    add_extra_users(args.project, args.extra_users)
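
Example #12 splits args.oauth_file with util.split_gcs_uri before downloading it. Assuming that helper simply separates the bucket from the object path, a minimal sketch could look like this (the real implementation is in kubeflow.testing.util):

def split_gcs_uri_sketch(gcs_uri):
    """Split gs://bucket/path/to/object into (bucket, path/to/object)."""
    if not gcs_uri.startswith("gs://"):
        raise ValueError("Not a GCS URI: {0}".format(gcs_uri))
    bucket, _, blob_path = gcs_uri[len("gs://"):].partition("/")
    return bucket, blob_path


# For the default --oauth_file above this would return
# ("kubeflow-ci-deployment_kf-data",
#  "kf-iap-oauth.kubeflow-ci-deployment.yaml").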
Example #13
0
def main():
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
    "--project", default="kubeflow-ci", type=str, help=("The project."))

  # The values prefixed with testing_ refer to the test cluster where the
  # Argo workflows run. In contrast --project is the project where the tests
  # spin up Kubeflow instances.
  parser.add_argument(
    "--testing_project", default="kubeflow-ci", type=str,
    help=("The cluster used for Argo workflows."))

  parser.add_argument(
    "--testing_cluster", default="kubeflow-testing", type=str,
    help=("The cluster used for Argo workflows."))

  parser.add_argument(
    "--testing_zone", default="us-east1-d", type=str,
    help=("The zone of the cluster used for Argo workflows."))

  parser.add_argument(
    "--max_age_hours", default=3, type=int, help=("The age of deployments to gc."))

  parser.add_argument(
    "--gc_backend_services", default=False, type=bool,
    help=("""Whether to GC backend services."""))

  parser.add_argument(
    "--max_wf_age_hours", default=7*24, type=int,
    help=("How long to wait before garbage collecting Argo workflows."))

  parser.add_argument('--dryrun', dest='dryrun', action='store_true')
  parser.add_argument('--no-dryrun', dest='dryrun', action='store_false')
  parser.set_defaults(dryrun=False)

  subparsers = parser.add_subparsers()

  ######################################################
  # Parser for everything
  parser_all = subparsers.add_parser(
    "all", help="Cleanup everything")

  add_deployments_args(parser_all)
  add_workflow_args(parser_all)

  parser_all.set_defaults(func=cleanup_all)

  ######################################################
  # Parser for argo_workflows
  parser_argo = subparsers.add_parser(
    "workflows", help="Cleanup workflows")

  add_workflow_args(parser_argo)
  parser_argo.set_defaults(func=cleanup_workflows)

  ######################################################
  # Parser for endpoints
  parser_endpoints = subparsers.add_parser(
    "endpoints", help="Cleanup endpoints")

  parser_endpoints.set_defaults(func=cleanup_endpoints)

  ######################################################
  # Parser for firewallrules
  parser_firewall = subparsers.add_parser(
    "firewall", help="Cleanup firewall rules")

  parser_firewall.set_defaults(func=cleanup_firewall_rules)


  ######################################################
  # Parser for health checks
  parser_health = subparsers.add_parser(
    "health_checks", help="Cleanup health checks")

  parser_health.set_defaults(func=cleanup_health_checks)

  ######################################################
  # Parser for service accounts
  parser_service_account = subparsers.add_parser(
    "service_accounts", help="Cleanup service accounts")

  parser_service_account.set_defaults(func=cleanup_service_accounts)

  ######################################################
  # Parser for service account bindings
  parser_service_account = subparsers.add_parser(
    "service_account_bindings", help="Cleanup service account bindings")

  parser_service_account.set_defaults(func=cleanup_service_account_bindings)

  ######################################################
  # Parser for certificates
  parser_certificates = subparsers.add_parser(
    "certificates", help="Cleanup certificates")

  parser_certificates.set_defaults(func=cleanup_certificates)

  ######################################################
  # Parser for auto deployments
  parser_auto_deployments = subparsers.add_parser(
    "auto_deployments", help="Cleanup auto deployments")

  add_deployments_args(parser_auto_deployments)
  parser_auto_deployments.set_defaults(func=cleanup_auto_deployments)

  ######################################################
  # Parser for deployments
  parser_deployments = subparsers.add_parser(
    "deployments", help="Cleanup deployments")

  add_deployments_args(parser_deployments)
  parser_deployments.set_defaults(func=cleanup_deployments)

  ######################################################
  # Parser for clusters
  parser_clusters = subparsers.add_parser(
    "clusters", help="Cleanup clusters")

  parser_clusters.add_argument(
    "--zones", default="us-east1-d,us-central1-a", type=str,
    help="Comma separated list of zones to check.")

  parser_clusters.set_defaults(func=cleanup_clusters)

  ######################################################
  # Parser for instance groups
  parser_ig = subparsers.add_parser(
      "instance_groups", help="Cleanup instance groups")
  add_deployments_args(parser_ig)
  parser_ig.set_defaults(func=cleanup_instance_groups)

  args = parser.parse_args()

  # Update max age
  MAX_LIFETIME[E2E_INFRA] = datetime.timedelta(hours=args.max_age_hours)

  logging.info("Max lifetime:\n%s", MAX_LIFETIME)

  util.maybe_activate_service_account()

  args.func(args)
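
Example #13 only tunes MAX_LIFETIME from --max_age_hours; the actual garbage-collection decision is an age comparison against each resource's creation timestamp. A hedged sketch of that kind of check (the field handling and the use of python-dateutil are assumptions, not the real cleanup code):

import datetime

from dateutil import parser as date_parser


def is_expired_sketch(creation_timestamp, max_lifetime):
    """Return True if a resource created at creation_timestamp (an RFC 3339
    string, as returned by GCP APIs) is older than max_lifetime."""
    created = date_parser.parse(creation_timestamp)
    age = datetime.datetime.now(created.tzinfo) - created
    return age > max_lifetime


# e.g. is_expired_sketch("2020-01-01T00:00:00Z", datetime.timedelta(hours=3))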
Example #14
0
    def all(
        self,
        build_project,
        registry_project,
        remote_fork,  # pylint: disable=too-many-statements,too-many-branches
        add_github_host=False):
        """Build the latest image and update the prototype.

    Args:
      build_project: GCP project used to build the image.
      registry_project: GCP project used to host the image.
      remote_fork: URL of the remote fork used to create the PR;
        e.g. git@github.com:jlewi/kubeflow.git. Currently only ssh is
        supported.
      add_github_host: If true, add the GitHub ssh host key to the known
        ssh hosts.
    """
        repo = git.Repo(self._root_dir())
        util.maybe_activate_service_account()
        last_commit = self.last_commit

        # Ensure github.com is in the known hosts
        if add_github_host:
            output = util.run(["ssh-keyscan", "github.com"])
            with open(os.path.join(os.getenv("HOME"), ".ssh", "known_hosts"),
                      mode='a') as hf:
                hf.write(output)

        if not remote_fork.startswith("git@github.com"):
            raise ValueError("Remote fork currently only supports ssh")

        remote_repo = self._find_remote_repo(repo, remote_fork)

        if not remote_repo:
            fork_name = remote_fork.split(":", 1)[-1].split("/", 1)[0]
            logging.info("Adding remote %s=%s", fork_name, remote_fork)
            remote_repo = repo.create_remote(fork_name, remote_fork)

        logging.info("Last change to components-jupyter-web-app was %s",
                     last_commit)

        base = "gcr.io/{0}/jupyter-web-app".format(registry_project)

        # Check if there is already an image tagged with this commit.
        image = base + ":" + self.last_commit
        transport = transport_pool.Http(httplib2.Http)
        src = docker_name.from_string(image)
        creds = docker_creds.DefaultKeychain.Resolve(src)

        image_exists = False
        try:
            with v2_2_image.FromRegistry(src, creds, transport) as src_image:
                logging.info("Image %s exists; digest: %s", image,
                             src_image.digest())
                image_exists = True
        except docker_http.V2DiagnosticException as e:
            if e.status == 404:
                logging.info("%s doesn't exist", image)
            else:
                raise

        if not image_exists:
            logging.info("Building the image")
            image = self.build_image(build_project, registry_project)
            logging.info("Created image: %s", image)
        else:
            logging.info("Image %s already exists", image)

        # We should check what the current image is and not update it
        # if it is already the existing image.
        prototype_file = self.update_prototype(image)

        if not prototype_file:
            logging.info("Prototype not updated so not creating a PR.")
            return

        branch_name = "update_jupyter_{0}".format(last_commit)

        if repo.active_branch.name != branch_name:
            logging.info("Creating branch %s", branch_name)

            branch_names = [b.name for b in repo.branches]
            if branch_name in branch_names:
                logging.info("Branch %s exists", branch_name)
                util.run(["git", "checkout", branch_name],
                         cwd=self._root_dir())
            else:
                util.run(["git", "checkout", "-b", branch_name],
                         cwd=self._root_dir())

        if self._check_if_pr_exists(commit=last_commit):
            # Since a PR already exists updating to the specified commit
            # don't create a new one.
            # We don't want to just push -f because if the PR already exists
            # git push -f will retrigger the tests.
            # To force a recreate of the PR someone could close the existing
            # PR and a new PR will be created on the next cron run.
            return

        logging.info("Add file %s to repo", prototype_file)
        repo.index.add([prototype_file])
        repo.index.commit(
            "Update the jupyter web app image to {0}".format(image))

        util.run(["git", "push", "-f", remote_repo.name], cwd=self._root_dir())

        self.create_pull_request(commit=last_commit)
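
Example #14 looks up an existing git remote with self._find_remote_repo before creating one. Assuming that helper just matches the remote URL against the repo's configured remotes, a GitPython sketch could be:

import git


def find_remote_repo_sketch(repo, remote_url):
    """Return the git.Remote whose URL matches remote_url, or None."""
    for remote in repo.remotes:
        if remote_url in list(remote.urls):
            return remote
    return None


# e.g.
# repo = git.Repo("/path/to/checkout")
# remote = find_remote_repo_sketch(repo, "git@github.com:jlewi/kubeflow.git")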
Example #15
0
def run_papermill_job(
        notebook_path,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        image,
        output=""):
    """Generate a K8s job to run a notebook using papermill

  Args:
    notebook_path: Path to the notebook.
    name: Name for the K8s job
    namespace: The namespace where the job should run.
    image: The docker image to run the notebook in.
    output: Location where artifacts like the rendered notebook
      should be uploaded. Should generally be an object storage path.
      Currently only GCS is supported.
  """

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    job["spec"]["template"]["spec"]["containers"][0]["image"] = image

    job["spec"]["template"]["spec"]["containers"][0]["command"] = [
        "python3", "-m", "kubeflow.testing.notebook_tests.execute_notebook",
        "--notebook_path", notebook_path
    ]

    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "OUTPUT_GCS",
            "value": output
        },
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py"
        },
    ]

    logging.info("Notebook will be written to %s", output)
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("notebook-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
    name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    logging.info("*********************Job logs************************")
    logging.info(logs_for_job(PROJECT, name))
    logging.info("*****************************************************")
    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    logging.info("*********************Job logs************************")
    logging.info(logs_for_job(PROJECT, name))
    logging.info("*****************************************************")

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Example #16
0
def run(args, file_handler):
    create_started_file(args.bucket)

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    # Create the name for the workflow
    # We truncate SHAs to prevent the workflow name from being too large.
    # The workflow name should not be more than 63 characters because it is
    # used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME")
    job_type = os.getenv("JOB_TYPE")
    if job_type == "presubmit":
        workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
        workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
        workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since Prow should
    # already vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    # Create a new environment for this run
    env = workflow_name

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "name",
        workflow_name
    ],
             cwd=args.app_dir)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    # Set the prow environment variables.
    prow_env = []

    names = [
        "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
        "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
    ]
    names.sort()
    for v in names:
        if not os.getenv(v):
            continue
        prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "prow_env",
        ",".join(prow_env)
    ],
             cwd=args.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "namespace",
        NAMESPACE
    ],
             cwd=args.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, args.component, "bucket",
        args.bucket
    ],
             cwd=args.app_dir)

    # For debugging print out the manifest
    util.run(["ks", "show", env, "-c", args.component], cwd=args.app_dir)
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    ui_url = (
        "http://testing-argo.kubeflow.io/timeline/kubeflow-test-infra/{0}"
        ";tab=workflow".format(workflow_name))
    logging.info("URL for workflow: %s", ui_url)
    success = False
    try:
        results = argo_client.wait_for_workflow(
            api_client,
            NAMESPACE,
            workflow_name,
            status_callback=argo_client.log_status)
        if results["status"]["phase"] == "Succeeded":
            success = True
        logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                     workflow_name, results["status"]["phase"])
    except util.TimeoutError:
        success = False
        logging.error("Time out waiting for Workflow %s/%s to finish",
                      NAMESPACE, workflow_name)
    finally:
        create_finished_file(args.bucket, success)

        # Upload logs to GCS. No logs emitted after this point will appear in
        # the file in GCS.
        file_handler.flush()
        upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
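
Example #16 finishes by pushing the build log to GCS with upload_file_to_gcs and a path from prow_artifacts. A minimal sketch of such an upload helper with google-cloud-storage, assuming the destination is a gs:// URI (the real helper's signature may differ):

from google.cloud import storage


def upload_file_to_gcs_sketch(local_path, gcs_uri):
    """Upload local_path to a gs://bucket/object destination."""
    bucket_name, _, blob_path = gcs_uri[len("gs://"):].partition("/")
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    bucket.blob(blob_path).upload_from_filename(local_path)


# e.g. upload_file_to_gcs_sketch("/tmp/build-log.txt",
#                                "gs://some-bucket/logs/build-log.txt")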