Ejemplo n.º 1
0
def ks_deploy(app_dir, component, params, env=None, account=None, namespace=None):
  """Deploy the specified ksonnet component.

  Args:
    app_dir: The ksonnet directory
    component: Name of the component to deploy
    params: A dictionary of parameters to set; can be empty but should not be
      None.
    env: (Optional) The environment to use, if none is specified a new name
      is generated. The environment is always added with `ks env add`.
    account: (Optional) The account to use.
    namespace: (Optional) The namespace to use when adding the environment
  Raises:
    ValueError: If input arguments aren't valid.
  """
  if not component:
    raise ValueError("component can't be None.")

  # TODO(jlewi): It might be better if the test creates the app and uses
  # the latest stable release of the ksonnet configs. That however will cause
  # problems when we make changes to the TFJob operator that require changes
  # to the ksonnet configs. One advantage of checking in the app is that
  # we can modify the files in vendor if needed so that changes to the code
  # and config can be submitted in the same pr.
  if not env:
    # Timestamp plus a short random suffix keeps generated names unique.
    now = datetime.datetime.now()
    env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  logging.info("Using app directory: %s", app_dir)

  add_env_command = ["ks", "env", "add", env]
  if namespace:
    add_env_command.append("--namespace=" + namespace)
  util.run(add_env_command, cwd=app_dir)

  # dict.items() exists on both Python 2 and 3; iteritems() is Python 2 only.
  for k, v in params.items():
    util.run(
      ["ks", "param", "set", "--env=" + env, component, k, v], cwd=app_dir)

  apply_command = ["ks", "apply", env, "-c", component]
  if account:
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)
Ejemplo n.º 2
0
def generate_env_from_head(args):
    """Populate build-related environment variables from the repo's HEAD.

    Runs ``git rev-parse HEAD`` in ``<args.repos_dir>/$REPO_OWNER/$REPO_NAME``
    and derives PULL_BASE_SHA, BUILD_NUMBER and VERSION_TAG from the result.
    A variable that already has a non-empty value in the environment is left
    untouched.

    Args:
      args: Object exposing a ``repos_dir`` attribute.
    """
    repo_dir = os.path.join(args.repos_dir, os.getenv("REPO_OWNER"),
                            os.getenv("REPO_NAME"))
    head = util.run(["git", "rev-parse", "HEAD"], cwd=repo_dir)

    # Only the first 8 characters of the commit are used.
    short_sha = head[0:8]
    today = datetime.datetime.now().strftime("%Y%m%d")
    defaults = {
        "PULL_BASE_SHA": short_sha,
        "BUILD_NUMBER": uuid.uuid4().hex[0:4],
        "VERSION_TAG": "v{0}-{1}".format(today, short_sha),
    }

    for name, value in defaults.items():
        if not os.getenv(name):
            os.environ[name] = value
Ejemplo n.º 3
0
def setup_kubeflow_ks_app(args, api_client):
  """Create a ksonnet app for Kubeflow.

  Initializes a ksonnet app named "kubeflow-test" under args.test_dir, adds
  the kubeflow registry, installs the required packages and then replaces the
  vendored kubeflow directory with a symlink to the checked out source.

  Args:
    args: Parsed arguments; reads test_dir, namespace and github_token.
    api_client: K8s API client passed through to _setup_test.

  Returns:
    app_dir: Path of the ksonnet app that was created.
  """
  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = args.namespace

  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  if args.github_token:
    # The token is a secret; never write its value to the logs.
    logging.info("Setting GITHUB_TOKEN from the supplied github_token "
                 "argument.")
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

  if not os.getenv("GITHUB_TOKEN"):
    logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                    "limits.")
  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  return app_dir
Ejemplo n.º 4
0
def run(test_files_dirs, flake8_path_args, test_case):
  """Run flake8 on every testable file and record failures on test_case.

  Args:
    test_files_dirs: Iterable of directories; each is walked recursively.
    flake8_path_args: Not used by this function; kept for interface
      compatibility.
    test_case: Object exposing add_failure_info(msg); called once for every
      file whose check fails or raises.
  """
  # Go through each Python file in test_files_dirs and run flake8
  for test_files_dir in test_files_dirs:
    for root, _, files in os.walk(test_files_dir):
      for test_file in files:
        full_path = os.path.join(root, test_file)
        if not should_test(full_path):
          continue

        logging.info("Testing: %s", test_file)
        try:
          output = util.run(['flake8', full_path] + FLAKE8_OPTS, cwd=root)
          try:
            parsed = json.loads(output)
          except (ValueError, TypeError, AttributeError):
            # json.loads signals malformed input with ValueError and a
            # non-string input with TypeError.
            logging.error(
                "Output of flake8 could not be parsed as json; "
                "output: %s", output)
            parsed = {}

          if not hasattr(parsed, "get"):
            # Legacy style tests emit true rather than a json object.
            # Parsing the string as json converts it to a bool so we
            # just use parsed as test_passed
            # Old style tests actually use std.assert so flake8 will
            # actually return an error in the case the test did
            # not pass.
            logging.warning(
                "flake8 is using old style and not emitting an object. "
                "Result was: %s. Output will be treated as a boolean", output)
            test_passed = parsed
          else:
            test_passed = parsed.get("pass", False)

          if not test_passed:
            msg = '{} test failed'.format(test_file)
            test_case.add_failure_info(msg)
            logging.error('%s. See Subprocess output for details.', msg)
        except Exception as e:  # pylint: disable=broad-except
          # Any failure while checking one file is recorded and the loop
          # moves on to the next file.
          msg = '{} test failed'.format(test_file)
          test_case.add_failure_info(msg)
          logging.error('%s with exception %s. See Subprocess output for '
                        'details.', msg, e)
Ejemplo n.º 5
0
def run(test_files_dirs, jsonnet_path_args, test_case):
  """Evaluate every testable jsonnet file and record failures on test_case.

  Args:
    test_files_dirs: Iterable of directories; each is walked recursively.
    jsonnet_path_args: List of extra arguments appended to `jsonnet eval`.
    test_case: Object exposing add_failure_info(msg); called once for every
      file whose evaluation fails or raises.
  """
  # Go through each jsonnet file in test_files_dirs and run jsonnet eval
  for test_files_dir in test_files_dirs:
    for root, _, files in os.walk(test_files_dir):
      for test_file in files:
        full_path = os.path.join(root, test_file)
        if not should_test(full_path):
          continue

        logging.info("Testing: %s", test_file)
        try:
          output = util.run(
            ['jsonnet', 'eval', full_path] + jsonnet_path_args,
            cwd=os.path.dirname(full_path))
          try:
            parsed = json.loads(output)
          except (ValueError, TypeError, AttributeError):
            # json.loads signals malformed input with ValueError and a
            # non-string input with TypeError.
            logging.error(
              "Output of jsonnet eval could not be parsed as json; "
              "output: %s", output)
            parsed = {}

          if not hasattr(parsed, "get"):
            # Legacy style tests emit true rather than a json object.
            # Parsing the string as json converts it to a bool so we
            # just use parsed as test_passed
            # Old style tests actually use std.assert so jsonnet eval
            # will actually return an error in the case the test didn't
            # pass.
            logging.warning(
              "jsonnet test is using old style and not emitting an object. "
              "Result was: %s. Output will be treated as a boolean", output)
            test_passed = parsed
          else:
            test_passed = parsed.get("pass", False)

          if not test_passed:
            test_case.add_failure_info('{} test failed'.format(test_file))
            logging.error('%s test failed. See Subprocess output for details.',
                          test_file)
        except Exception as e:  # pylint: disable=broad-except
          # Any failure while evaluating one file is recorded and the loop
          # moves on to the next file.
          test_case.add_failure_info('{} test failed'.format(test_file))
          logging.error('%s test failed with exception %s. '
                        'See Subprocess output for details.', test_file, e)
Ejemplo n.º 6
0
def test_profiles():
    """Apply the sample profile, check its namespace and RBAC, then delete it.

    Steps:
      1. Activate the service account if GOOGLE_APPLICATION_CREDENTIALS is set.
      2. kubectl-apply sample_profile.yaml from this file's directory.
      3. Read the "john" namespace (with retries), its "edit" role and its
         "default" role binding.
      4. kubectl-delete the profile and expect reading the namespace to raise
         ApiException.
    """
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        logging.info("Activate service account")
        activate_cmd = [
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + key_file,
        ]
        util.run(activate_cmd)

    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    this_dir = os.path.dirname(__file__)
    util.run(["kubectl", "apply", "-f", "sample_profile.yaml"], cwd=this_dir)

    # TODO: check CR status/condition instead of sleep
    # conditions = ["Ready"]
    # namespace = "kubeflow"
    # name = "john"
    # results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
    #                                      namespace, name, conditions)
    # logging.info("Result of CRD:\n%s", results)
    time.sleep(10)

    # The profile name doubles as the name of the namespace it creates.
    name = "john"
    coreV1 = k8s_client.CoreV1Api(api_client)

    # Exponential backoff: 2^i * 1000 ms on the i-th retry, capped at 60 s.
    with_backoff = retry(wait_exponential_multiplier=1000,
                         wait_exponential_max=60000)
    read_namespace_with_retry = with_backoff(coreV1.read_namespace)
    found_namespace = read_namespace_with_retry(name)
    logging.info("found namespace: %s", found_namespace)

    rbacV1 = k8s_client.RbacAuthorizationV1Api(api_client)
    role = rbacV1.read_namespaced_role("edit", name)
    logging.info("role: %s", role)
    role_binding = rbacV1.read_namespaced_role_binding("default", name)
    logging.info("role binding: %s", role_binding)

    # Delete the profile; afterwards the namespace lookup must fail.
    util.run(["kubectl", "delete", "-f", "sample_profile.yaml"], cwd=this_dir)
    time.sleep(15)

    with pytest.raises(ApiException) as e:
        coreV1.read_namespace(name)
    logging.info("exception info: %s", e)
Ejemplo n.º 7
0
    def _check_if_pr_exists(self, commit=None):
        """Check if a PR is already open.

        Lists the PRs reported by `hub pr list` in self.manifests_repo_dir
        and looks for one whose title equals self._pr_title(commit).

        Args:
          commit: (Optional) The commit to check for; defaults to
            self.last_commit.

        Returns:
          exists: True if a PR updating the image to the specified commit
            already exists and False otherwise.
        """
        # TODO(jlewi): Modeled on
        # https://github.com/kubeflow/examples/blob/master/code_search/docker/ks/update_index.sh
        # TODO(jlewi): We should use the GitHub API and check if there is an
        # existing open pull request. Or potentially just use the hub CLI.

        if not commit:
            commit = self.last_commit
            logging.info("No commit specified defaulting to %s", commit)

        pr_title = self._pr_title(commit)

        # See hub conventions:
        # https://hub.github.com/hub.1.html
        # The GitHub repository is determined automatically based on the name
        # of remote repositories
        output = util.run(["hub", "pr", "list", "--format=%U;%t\n"],
                          cwd=self.manifests_repo_dir)

        # Map PR title -> PR URL. Each line is expected to be "<url>;<title>".
        prs = {}
        for line in output.splitlines():
            url, sep, title = line.partition(";")
            if not sep:
                # Blank or unexpected line (no separator); nothing to record.
                continue
            prs[title] = url

        if pr_title in prs:
            logging.info(
                "PR %s already exists to update the Jupyter web app image "
                "to %s", prs[pr_title], commit)
            return True

        return False
Ejemplo n.º 8
0
    def test_serve(self):
        """Deploy the serving component with kustomize and wait for it.

        Downloads kustomize, applies each "key=value" pair from self.params to
        the kustomization in self.app_dir, pipes the build into
        `kubectl apply` and then waits for deployment self.name in
        self.namespace.
        """
        api_client = k8s_client.ApiClient()

        # TODO (jinchihe) the download below will be removed once a new
        # test-worker image is published in
        # https://github.com/kubeflow/testing/issues/373.
        kustomize_bin = '/usr/local/bin/kustomize'
        kusUrl = ('https://github.com/kubernetes-sigs/kustomize/'
                  'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64')
        util.run(['wget', '-O', kustomize_bin, kusUrl], cwd=self.app_dir)
        util.run(['chmod', 'a+x', kustomize_bin], cwd=self.app_dir)

        # Apply the components: the namespace is set directly, every other
        # parameter becomes a literal in the serving configmap.
        configmap = 'mnist-map-serving'
        for pair in self.params.split(","):
            key, value = pair.split("=", 1)
            if key == "namespace":
                edit_cmd = ['kustomize', 'edit', 'set', key, value]
            else:
                literal = '--from-literal=' + key + '=' + value
                edit_cmd = ['kustomize', 'edit', 'add', 'configmap',
                            configmap, literal]
            util.run(edit_cmd, cwd=self.app_dir)

        # Seems the util.run cannot handle pipes case, using check_call.
        pipeline = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
        subprocess.check_call(pipeline, shell=True)

        util.wait_for_deployment(api_client, self.namespace, self.name,
                                 timeout_minutes=4)
Ejemplo n.º 9
0
def deploy_with_kfctl_go(kfctl_path, args, app_dir, env):
    """Deploy Kubeflow using kfctl go binary.

    Runs `kfctl init`, `kfctl generate all` and `kfctl apply all` in turn.

    Args:
      kfctl_path: Path to the kfctl binary.
      args: Parsed arguments; reads project and zone.
      app_dir: Directory of the kfctl app.
      env: Environment passed to every kfctl invocation.

    Raises:
      ValueError: If the active gcloud account can't be determined.
    """
    # username and password are passed as env vars and won't appear in the logs
    #
    # TODO(https://github.com/kubeflow/kubeflow/issues/2831): We should be
    # loading the config in the repo we have checked out kfctl doesn't support
    # specifying a file URI. Once it does we should change --version to
    # use it.
    #
    # TODO(zhenghuiwang): use the master of kubeflow/manifests once
    # https://github.com/kubeflow/kubeflow/issues/3475 is fixed.
    logging.warning("Loading configs from master.")
    init_cmd = [
        kfctl_path, "init", app_dir, "-V",
        "--platform=gcp",
        "--version=master",
        "--package-manager=kustomize",
        "--skip-init-gcp-project",
        "--disable_usage_report",
        "--use_istio",
        "--project=" + args.project,
    ]
    util.run(init_cmd, env=env)

    # We need to specify a valid email because
    #  1. We need to create appropriate RBAC rules to allow the current user
    #     to create the required K8s resources.
    #  2. Setting the IAM policy will fail if the email is invalid.
    # TODO(jlewi): kfctl should eventually do this automatically.
    email = util.run(["gcloud", "config", "get-value", "account"])
    if not email:
        raise ValueError("Could not determine GCP account being used.")

    generate_cmd = [
        kfctl_path, "generate", "-V", "all",
        "--email=" + email,
        "--zone=" + args.zone,
    ]
    util.run(generate_cmd, env=env, cwd=app_dir)

    apply_cmd = [kfctl_path, "apply", "-V", "all"]
    util.run(apply_cmd, env=env, cwd=app_dir)
Ejemplo n.º 10
0
 def _gcloud_list():
   """Run `gcloud config list` then `gcloud auth list` for debugging."""
   for group in ("config", "auth"):
     util.run(["gcloud", group, "list"])
Ejemplo n.º 11
0
def deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=None): # pylint: disable=too-many-branches
  """Deploy Kubeflow using kfctl go binary.

  Loads the KFDef config named by args.kfctl_config (URL or local path),
  rewrites it for the target project, and deploys it with either the v0.6
  (init/generate/apply) or the v0.7 (apply -f) kfctl workflow depending on
  the config's apiVersion.

  Args:
    kfctl_path: Path to the kfctl binary.
    args: Parsed arguments; reads kfctl_config, email, project, zone,
      setup_project and use_self_cert.
    app_dir: Directory of the kfctl app; its basename is used as the app name.
    env: Environment passed to every kfctl invocation.
    labels: (Optional) Dict of labels merged into the KFDef metadata labels.

  Raises:
    ValueError: If no email is given and none can be obtained from gcloud.
  """
  # username and password are passed as env vars and won't appear in the logs
  #
  # We need to edit and rewrite the config file to the app dir because
  # kfctl uses the path of the config file as the app dir.
  logging.warning("Loading configs %s.", args.kfctl_config)

  if args.kfctl_config.startswith("http"):
    response = requests.get(args.kfctl_config)
    # Fail here on an HTTP error rather than parsing an error page as YAML.
    response.raise_for_status()
    raw_config = response.content
  else:
    with open(args.kfctl_config) as hf:
      raw_config = hf.read()

  # safe_load: the config is plain data, and yaml.load without an explicit
  # Loader is unsafe and rejected by recent PyYAML releases.
  config_spec = yaml.safe_load(raw_config)

  # We need to specify a valid email because
  #  1. We need to create appropriate RBAC rules to allow the current user
  #     to create the required K8s resources.
  #  2. Setting the IAM policy will fail if the email is invalid.
  email = args.email

  if not email:
    logging.info("email not set trying to get default from gcloud")
    email = util.run(["gcloud", "auth", "list",
                      "--filter", "status:ACTIVE", "--format", "value(account)"])

  if not email:
    raise ValueError("Could not determine GCP account being used.")

  kfdef_version = config_spec["apiVersion"].strip().lower()

  if kfdef_version == KFDEF_V1ALPHA1:
    config_spec = build_v06_spec(config_spec, args.project, email, args.zone,
                                 args.setup_project)
  else:
    config_spec = build_v07_spec(config_spec, args.project, email, args.zone,
                                 args.setup_project)

  config_spec["spec"] = util.filter_spartakus(config_spec["spec"])

  # Remove name because we will auto infer from directory.
  if "name" in config_spec["metadata"]:
    logging.info("Deleting name in kfdef spec.")
    del config_spec["metadata"]["name"]

  app_name = os.path.basename(app_dir)
  if "labels" not in config_spec["metadata"]:
    config_spec["metadata"]["labels"] = {}

  if labels:
    config_spec["metadata"]["labels"].update(labels)

  logging.info("KFDefSpec:\n%s", yaml.safe_dump(config_spec))

  if kfdef_version == KFDEF_V1ALPHA1:
    logging.info("Deploying using v06 syntax")

    logging.info("Checking if deployment %s already exists in project %s",
                 app_name, args.project)

    if check_if_kfapp_exists(args.project, app_name, args.zone):
      # With v0.6 kfctl can't successfully run apply a 2nd time so if
      # the deployment already exists we can't redeploy.
      logging.info("Deployment %s already exists in project %s; not "
                   "redeploying", app_name, args.project)
      return

    # delete=False: the file must outlive the with-block so kfctl can read it.
    with tempfile.NamedTemporaryFile(prefix="tmpkf_config", suffix=".yaml",
                                     delete=False) as hf:
      config_file = hf.name
      logging.info("Writing file %s", config_file)
      yaml.dump(config_spec, hf)

    util.run([kfctl_path, "init", app_dir, "-V", "--config=" + config_file],
             env=env)

    util.run([kfctl_path, "generate", "-V", "all"], env=env, cwd=app_dir)

    util.run([kfctl_path, "apply", "-V", "all"], env=env, cwd=app_dir)
  else:
    logging.info("Deploying using v07 syntax")

    if not os.path.exists(app_dir):
      logging.info("Creating app dir %s", app_dir)
      os.makedirs(app_dir)

    config_file = os.path.join(app_dir, "kf_config.yaml")
    with open(config_file, "w") as hf:
      logging.info("Writing file %s", config_file)
      yaml.dump(config_spec, hf)

    util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)

  # We will hit lets encrypt rate limiting with the managed certificates
  # So create a self signed certificate and update the ingress to use it.
  if args.use_self_cert:
    logging.info("Configuring self signed certificate")

    util.load_kube_credentials()

    api_client = k8s_client.ApiClient()
    ingress_namespace = "istio-system"
    ingress_name = "envoy-ingress"
    tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, args.project)
    logging.info("Configuring self signed cert for %s", tls_endpoint)
    util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                     tls_endpoint, api_client)
Ejemplo n.º 12
0
def main(): # pylint: disable=too-many-locals,too-many-statements
  """Deploy a Kubeflow test cluster with kfctl.

  Parses the command line, reads the OAuth client info from GCS, obtains a
  kfctl binary (built from --kubeflow_repo, or taken/downloaded from
  --kfctl_path), computes a unique deployment name plus labels, and then
  calls deploy_with_kfctl_go and add_extra_users.

  Raises:
    ValueError: If not exactly one of --kubeflow_repo and --kfctl_path is set.
  """
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument(
    "--project", default="kubeflow-ci-deployment", type=str,
    help=("The project."))

  parser.add_argument(
    "--zone", default="us-east1-d", type=str, help=("The zone to deploy in."))
  parser.add_argument(
    "--oauth_file",
    default=("gs://kubeflow-ci-deployment_kf-data/"
             "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
    type=str, help=("The file containing the OAuth client ID & secret "
                    "for IAP."))

  # TODO(jlewi): Should rename this argument to something like kfctl_src
  # We should try to do it in a backwards compatible way.
  parser.add_argument(
    "--kubeflow_repo",
    default="/src/kubeflow/kubeflow",
    type=str, help=("Path to the source for kfctl. Should be the directory "
                    "containing the Makefile to build kfctl"))

  parser.add_argument(
    "--kfctl_path",
    default="",
    type=str, help=("Path to kfctl; can be a URL."))

  parser.add_argument(
    "--kfctl_config",
    default=("https://raw.githubusercontent.com/kubeflow/manifests"
             "/master/kfdef/kfctl_gcp_iap.yaml"),
    type=str, help=("Path to the kfctl config to use"))

  parser.add_argument(
    "--apps_dir",
    default=os.getcwd(),
    type=str, help=("Directory to store kubeflow apps."))

  parser.add_argument(
    "--name", type=str, default="kf-vmaster-{uid}",
    help=("Name for the deployment. This can be a python format string "
          "with the variable uid. Uid will automatically be substituted "
          "for a unique value based on the time."))

  parser.add_argument(
    "--email", type=str, default="",
    help=("(Optional). Email of the person to create the default profile "
          "for. If not specified uses the gcloud config value."))

  parser.add_argument(
    "--extra_users", type=str, default="",
    help=("Comma separated list of additional users to grant access. "
          "Should be in the form user:[email protected] or "
          "serviceAccount:[email protected]"))

  parser.add_argument(
    "--labels", type=str, default="",
    help=("Comma separated list of extra labels; e.g "
          "--labels=k1=v1,k2=v2"))

  parser.add_argument("--setup_project", dest="setup_project",
                      action="store_true", help="Setup the project")
  parser.add_argument("--no-setup_project", dest="setup_project",
                      action="store_false", help="Do not setup the project")
  parser.set_defaults(setup_project=True)

  parser.add_argument("--use_self_cert", dest="use_self_cert",
                      action="store_true",
                      help="Use a self signed certificate")
  parser.add_argument("--no-use_self_cert", dest="use_self_cert",
                      action="store_false",
                      help="Do not use a self signed certificate")
  parser.set_defaults(use_self_cert=True)

  args = parser.parse_args()

  util.maybe_activate_service_account()

  # Wait for credentials to deal with workload identity issues
  gcp_util.get_gcp_credentials()

  # Wrap gcloud commands in retry loop to deal with metadata; workload
  # identity issues.
  @retrying.retry(stop_max_delay=5*60*1000, wait_exponential_max=10000)
  def _gcloud_list():
    # For debugging purposes output the command
    util.run(["gcloud", "config", "list"])
    util.run(["gcloud", "auth", "list"])
  _gcloud_list()

  bucket, blob_path = util.split_gcs_uri(args.oauth_file)

  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()

  # safe_load: the file is plain data, and yaml.load without an explicit
  # Loader is unsafe and rejected by recent PyYAML releases.
  oauth_info = yaml.safe_load(contents)

  # Both set or both unset is an error.
  if bool(args.kubeflow_repo) == bool(args.kfctl_path):
    raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path needs "
                     "to be set.")

  git_describe = ""
  if args.kubeflow_repo:
    # NOTE(review): this value is overwritten by the `kfctl version` output
    # below; the call is kept so a broken checkout still fails here.
    git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"],
                            cwd=args.kubeflow_repo).strip("'")

    kfctl_path = build_kfctl_go(args)
  else:
    if args.kfctl_path.startswith("http"):
      temp_dir = tempfile.mkdtemp()

      filename = "kfctl"

      zipped = False
      if args.kfctl_path.endswith(".tar.gz"):
        zipped = True
        filename = filename + ".tar.gz"

      util.run(["curl", "-L", "-o", filename, args.kfctl_path],
               cwd=temp_dir)
      if zipped:
        util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)

      kfctl_path = os.path.join(temp_dir, "kfctl")
      logging.info("Changing permissions on %s", kfctl_path)
      os.chmod(kfctl_path, 0o777)
    else:
      kfctl_path = args.kfctl_path

  git_describe = util.run([kfctl_path, "version"])

  logging.info("kfctl path set to %s", kfctl_path)

  # We need to keep the name short to avoid hitting limits with certificates.
  uid = datetime.datetime.now().strftime("%m%d") + "-"
  uid = uid + uuid.uuid4().hex[0:3]

  args.name = args.name.format(uid=uid)
  logging.info("Using name %s", args.name)

  app_dir = os.path.join(args.apps_dir, args.name)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # GCP labels can only take as input alphanumeric characters, hyphens, and
  # underscores. Replace not valid characters with hyphens.
  raw_labels = {"kfctl-git": git_describe,
                "purpose": "kf-test-cluster",
                "auto-deploy": "true"}

  labels = {}
  for k, v in raw_labels.items():
    val = v.lower().replace("\"", "")
    labels[k] = re.sub(r"[^a-z0-9\-_]", "-", val)

  # Labels given on the command line are used as-is and override the defaults.
  if args.labels:
    logging.info("Parsing labels %s", args.labels)
    for pair in args.labels.split(","):
      pieces = pair.split("=")
      if len(pieces) != 2:
        logging.error("Skipping pair %s; not of the form key=value", pair)
        continue
      key = pieces[0].strip()
      value = pieces[1].strip()

      labels[key] = value
  logging.info("labels: %s", labels)
  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
  add_extra_users(args.project, args.extra_users)
Ejemplo n.º 13
0
def main(unparsed_args=None):  # pylint: disable=too-many-locals
    """Add the digests of GCR images matching a regex to the images file.

    Reads the YAML file named by --images_file, lists images and tags with
    gcloud, merges every version whose image name and tag match --pattern
    into the existing entries, and writes the result back to the same file.

    Args:
      unparsed_args: (Optional) List of command line arguments to parse;
        when None, argparse falls back to sys.argv.

    Raises:
      ValueError: If --pattern is not of the form <name regex>:<tag regex>.
    """
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Get Images by regex")

    parser.add_argument(
        "--pattern",
        default="",
        type=str,
        help="Regex pattern e.g. .*tensorflow.*notebook.*:v20180619.*")

    parser.add_argument("--images_file",
                        default="image_tags.yaml",
                        type=str,
                        help="Yaml file containing the tags to attach.")

    parser.add_argument("--repository",
                        default=None,
                        type=str,
                        help="GCR repository name (optional).")

    args = parser.parse_args(unparsed_args)

    # Validate the pattern up front so a bad value fails before any gcloud
    # call is made.
    name_pattern, sep, tag_pattern = args.pattern.partition(":")
    if not sep or ":" in tag_pattern:
        raise ValueError(
            "--pattern must be of the form <name regex>:<tag regex>; "
            "got %r" % args.pattern)

    name_re = re.compile(name_pattern)
    tag_re = re.compile(tag_pattern)

    with open(args.images_file) as hf:
        # safe_load: the file is plain data, and yaml.load without an
        # explicit Loader is unsafe and rejected by recent PyYAML releases.
        config = yaml.safe_load(hf)

    # Maps image name -> {digest: version entry}.
    existing_images = {}
    for image in config["images"]:
        existing_images[image["name"]] = {
            v["digest"]: v for v in image["versions"]
        }

    list_images_cmd = [
        "gcloud", "--project=kubeflow-images-public", "container", "images",
        "list", "--format=json"
    ]
    # By default gcloud uses gcr.io/[project] as the repository. However for
    # images like katib, we may need to specify the repository as
    # gcr.io/[project]/katib.
    if args.repository:
        list_images_cmd.append("--repository=" + args.repository)
    raw_images = util.run(list_images_cmd)

    all_images = json.loads(raw_images)

    matching = []
    for image in all_images:
        if not name_re.match(image["name"]):
            continue
        logging.info("Matching image: %s", image["name"])
        matching.append(image)

    # For each image list all tags and find the matching ones
    images_to_add = {}
    for image in matching:
        raw_tags = util.run([
            "gcloud", "--project=kubeflow-images-public", "container",
            "images", "list-tags", image["name"], "--format=json"
        ])

        tags = json.loads(raw_tags)

        for info in tags:
            # A version is kept as soon as any one of its tags matches.
            if any(tag_re.match(t) for t in info["tags"]):
                versions = images_to_add.setdefault(image["name"], {})
                versions[info["digest"]] = info

    # Merge in any missing versions
    for name, versions in images_to_add.items():
        known = existing_images.setdefault(name, {})

        for v in versions.values():
            if v["digest"] in known:
                logging.info("Image %s sha %s already defined.", name,
                             v["digest"])
            else:
                logging.info("Image %s adding sha %s", name, v["digest"])
                known[v["digest"]] = v

    # Convert to the expected output; images are sorted by name.
    output = {
        "images": [
            {"name": name, "versions": list(existing_images[name].values())}
            for name in sorted(existing_images)
        ]
    }

    with open(args.images_file, "w") as hf:
        hf.write(yaml.safe_dump(output, default_flow_style=False))
    logging.info("Done.")
Ejemplo n.º 14
0
    def test_train(self):
        """Create the TFJob with kustomize and wait for it to finish.

        Applies each "key=value" pair from self.params to the kustomization
        in self.app_dir, builds and applies it, then waits for the job
        self.name in self.namespace. On failure the reason is stored in
        self.failure and logged.
        """
        # Local import: subprocess is only needed for the shell pipeline below.
        import subprocess

        api_client = k8s_client.ApiClient()

        # Setup parameters for kustomize
        # TODO(jinchihe): Should enhance here after the kustomize util created.
        configmap = 'mnist-map-gcs'

        def add_literal(name, value):
            # Each argv element must be a single argument, so the whole
            # --from-literal=name=value flag is built as one string.
            util.run([
                "kustomize", "edit", "add", "configmap", configmap,
                "--from-literal=" + name + "=" + value
            ],
                     cwd=self.app_dir)

        for pair in self.params.split(","):
            k, v = pair.split("=", 1)
            if k in ("namespace", "image"):
                util.run(["kustomize", "edit", "set", k, v], cwd=self.app_dir)
            elif k == "numPs":
                util.run(["./definition.sh", "--numPs", v], cwd=self.app_dir)
            elif k == "numWorkers":
                util.run(["./definition.sh", "--numWorkers", v],
                         cwd=self.app_dir)
            elif k == "secret":
                # The value is itself "<secretName>=<secretMountPath>".
                secretName, secretMountPath = v.split("=", 1)
                add_literal("secretName", secretName)
                add_literal("secretMountPath", secretMountPath)
            elif k == "envVariables":
                # The value is itself "<name>=<value>".
                var_k, var_v = v.split("=", 1)
                add_literal(var_k, var_v)
            else:
                add_literal(k, v)

        # Create the TF job. A pipeline needs a shell; util.run appears not
        # to handle pipes, so use check_call with a fixed command string.
        subprocess.check_call("kustomize build . | kubectl apply -f -",
                              shell=True, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # Check for errors creating pods and services. Can potentially
        # help debug failed test runs.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
Ejemplo n.º 15
0
  def run():
    """Create a ksonnet app, deploy kubeflow-core and verify it started.

    Uses api_client, namespace_name and args from the enclosing scope.
    """
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    # Environment values must be strings, so only set it when a token is given.
    if args.github_token:
      os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
      logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                      "limits.")

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Ejemplo n.º 16
0
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
    """Wait for the expected Kubeflow workloads to come up.

    Args:
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: On the "gcp" platform, expect the basic-auth login
        deployment instead of the IAP enabler.
      use_istio: When truthy, ingress related workloads are looked up in
        "istio-system"; otherwise in namespace.
      app_path: Directory holding the app.yaml whose spec.platform value
        selects the platform specific workloads.
    """
    set_logging()
    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + key_file
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    with open(os.path.join(app_path, "app.yaml")) as app_file:
        platform = yaml.safe_load(app_file)["spec"]["platform"]

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    core_deployments = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]
    core_stateful_sets = []

    ingress_deployments = [
        "istio-citadel",
        "istio-egressgateway",
        "istio-galley",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "kiali",
        "prometheus",
    ]
    ingress_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_deployments = ["activator", "autoscaler", "controller"]

    if platform == "gcp":
        core_deployments.append("cloud-endpoints-controller")
        core_stateful_sets.append("kfserving-controller-manager")
        # backend-updater is expected for both auth flavours.
        ingress_stateful_sets.append("backend-updater")
        if use_basic_auth:
            core_deployments.append("basic-auth-login")
        else:
            ingress_deployments.append("iap-enabler")
    elif platform == "existing_arrikto":
        core_deployments.append("dex")
        ingress_deployments.append("authservice")
        # No knative deployments are checked on this platform.
        knative_deployments = []

    ingress_namespace = "istio-system" if use_istio else namespace

    # TODO(jlewi): Might want to parallelize this.
    deployments = [(namespace, d) for d in core_deployments]
    deployments += [(ingress_namespace, d) for d in ingress_deployments]
    for deploy_namespace, deployment_name in deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, deploy_namespace,
                                 deployment_name, 10)

    stateful_sets = [(namespace, s) for s in core_stateful_sets]
    stateful_sets += [(ingress_namespace, s) for s in ingress_stateful_sets]
    for ss_namespace, name in stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Whatever went wrong, dump `kubectl describe` output for
            # debugging and then let the original error propagate.
            describe_command = [
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ]
            util.run(describe_command)
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Ejemplo n.º 17
0
def run_benchmark_job():
    """Submit a benchmark job to the remote kubernetes cluster and wait for it.

    The ksonnet app is expected at $BENCHMARK_DIR/ks-app and the kubeconfig
    path is read from $KUBECONFIG. A kubebench-job component named after
    args.experiment_name is generated, its parameters are set from the parsed
    command line arguments, and it is applied to the "default" ksonnet
    environment. The function then waits for the job and cleans it up.

    Raises:
      KeyError: If BENCHMARK_DIR or KUBECONFIG is not set in the environment.
    """
    args = parse_args()
    app_dir = os.path.join(os.environ['BENCHMARK_DIR'], "ks-app")

    # The returned client is not used below. The call is kept because it
    # presumably also configures access to the cluster -- TODO confirm.
    deploy_utils.create_k8s_client(os.environ['KUBECONFIG'])

    job_name = args.experiment_name

    # Set the namespace of kb job to default (args.namespace is not used).
    namespace = "default"

    # Deploy Kubebench
    util.run(
        ["ks", "generate", "kubebench-job", job_name, "--name=" + job_name],
        cwd=app_dir)

    # (parameter, value) pairs, applied in this order. Each command is passed
    # as an argument list so that values containing whitespace reach ks as a
    # single argument instead of being split apart.
    job_params = [
        ("mainJobKsRegistry", args.training_job_registry),
        ("mainJobKsPackage", args.training_job_pkg),
        ("mainJobKsPrototype", args.training_job_prototype),
        ("mainJobConfig", args.training_job_config),
        ("awsCredentialsSecret", args.aws_secret),
        ("awsCredentialsSecretAccessKeyId", args.aws_access_key_id),
        ("awsCredentialsSecretAccessKey", args.aws_secret_access_key),
        ("awsRegion", args.aws_region),
        ("githubTokenSecret", args.github_secret_name),
        ("githubTokenSecretKey", "GITHUB_TOKEN"),
        ("controllerImage", "seedjeffwan/configurator:20190415"),
        ("postJobImage", "seedjeffwan/mpi-post-processor:logs"),
        ("postJobArgs", "null"),
        ("reporterType", "null"),
        ("experimentDataPvc", args.data_pvc),
    ]
    for param_name, param_value in job_params:
        util.run(["ks", "param", "set", job_name, param_name, param_value],
                 cwd=app_dir)

    # cmd = "ks param set " + job_name + " config_args -- --config-file=" + pvc_mount + \
    #         "/config/" + config_name + ".yaml"
    # util.run(cmd.split(), cwd=app_dir)
    # cmd = "ks param set " + job_name + " report_args -- --output-file=" + pvc_mount + \
    #         "/output/results.csv"
    # util.run(cmd.split(), cwd=app_dir)

    apply_command = ["ks", "apply", "default", "-c", job_name]
    util.run(apply_command, cwd=app_dir)

    # TODO: expose timeout setting here.
    deploy_utils.wait_for_benchmark_job(job_name, namespace)
    deploy_utils.cleanup_benchmark_job(app_dir, job_name)
Ejemplo n.º 18
0
def setup_kubeflow_ks_app(dir, namespace, github_token, api_client):
    """Create a ksonnet app for Kubeflow.

    Args:
      dir: Test directory. It is created if needed, the ksonnet app is
        initialized inside it, and dir/src/kubeflow/kubeflow/kubeflow is the
        source the app's vendor/kubeflow symlink points at.
      namespace: Name of the namespace handed to _setup_test.
      github_token: (Optional) GitHub API token. When set it is exported as
        GITHUB_TOKEN so that ksonnet does not get rate limited.
      api_client: K8s API client handed to _setup_test.

    Returns:
      The path of the newly created ksonnet app directory.
    """
    util.makedirs(dir)

    logging.info("Using test directory: %s", dir)

    namespace_name = namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if github_token:
        # Never write the token itself to the logs; it is a credential.
        logging.info("Setting GITHUB_TOKEN from the supplied github_token.")
        # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning(
            "GITHUB_TOKEN not set; you will probably hit Github API "
            "limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test-" + uuid.uuid4().hex[0:4]
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=dir)

    app_dir = os.path.join(dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = [
        "kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job",
        "kubeflow/pytorch-job", "kubeflow/argo"
    ]

    # Instead of installing packages (ks pkg install) we edit the app.yaml
    # file directly and register every package as a library with a fake
    # git version.
    app_file = os.path.join(app_dir, "app.yaml")
    with open(app_file) as f:
        # safe_load: app.yaml is plain data, and yaml.load without an
        # explicit Loader is unsafe and rejected by newer PyYAML releases.
        app_yaml = yaml.safe_load(f)

    libraries = {}
    for pkg in packages:
        # "kubeflow/core" -> "core"
        pkg = pkg.split("/")[1]
        libraries[pkg] = {
            'gitVersion': {
                'commitSha': 'fake',
                'refSpec': 'fake'
            },
            'name': pkg,
            'registry': "kubeflow"
        }
    app_yaml['libraries'] = libraries

    with open(app_file, "w") as f:
        yaml.dump(app_yaml, f)

    # Create vendor directory with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(dir, "src", REPO_ORG, REPO_NAME, REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    return app_dir
Ejemplo n.º 19
0
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
    """Wait for the expected Kubeflow workloads to come up.

    Args:
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: Expect the basic-auth login deployment when truthy,
        the IAP enabler otherwise.
      use_istio: When truthy, ingress related workloads are looked up in
        "istio-system"; otherwise in namespace.
    """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + key_file
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    core_deployments = [
        "argo-ui",
        "centraldashboard",
        "cloud-endpoints-controller",
        "jupyter-web-app-deployment",
        "metadata-db",
        "metadata-deployment",
        "metadata-ui",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebook-controller-deployment",
        "tf-job-operator",
        "pytorch-operator",
        "katib-controller",
        "workflow-controller",
    ]
    core_stateful_sets = ["kfserving-controller-manager"]

    ingress_deployments = []
    # backend-updater is expected for both auth flavours.
    ingress_stateful_sets = ["backend-updater"]

    if use_basic_auth:
        core_deployments.append("basic-auth-login")
    else:
        ingress_deployments.append("iap-enabler")

    knative_namespace = "knative-serving"
    knative_deployments = ["activator", "autoscaler", "controller"]

    ingress_namespace = "istio-system" if use_istio else namespace

    # TODO(jlewi): Might want to parallelize this.
    deployments = [(namespace, d) for d in core_deployments]
    deployments += [(ingress_namespace, d) for d in ingress_deployments]
    for deploy_namespace, deployment_name in deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, deploy_namespace,
                                 deployment_name, 10)

    stateful_sets = [(namespace, s) for s in core_stateful_sets]
    stateful_sets += [(ingress_namespace, s) for s in ingress_stateful_sets]
    for ss_namespace, name in stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except:
            # Whatever went wrong, dump `kubectl describe` output for
            # debugging and then let the original error propagate.
            describe_command = [
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ]
            util.run(describe_command)
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Ejemplo n.º 20
0
def install_kubeflow(api_client, app_dir, namespace):
  """Deploy required kubeflow packages to run benchmark.

  Generates the argo, tf-job-operator and mpi-operator components in the
  ksonnet app and applies them to the "default" ksonnet environment.

  Args:
    api_client: Not used by this function; kept so existing callers work.
    app_dir: The ksonnet app directory the ks commands are run in.
    namespace: Namespace to deploy into. For any value other than 'default'
      the namespace param of each component is set to it; for 'default' the
      component params are left untouched.
  """
  for component in ["argo", "tf-job-operator", "mpi-operator"]:
    util.run(["ks", "generate", component, component], cwd=app_dir)

  if namespace != 'default':
    for component in ["tf-job-operator", "mpi-operator", "argo"]:
      util.run(["ks", "param", "set", component, "namespace", namespace],
               cwd=app_dir)

  # Note: there used to be an extra util.run(cmd.split()) here, outside the
  # if-block. It crashed with an unbound `cmd` for the 'default' namespace
  # and merely repeated the last param command otherwise.

  apply_command = ["ks", "apply", "default", "-c", "argo",
                   "-c", "tf-job-operator",  "-c", "mpi-operator"]

  util.run(apply_command, cwd=app_dir)
Ejemplo n.º 21
0
def check_if_kfapp_exists(project, name, zone): # pylint: disable=too-many-branches
  """Check if a deployment with the specified name already exists.

  Besides looking up the deployment manager deployment, this fetches
  credentials for the cluster called `name` and checks whether the ingress
  istio-system/envoy-ingress exists. If the ingress is missing, the service
  istio-system/istio-ingressgateway is deleted when present so that apply
  can be rerun.

  Args:
    project: The GCP project to look in.
    name: Name of the deployment; also used as the cluster name.
    zone: Zone passed to `gcloud container clusters get-credentials`.

  Returns:
    False if deployment manager answers 404 for the deployment or if the
    ingress is missing; True otherwise.

  Raises:
    ApiNotEnabledError: When deployment manager answered 403. The API is
      enabled first and the error is raised to force a retry.
    errors.HttpError: Any other deployment manager error is re-raised.
    rest.ApiException: Any K8s API error other than a 404 is re-raised.
  """
  credentials = GoogleCredentials.get_application_default()
  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

  deployments_client = dm.deployments()
  enable_api = False
  try:
    deployments_client.get(project=project, deployment=name).execute()
  except errors.HttpError as e:
    if not e.content:
      raise
    error_content = json.loads(e.content)
    error_code = error_content.get("error", {}).get("code", 0)
    if error_code == 404:
      return False
    if error_code != 403:
      raise
    # We get a 403 if the deployment manager API isn't enabled
    logging.info("Fetching deployment %s in project %s returned error:\n%s",
                 name, project, error_content)
    enable_api = True

  if enable_api:
    logging.info("Enabling the deployment manager api.")
    util.run(["gcloud", "--project=" + project, "services", "enable",
              "deploymentmanager.googleapis.com"])
    logging.info("Api enabled; raising ApiNotEnabledError to force retry")
    raise ApiNotEnabledError

  # TODO(jlewi): It would be better to get the actual zone of the deployment
  util.run(["gcloud", "--project=" + project, "container", "clusters",
            "get-credentials", "--zone=" + zone, name])
  logging.info("Checking if project %s kfapp %s finished setup.", project, name)
  util.load_kube_credentials()

  # TODO(jlewi): This is a bit of a hack for v0.6. For v0.6 we check if the
  # ingress already exists and if it does we report it as true and otherwise
  # false. The reasoning is if the ingress doesn't exist we want to see
  # if we can fix/resume the deployment by running reapply
  # With v0.7 kfctl apply should be an idempotent operation so we can always
  # rerun apply; but with v0.6 rerunning apply if the ingress exists results
  # in an error.
  api_client = k8s_client.ApiClient()
  v1 = k8s_client.CoreV1Api(api_client)
  ingress_namespace = "istio-system"
  ingress_name = "envoy-ingress"

  extensions = k8s_client.ExtensionsV1beta1Api(api_client)

  missing_ingress = True
  try:
    # All messages below print resources as <namespace>.<name>.
    logging.info("Trying to read ingress %s.%s", ingress_namespace,
                 ingress_name)
    extensions.read_namespaced_ingress(ingress_name, ingress_namespace)
    missing_ingress = False
    logging.info("Ingress %s.%s exists", ingress_namespace, ingress_name)
  except rest.ApiException as e:
    if e.status != 404:
      raise
    logging.info("Project: %s, KFApp: %s is missing ingress %s.%s",
                 project, name, ingress_namespace, ingress_name)

  if not missing_ingress:
    return True

  # Check if the service istio-ingressgateway already exists
  # if it does we need to delete it before rerunning apply.
  service_name = "istio-ingressgateway"
  logging.info("ingress %s.%s is missing; checking if service %s.%s exists",
               ingress_namespace, ingress_name, ingress_namespace,
               service_name)

  has_service = False
  try:
    v1.read_namespaced_service(service_name, ingress_namespace)
    has_service = True
  except rest.ApiException as e:
    if e.status != 404:
      raise
    logging.info("Project: %s, KFApp: %s is missing service %s.%s",
                 project, name, ingress_namespace, service_name)

  if has_service:
    logging.info("Deleting service: %s.%s", ingress_namespace, service_name)
    v1.delete_namespaced_service(service_name, ingress_namespace,
                                 body=k8s_client.V1DeleteOptions())
    logging.info("Deleted service: %s.%s", ingress_namespace, service_name)

  return False
Ejemplo n.º 22
0
def main():  # pylint: disable=too-many-locals
  """Parse the command line, set up file logging and run setup().

  When --test_dir is not given a uniquely named directory below the system
  temp directory is used; --artifacts_dir falls back to the test directory.
  The log is additionally written to <artifacts_dir>/logs/test_deploy.log.txt.
  """
  root_logger = logging.getLogger()
  root_logger.setLevel(logging.INFO)

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
    "--test_dir", type=str, default="",
    help="Directory to use for all the test files. If not set a temporary "
         "directory is created.")

  parser.add_argument(
    "--artifacts_dir", type=str, default="",
    help="Directory to use for artifacts that should be preserved after "
         "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
    "--project", type=str, default=None,
    help="The project to use.")

  parser.add_argument(
    "--cluster", type=str, default=None,
    help=("The name of the cluster. If not set assumes the "
          "script is running in a cluster and uses that cluster."))

  parser.add_argument(
    "--zone", type=str, default="us-east1-d",
    help="The zone for the cluster.")

  parser.add_argument(
    "--github_token", type=str, default=None,
    help=("The GitHub API token to use. This is needed since ksonnet uses the "
          "GitHub API and without it we get rate limited. For more info see: "
          "https://github.com/ksonnet/ksonnet/blob/master/docs"
          "/troubleshooting.md"))

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    # Create a temporary directory name for this test run
    timestamp = datetime.datetime.now().strftime("%m%d-%H%M-")
    label = "test_deploy-" + timestamp + uuid.uuid4().hex[0:4]
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  log_dir = os.path.join(args.artifacts_dir, "logs")
  test_log = os.path.join(log_dir, "test_deploy.log.txt")
  if not os.path.exists(log_dir):
    os.makedirs(log_dir)

  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  log_format = ("%(levelname)s|%(asctime)s"
                "|%(pathname)s|%(lineno)d| %(message)s")
  file_handler = logging.FileHandler(test_log)
  file_handler.setFormatter(
    logging.Formatter(fmt=log_format, datefmt="%Y-%m-%dT%H:%M:%S"))
  root_logger.addHandler(file_handler)
  logging.info("Logging to %s", test_log)

  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + key_file])
  setup(args)
Ejemplo n.º 23
0
def setup(args):
  """Test deploying Kubeflow.

  Deploys the kubeflow-core ksonnet component into a freshly named e2e
  namespace, waits for the tf-job-operator deployment and the tf-hub
  stateful set, then deletes the namespace and writes a junit XML report
  with one test case for the deployment and one for the teardown.

  Args:
    args: Parsed command line arguments. The fields read here are cluster,
      project, zone, test_dir, artifacts_dir and github_token.
  """
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    # --github_token defaults to None and os.environ only accepts strings,
    # so only export the token when one was actually supplied.
    if args.github_token:
      os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
      logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                      "limits.")

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")
    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      # Log the exception object itself: the `message` attribute does not
      # exist on Python 3 exceptions.
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e)
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
Ejemplo n.º 24
0
def install_kubebench_nfs(api_client, app_dir, namespace):
  """Deploy the kubebench quickstarter NFS service and volume components.

  Args:
    api_client: K8s API client used to wait for the NFS deployment and to
      look up the NFS service.
    app_dir: The ksonnet app directory the ks commands are run in.
    namespace: Namespace set on both components and searched for the NFS
      deployment and service.
  """
  service_component = "kubebench-quickstarter-service"
  volume_component = "kubebench-quickstarter-volume"
  both_components = (service_component, volume_component)

  util.run(["ks", "pkg", "install", "kubebench/kubebench-quickstarter"], cwd=app_dir)
  for component in both_components:
    util.run(["ks", "generate", component, component], cwd=app_dir)

  for component in both_components:
    util.run(["ks", "param", "set", component, "namespace", namespace], cwd=app_dir)

  # The service goes first: the volume needs the service's cluster IP.
  util.run(["ks", "apply", "default", "-c", service_component], cwd=app_dir)

  logging.info("Verifying NFS deployment started")
  util.wait_for_deployment(api_client, namespace, "kubebench-nfs-deploy")

  nfs_service = get_k8s_service(api_client, namespace, "kubebench-nfs-svc")
  nfs_ip = nfs_service.spec.cluster_ip
  util.run(["ks", "param", "set", volume_component, "nfsServiceIP", nfs_ip], cwd=app_dir)
  util.run(["ks", "apply", "default", "-c", volume_component], cwd=app_dir)
Ejemplo n.º 25
0
 def run_delete():
     """Run `kfctl delete -V -f <kfdef_path>` from app_path.

     kfctl_path, kfdef_path and app_path are not defined here; presumably
     they come from the enclosing scope -- TODO confirm.
     """
     delete_command = [kfctl_path, "delete", "-V", "-f", kfdef_path]
     util.run(delete_command, cwd=app_path)
Ejemplo n.º 26
0
def test_build_kfctl_go(record_xml_attribute, app_name, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script, self_signed_cert, values):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Passed to util.set_pytest_junit together with
        the test name.
      app_name: kubeflow deployment name.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Whether to use basic_auth.
      use_istio: Whether to use Istio or not
      config_path: Path to the KFDef spec file.
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      cluster_creation_script: script invoked to create a new cluster
      self_signed_cert: whether to use self-signed cert for ingress.
      values: Comma separated list of key=value variables to substitute into
        config_path. Only the first "=" of each entry separates key from
        value, so values may themselves contain "=".
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_creation_script:
        logging.info("Cluster creation script specified: %s",
                     cluster_creation_script)
        util.run(["/bin/bash", "-c", cluster_creation_script])

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    if values:
        path_vars = {}
        for pair in values.split(","):
            # Split on the first "=" only so "k=a=b" yields value "a=b"
            # instead of raising ValueError.
            key, value = pair.split("=", 1)
            path_vars[key] = value

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
    app_path = kfctl_util.kfctl_deploy_kubeflow(app_path, project,
                                                use_basic_auth, use_istio,
                                                config_path, kfctl_path,
                                                build_and_apply)
    if not cluster_creation_script:
        kfctl_util.verify_kubeconfig(app_path)

    # Use self-signed cert for testing to prevent quota limiting.
    if self_signed_cert:
        logging.info("Configuring self signed certificate")
        util.load_kube_credentials()
        api_client = k8s_client.ApiClient()
        ingress_namespace = "istio-system"
        ingress_name = "envoy-ingress"
        tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, project)
        logging.info("Configuring self signed cert for %s", tls_endpoint)
        util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                         tls_endpoint, api_client)
Ejemplo n.º 27
0
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
    """Wait for the expected Kubeflow workloads to come up.

    Args:
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: Expect the basic-auth deployment when truthy, the IAP
        enabler otherwise.
      use_istio: When truthy, ingress related workloads and the stateful
        sets are looked up in "istio-system"; otherwise in namespace.
    """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + key_file
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    core_deployments = [
        "argo-ui",
        "centraldashboard",
        "cert-manager",
        "cloud-endpoints-controller",
        "jupyter-web-app",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebooks-controller",
        "tf-job-operator",
        "pytorch-operator",
        "studyjob-controller",
        "workflow-controller",
    ]
    ingress_deployments = []
    stateful_sets = ["backend-updater"]

    if use_basic_auth:
        core_deployments.append("basic-auth")
    else:
        ingress_deployments.append("iap-enabler")

    ingress_namespace = "istio-system" if use_istio else namespace

    # TODO(jlewi): Might want to parallelize this.
    deployments = [(namespace, d) for d in core_deployments]
    deployments += [(ingress_namespace, d) for d in ingress_deployments]
    for deploy_namespace, deployment_name in deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, deploy_namespace,
                                 deployment_name)

    for stateful_set_name in stateful_sets:
        logging.info("Verifying that statefulset %s started...",
                     stateful_set_name)
        util.wait_for_statefulset(api_client, ingress_namespace,
                                  stateful_set_name)
Ejemplo n.º 28
0
def setup(args):
    """Deploy Kubeflow with ksonnet and verify that it started.

    Initializes a ksonnet app named ``kubeflow-test`` inside the test
    directory, installs the Kubeflow packages, replaces the vendored
    ``kubeflow`` package with a symlink to the source checkout under
    ``<test_dir>/src/kubeflow/kubeflow/kubeflow``, applies the
    ``kubeflow-core`` component and waits for the ``tf-job-operator``
    deployment and the ``tf-hub`` statefulset. When requested, a TF serving
    model server named ``inception`` is deployed and waited for as well.

    Args:
      args: Parsed command line arguments. The attributes read are test_dir,
        namespace, github_token and cluster (plus project and zone when
        cluster is set), and deploy_tf_serving (plus model_server_image when
        it is set).
    """
    api_client = create_k8s_client(args)

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace

    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    if args.github_token:
        # Never write the token itself to the log; it is a credential.
        logging.info("Setting GITHUB_TOKEN from the supplied token.")
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning("GITHUB_TOKEN not set; you will probably hit Github "
                        "API limits.")
    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ],
             cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)

    if args.deploy_tf_serving:
        logging.info("Deploying tf-serving.")
        util.run([
            "ks", "generate", "tf-serving", "modelServer", "--name=inception",
            "--namespace=" + namespace.metadata.name,
            "--model_path=gs://kubeflow-models/inception",
            "--model_server_image=" + args.model_server_image
        ],
                 cwd=app_dir)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "modelServer",
        ]
        util.run(apply_command, cwd=app_dir)

        # Look up the "inception" service; the returned object is not used
        # further, the call is kept only as a check on the service.
        core_api = k8s_client.CoreV1Api(api_client)
        core_api.read_namespaced_service("inception", namespace.metadata.name)

        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 "inception")
        logging.info("Verified TF serving started.")
Ejemplo n.º 29
0
def deploy_minikube(args):
    """Create a GCE VM, install minikube on it and copy its config locally.

    Inserts a VM through the compute API, waits for the insert operation,
    runs ``install_minikube.sh`` (expected next to this file) on the VM, then
    copies ``~/.kube`` and the needed ``~/.minikube`` certificate/key files
    into ``args.test_dir`` and rewrites the copied kube config via
    ``modify_minikube_config``.

    Args:
      args: Parsed command line arguments. The attributes read are vm_name,
        zone, project and test_dir.

    Raises:
      ValueError: If the insert operation does not finish with status DONE
        or reports an error.
      errors.HttpError: If the insert request itself fails; re-raised as is.
    """

    credentials = GoogleCredentials.get_application_default()
    gce = discovery.build("compute",
                          "v1",
                          credentials=credentials,
                          cache_discovery=False)
    instances = gce.instances()
    body = {
        "name":
        args.vm_name,
        "machineType":
        "zones/{0}/machineTypes/n1-standard-16".format(args.zone),
        "disks": [
            {
                "boot": True,
                "initializeParams": {
                    "sourceImage":
                    "projects/ubuntu-os-cloud/global/images/family/ubuntu-1604-lts",
                    "diskSizeGb": 100,
                    "autoDelete": True,
                },
            },
        ],
        "networkInterfaces": [
            {
                "accessConfigs": [
                    {
                        "name": "external-nat",
                        "type": "ONE_TO_ONE_NAT",
                    },
                ],
                "network":
                "global/networks/default",
            },
        ],
    }
    request = instances.insert(project=args.project, zone=args.zone, body=body)
    try:
        response = request.execute()
        print("done")
    except errors.HttpError as e:
        # Every HTTP error is fatal here; for a conflict we first log a more
        # helpful message and then re-raise the original error.
        if e.content:
            content = json.loads(e.content)
            if content.get("error", {}).get("code") == requests.codes.CONFLICT:
                logging.error(
                    "Either the VM or the disk %s already exists in zone "
                    "%s in project %s ", args.vm_name, args.zone, args.project)
        raise

    op_id = response.get("name")
    final_op = vm_util.wait_for_operation(gce, args.project, args.zone, op_id)

    logging.info("Final result for insert operation: %s", final_op)
    if final_op.get("status") != "DONE":
        raise ValueError("Insert operation has status {0}".format(
            final_op.get("status")))

    if final_op.get("error"):
        message = "Insert operation resulted in error {0}".format(
            final_op.get("error"))
        logging.error(message)
        raise ValueError(message)

    # Locate the install minikube script.
    install_script = os.path.join(os.path.dirname(__file__),
                                  "install_minikube.sh")

    # A missing script is only logged; execution continues regardless.
    if not os.path.exists(install_script):
        logging.error("Could not find minikube install script: %s",
                      install_script)

    vm_util.wait_for_vm(args.project, args.zone, args.vm_name)
    vm_util.execute_script(args.project, args.zone, args.vm_name,
                           install_script)

    # Copy the .kube and .minikube files to test_dir
    target = "~/.kube"
    full_target = "{0}:{1}".format(args.vm_name, target)
    logging.info("Copying %s to %s", target, args.test_dir)
    util.run([
        "gcloud", "compute", "--project=" + args.project, "scp", "--recurse",
        full_target, args.test_dir, "--zone=" + args.zone
    ])

    # The .minikube directory contains some really large ISO and other files
    # that we don't need; so we only copy the files we need.
    minikube_dir = os.path.join(args.test_dir, ".minikube")
    try:
        os.makedirs(minikube_dir)
    except OSError as exc:  # Python >2.5
        # An already existing directory is fine; anything else is an error.
        if exc.errno == errno.EEXIST and os.path.isdir(minikube_dir):
            pass
        else:
            raise

    for target in ["~/.minikube/*.crt", "~/.minikube/client.key"]:
        full_target = "{0}:{1}".format(args.vm_name, target)
        logging.info("Copying %s to %s", target, minikube_dir)
        util.run([
            "gcloud", "compute", "--project=" + args.project, "scp",
            "--recurse", full_target, minikube_dir, "--zone=" + args.zone
        ])

    config_path = os.path.join(args.test_dir, ".kube", "config")
    modify_minikube_config(config_path, minikube_dir)
Ejemplo n.º 30
0
def get_gcp_identity():
    """Return the output of ``gcloud config get-value account``.

    The value is also logged at INFO level.
    """
    gcloud_command = ["gcloud", "config", "get-value", "account"]
    account = util.run(gcloud_command)
    logging.info("Current GCP account: %s", account)
    return account
Ejemplo n.º 31
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Parse the command line, set up file logging and run the chosen test.

    Each subcommand registers its handler as ``func`` on the parsed
    arguments. Before handing over to ``wrap_test`` this function fills in
    defaults for ``test_dir`` and ``artifacts_dir``, and attaches a file
    handler to the root logger that writes to
    ``<artifacts_dir>/logs/test_deploy.<func name><deploy_name>.log.txt``.
    """
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

    parser.add_argument(
        "--test_dir",
        default="",
        type=str,
        help="Directory to use for all the test files. If not set a temporary "
        "directory is created.")

    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")

    parser.add_argument(
        "--as_gcloud_user",
        dest="as_gcloud_user",
        action="store_true",
        help=("Impersonate the user corresponding to the gcloud "
              "command with kubectl and ks."))
    parser.add_argument("--no-as_gcloud_user",
                        dest="as_gcloud_user",
                        action="store_false")
    parser.set_defaults(as_gcloud_user=False)

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--project",
                        default=None,
                        type=str,
                        help="The project to use.")

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--namespace",
                        default=None,
                        type=str,
                        help=("The namespace to use."))

    parser.add_argument(
        "--github_token",
        default=None,
        type=str,
        help=
        ("The GitHub API token to use. This is needed since ksonnet uses the "
         "GitHub API and without it we get rate limited. For more info see: "
         "https://github.com/ksonnet/ksonnet/blob/master/docs"
         "/troubleshooting.md. Can also be set using environment variable "
         "GITHUB_TOKEN."))

    parser.add_argument("--deploy_name",
                        default="",
                        type=str,
                        help="The name of the deployment.")

    parser.add_argument("--workflow_name",
                        default="",
                        type=str,
                        help="The name of the workflow.")

    subparsers = parser.add_subparsers()

    parser_teardown = subparsers.add_parser(
        "teardown", help="teardown the test infrastructure.")

    parser_teardown.set_defaults(func=teardown)

    parser_tf_serving = subparsers.add_parser(
        "deploy_model", help="Deploy a TF serving model.")

    parser_tf_serving.set_defaults(func=deploy_model)

    parser_tf_serving.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_pytorch_job = subparsers.add_parser("deploy_pytorchjob",
                                               help="Deploy a pytorch-job")

    parser_pytorch_job.set_defaults(func=deploy_pytorchjob)

    parser_pytorch_job.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")

    parser_argo_job.set_defaults(func=deploy_argo)

    parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")

    parser_katib_test.set_defaults(func=test_katib)

    parser_minikube = subparsers.add_parser(
        "deploy_minikube", help="Setup a K8s cluster on minikube.")

    parser_minikube.set_defaults(func=deploy_minikube)

    parser_minikube.add_argument("--vm_name",
                                 required=True,
                                 type=str,
                                 help="The name of the VM to use.")

    parser_minikube.add_argument("--zone",
                                 default="us-east1-d",
                                 type=str,
                                 help="The zone for the cluster.")

    parser_teardown_minikube = subparsers.add_parser(
        "teardown_minikube", help="Delete the VM running minikube.")

    parser_teardown_minikube.set_defaults(func=teardown_minikube)

    parser_teardown_minikube.add_argument("--zone",
                                          default="us-east1-d",
                                          type=str,
                                          help="The zone for the cluster.")

    parser_teardown_minikube.add_argument("--vm_name",
                                          required=True,
                                          type=str,
                                          help="The name of the VM to use.")

    args = parser.parse_args()

    # On Python 3 argparse does not require a subcommand by default, in which
    # case ``func`` is never set. Report that as a usage error instead of
    # failing later with an AttributeError.
    if not hasattr(args, "func"):
        parser.error("a subcommand is required")

    if not args.test_dir:
        logging.info("--test_dir not set; using a temporary directory.")

        now = datetime.datetime.now()
        label = "test_deploy-" + now.strftime(
            "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

        # Create a temporary directory for this test run
        args.test_dir = os.path.join(tempfile.gettempdir(), label)

    if not args.artifacts_dir:
        args.artifacts_dir = args.test_dir

    test_log = os.path.join(
        args.artifacts_dir, "logs",
        "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")

    try:
        os.makedirs(os.path.dirname(test_log))
    except OSError as exc:  # Python >2.5
        # An already existing log directory is fine; re-raise anything else.
        if exc.errno == errno.EEXIST and os.path.isdir(
                os.path.dirname(test_log)):
            pass
        else:
            raise

    # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
    # Setup a logging file handler. This way we can upload the log outputs
    # to gubernator.
    root_logger = logging.getLogger()

    file_handler = logging.FileHandler(test_log)
    root_logger.addHandler(file_handler)
    # We need to explicitly set the formatter because it will not pick up
    # the BasicConfig.
    formatter = logging.Formatter(
        fmt=("%(levelname)s|%(asctime)s"
             "|%(pathname)s|%(lineno)d| %(message)s"),
        datefmt="%Y-%m-%dT%H:%M:%S")
    file_handler.setFormatter(formatter)
    logging.info("Logging to %s", test_log)
    # NOTE(review): `ks` is not defined inside this function; presumably it
    # is a module-level name for the ksonnet binary -- confirm.
    util.run([ks, "version"])

    util.maybe_activate_service_account()

    # Print out the config to help debugging.
    output = util.run_and_output(["gcloud", "config", "config-helper"])
    logging.info("gcloud config: \n%s", output)
    wrap_test(args)
Ejemplo n.º 32
0
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  """Submit the configured workflows with ksonnet and wait for them.

  For presubmit/postsubmit jobs the list of changed files is computed with
  ``git diff`` so that workflows scoped to specific directories can be
  skipped. Every remaining workflow gets its own ksonnet environment, has its
  parameters set and is applied. Afterwards the function waits for all
  submitted workflows, finalizes the prow job and uploads the log file to GCS.

  Args:
    args: Parsed command line arguments. The attributes read here are
      repos_dir, release, config_file, bucket, project, zone and cluster;
      args is also passed on to get_namespace and generate_env_from_head.
    file_handler: The logging file handler whose file is flushed and uploaded
      as build-log.txt.

  Returns:
    The value returned by prow_artifacts.finalize_prow_job.
  """
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    diff_command = ["git", "diff", "--name-only", "master"]
  elif job_type == "postsubmit":
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha]

  changed_files = []
  if job_type in ("presubmit", "postsubmit"):
    changed_files = util.run(diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because its used
    # as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = get_ksonnet_cmd(w)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and job_type not in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any files
    # modified match the specified patterns (fnmatch-style globs).
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is modified.",
                         w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging. Since the prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
              "name", workflow_name],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      # Variables that are unset or empty are left out entirely.
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
             ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
             get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
             args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
                os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier to verify in
    # the unittest. sorted() is used because the result of keys() is not a
    # list on Python 3 and therefore has no sort() method.
    param_names = sorted(w.params.keys())
    for k in param_names:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
               "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(get_namespace(args),
                                             workflow_names,
                                             timeout=datetime.timedelta(minutes=180),
                                             status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish", ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    # Runs on success, timeout and unexpected errors alike so that the prow
    # job is always finalized and the log is always uploaded.
    success = prow_artifacts.finalize_prow_job(args.bucket, success, workflow_phase, ui_urls)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
Ejemplo n.º 33
0
def deploy_kubeflow(test_case):
  """Generate and apply the Kubeflow ksonnet components, then verify them.

  Args:
    test_case: The test case being run; only test_case.test_suite.test_dir
      is read, as the directory in which the ksonnet app is set up.
  """
  args = parse_args()
  test_dir = test_case.test_suite.test_dir
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  app_dir = deploy_utils.setup_kubeflow_ks_app(test_dir, namespace, args.github_token, api_client)

  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Each component is generated from the prototype of the same name,
  # e.g. "ks generate tf-job-operator tf-job-operator".
  components = ["tf-job-operator", "pytorch-operator", "jupyter", "katib"]
  for component in components:
    util.run(["ks", "generate", component, component], cwd=app_dir)

  # Apply all generated components to the "default" environment in one call.
  apply_command = ["ks", "apply", "default"]
  for component in components:
    apply_command.extend(["-c", component])

  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)

    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service account)
    # and not the GCP service account which has more privileges.
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)

  # Verify that Jupyter is actually deployed.
  jupyter_name = "jupyter"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)

  # Verify that core components are actually deployed.
  deployment_names = [
    "tf-job-operator-v1beta1",
    "pytorch-operator",
    "studyjob-controller",
  ]
  for deployment_name in deployment_names:
    logging.info("Verifying that %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name)