Beispiel #1
0
def run_notebook_test(notebook_path, parameters=None):
    """Execute a notebook, convert the result to HTML and upload it.

    The HTML is uploaded to
    ``$OUTPUT_GCS/<YYYYmmdd-HHMM>-<4 hex chars>/notebook.html``.

    Args:
      notebook_path: Path of the notebook to run; forwarded to
        execute_notebook.
      parameters: (Optional) Parameters forwarded to execute_notebook.

    Raises:
      ValueError: If the OUTPUT_GCS environment variable is not set.
    """
    # Fail fast: without OUTPUT_GCS the os.path.join below would raise an
    # opaque TypeError only after the (potentially long) notebook run.
    gcs_root = os.getenv("OUTPUT_GCS")
    if gcs_root is None:
        raise ValueError("The environment variable OUTPUT_GCS must be set to "
                         "the GCS location for the notebook output.")

    # Ensure workload identity is ready.
    # TODO(jlewi): Need to skip this when not running on GCP.
    gcp_util.get_gcp_credentials()
    output_path = execute_notebook(notebook_path, parameters=parameters)

    logging.info(f"Reading notebook {output_path}")
    # Notebooks are UTF-8 encoded JSON; don't depend on the locale default.
    with open(output_path, "r", encoding="utf-8") as hf:
        actual_output = hf.read()

    logging.info("Converting notebook to html")
    nb = nbformat.reads(actual_output, as_version=4)
    html_exporter = nbconvert.HTMLExporter()
    (html_output, _) = html_exporter.from_notebook_node(nb)

    # Per https://github.com/kubeflow/testing/issues/715
    # we need to add some uniqueness to the name since different test runs
    # will use the same OUTPUT_GCS directory
    subdir = datetime.datetime.now().strftime("%Y%m%d-%H%M")
    subdir = subdir + "-" + uuid.uuid4().hex[0:4]

    gcs_path = os.path.join(gcs_root, subdir, "notebook.html")

    logging.info(f"Uploading notebook to {gcs_path}")
    _upload_notebook_html(html_output, gcs_path)
Beispiel #2
0
def run_notebook_test(notebook_path, parameters=None):
  """Execute a notebook, convert the result to HTML and upload it.

  The upload destination is read from the OUTPUT_GCS environment variable
  and handed to _upload_notebook_html unchanged.

  Args:
    notebook_path: Path of the notebook to run; forwarded to
      execute_notebook.
    parameters: (Optional) Parameters forwarded to execute_notebook.
  """
  # Ensure workload identity is ready.
  # TODO(jlewi): Need to skip this when not running on GCP.
  gcp_util.get_gcp_credentials()
  output_path = execute_notebook(notebook_path, parameters=parameters)

  logging.info(f"Reading notebook {output_path}")
  # Notebooks are UTF-8 encoded JSON; don't depend on the locale default.
  with open(output_path, "r", encoding="utf-8") as hf:
    actual_output = hf.read()

  logging.info("Converting notebook to html")
  nb = nbformat.reads(actual_output, as_version=4)
  html_exporter = nbconvert.HTMLExporter()
  (html_output, _) = html_exporter.from_notebook_node(nb)

  # NOTE(review): gcs_path is None when OUTPUT_GCS is unset; confirm that
  # _upload_notebook_html copes with that.
  gcs_path = os.getenv("OUTPUT_GCS")
  logging.info(f"Uploading notebook to {gcs_path}")
  _upload_notebook_html(html_output, gcs_path)
Beispiel #3
0
  def run(self, period=datetime.timedelta(minutes=5)):
    """Reconcile in an endless loop, sleeping for `period` between passes.

    Args:
      period: datetime.timedelta to wait after each call to _reconcile.

    Raises:
      RuntimeError: If gcp_util.get_gcp_credentials() returns a falsy value.
    """
    # Refuse to start when credentials cannot be obtained.
    credentials = gcp_util.get_gcp_credentials()
    if not credentials:
      raise RuntimeError("Could not get GCP application default credentials")

    # There is no exit condition; the loop only ends if something raises.
    while True:
      self._reconcile()
      logging.info(f"Wait {period}(HH:MM:SS) before reconciling; ")
      pause_seconds = period.total_seconds()
      time.sleep(pause_seconds)
Beispiel #4
0
def main(): # pylint: disable=too-many-locals,too-many-statements
  """Deploy a uniquely named Kubeflow instance using kfctl.

  Steps, in order:
    1. Parse the command line flags.
    2. Wait for GCP credentials and log the gcloud configuration.
    3. Download the OAuth client info (YAML) from GCS.
    4. Obtain kfctl: build it from --kubeflow_repo, download it when
       --kfctl_path is a URL, or use --kfctl_path as a local path.
    5. Compute the deployment name and labels, then call
       deploy_with_kfctl_go and add_extra_users.

  Raises:
    ValueError: If not exactly one of --kubeflow_repo / --kfctl_path is set,
      or if the OAuth file does not exist in GCS.
  """
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument(
      "--project", default="kubeflow-ci-deployment", type=str,
      help=("The project."))

  parser.add_argument(
      "--zone", default="us-east1-d", type=str,
      help=("The zone to deploy in."))

  parser.add_argument(
      "--oauth_file",
      default=("gs://kubeflow-ci-deployment_kf-data/"
               "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
      type=str,
      help=("The file containing the OAuth client ID & secret "
            "for IAP."))

  # TODO(jlewi): Should rename this argument to something like kfctl_src
  # We should try to do it in a backwards compatible way.
  parser.add_argument(
      "--kubeflow_repo",
      default="/src/kubeflow/kubeflow",
      type=str,
      help=("Path to the source for kfctl. Should be the directory "
            "containing the Makefile to build kfctl"))

  parser.add_argument(
      "--kfctl_path",
      default="",
      type=str, help=("Path to kfctl; can be a URL."))

  parser.add_argument(
      "--kfctl_config",
      default=("https://raw.githubusercontent.com/kubeflow/manifests"
               "/master/kfdef/kfctl_gcp_iap.yaml"),
      type=str, help=("Path to the kfctl config to use"))

  parser.add_argument(
      "--apps_dir",
      default=os.getcwd(),
      type=str, help=("Directory to store kubeflow apps."))

  parser.add_argument(
      "--name", type=str, default="kf-vmaster-{uid}",
      help=("Name for the deployment. This can be a python format string "
            "with the variable uid. Uid will automatically be substituted "
            "for a unique value based on the time."))

  parser.add_argument(
      "--email", type=str, default="",
      help=("(Optional). Email of the person to create the default profile "
            "for. If not specified uses the gcloud config value."))

  parser.add_argument(
      "--extra_users", type=str, default="",
      help=("Comma separated list of additional users to grant access. "
            "Should be in the form user:[email protected] or "
            "serviceAccount:[email protected]"))

  parser.add_argument(
      "--labels", type=str, default="",
      help=("Comma separated list of extra labels; e.g "
            "--labels=k1=v1,k2=v2"))

  parser.add_argument("--setup_project", dest="setup_project",
                      action="store_true", help="Setup the project")
  parser.add_argument("--no-setup_project", dest="setup_project",
                      action="store_false", help="Do not setup the project")
  parser.set_defaults(setup_project=True)

  parser.add_argument("--use_self_cert", dest="use_self_cert",
                      action="store_true",
                      help="Use a self signed certificate")
  parser.add_argument("--no-use_self_cert", dest="use_self_cert",
                      action="store_false",
                      help="Do not use a self signed certificate")
  parser.set_defaults(use_self_cert=True)

  args = parser.parse_args()

  util.maybe_activate_service_account()

  # Wait for credentials to deal with workload identity issues
  gcp_util.get_gcp_credentials()

  # Wrap gcloud commands in retry loop to deal with metadata; workload
  # identity issues.
  @retrying.retry(stop_max_delay=5*60*1000, wait_exponential_max=10000)
  def _gcloud_list():
    # For debugging purposes output the command
    util.run(["gcloud", "config", "list"])
    util.run(["gcloud", "auth", "list"])
  _gcloud_list()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)

  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  # get_blob returns None for a missing object; report that explicitly
  # instead of failing with an AttributeError on the download call.
  blob = bucket.get_blob(blob_path)
  if blob is None:
    raise ValueError("OAuth file {0} does not exist.".format(args.oauth_file))
  contents = blob.download_as_string()

  # safe_load: the file only needs plain YAML, and yaml.load without an
  # explicit Loader is unsafe (and rejected by newer PyYAML releases).
  oauth_info = yaml.safe_load(contents)

  # Both flags set and neither flag set are equally invalid.
  if bool(args.kubeflow_repo) == bool(args.kfctl_path):
    raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path needs "
                     "to be set.")

  git_describe = ""
  if args.kubeflow_repo:
    git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"],
                            cwd=args.kubeflow_repo).strip("'")

    kfctl_path = build_kfctl_go(args)
  else:
    if args.kfctl_path.startswith("http"):
      temp_dir = tempfile.mkdtemp()

      filename = "kfctl"

      zipped = False
      if args.kfctl_path.endswith(".tar.gz"):
        zipped = True
        filename = filename + ".tar.gz"

      util.run(["curl", "-L", "-o", filename, args.kfctl_path],
               cwd=temp_dir)
      if zipped:
        util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)

      kfctl_path = os.path.join(temp_dir, "kfctl")
      logging.info("Changing permissions on %s", kfctl_path)
      os.chmod(kfctl_path, 0o777)
    else:
      kfctl_path = args.kfctl_path

    # Only ask the binary for its version when there is no source checkout;
    # doing this unconditionally discarded the git describe value above.
    git_describe = util.run([kfctl_path, "version"])

  logging.info("kfctl path set to %s", kfctl_path)

  # We need to keep the name short to avoid hitting limits with certificates.
  uid = datetime.datetime.now().strftime("%m%d") + "-"
  uid = uid + uuid.uuid4().hex[0:3]

  args.name = args.name.format(uid=uid)
  logging.info("Using name %s", args.name)

  app_dir = os.path.join(args.apps_dir, args.name)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  # Environment for kfctl: the current environment plus the OAuth values.
  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # GCP labels can only take as input alphanumeric characters, hyphens, and
  # underscores. Replace not valid characters with hyphens.
  raw_labels = {"kfctl-git": git_describe,
                "purpose": "kf-test-cluster",
                "auto-deploy": "true"}

  labels = {}
  for key, value in raw_labels.items():
    cleaned = value.lower().replace("\"", "")
    labels[key] = re.sub(r"[^a-z0-9\-_]", "-", cleaned)

  # Labels given on the command line are added as-is (not sanitized).
  if args.labels:
    logging.info("Parsing labels %s", args.labels)
    for pair in args.labels.split(","):
      pieces = pair.split("=")
      if len(pieces) != 2:
        logging.error("Skipping pair %s; not of the form key=value", pair)
        continue
      key = pieces[0].strip()
      value = pieces[1].strip()

      labels[key] = value
  logging.info("labels: %s", labels)
  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
  add_extra_users(args.project, args.extra_users)
    def deploy(blueprint_dir,
               management_context,
               name="kf-vbp-{uid}",
               project="kubeflow-ci-deployment",
               location="us-central1-c",
               zone="us-central1-c",
               labels_file=None,
               oauth_file=DEFAULT_OAUTH_FILE
               ):  # pylint: disable=too-many-arguments
        """Deploy the blueprint.

        Args:
          blueprint_dir: The directory where
            https://github.com/kubeflow/gcp-blueprints/tree/master/kubeflow
            is checked out.
          management_context: The name of the management context.
          name: Name for the deployment. This can be a python format string
            with the variable uid. Uid will automatically be substituted
            for a unique value based on the time.
          project: The GCP project where the blueprint should be created.
          location: The zone or region where Kubeflow should be deployed.
          zone: The zone to use for disks must be in the same region as
            location when using a regional cluster and must be location when
            location is zone.
          labels_file: (Optional): Path to a file containing additional
            labels to add to the deployment, one ``key=value`` per line.
            Lines that do not match are skipped.
          oauth_file: The file containing the OAuth client ID & secret for
            IAP.

        Raises:
          subprocess.CalledProcessError: If ``make get-pkg`` fails with an
            error that is not known to be ignorable, or if ``make apply``
            fails with a non-retryable error or keeps failing until the
            retry budget is exhausted.
        """
        # Wait for credentials to deal with workload identity issues
        gcp_util.get_gcp_credentials()

        try:
            util.run(["make", "get-pkg"], cwd=blueprint_dir)
        except subprocess.CalledProcessError as e:
            # output is None when nothing was captured; searching None would
            # raise TypeError and mask the real failure.
            output = e.output or ""
            if re.search(
                    ".*resources must be annotated with config.kubernetes.io/"
                    "index.*", output):
                logging.warning(
                    f"make get-pkg returned error: {e.output}; ignoring "
                    "and continuing")

            elif re.search(".*already exists.*", output):
                logging.warning(
                    "The package directory already exists; continuing")
            else:
                logging.error(f"Command exited with error: {e.output}")
                raise

        util.run(
            ["kpt", "cfg", "set", "instance", "mgmt-ctxt", management_context],
            cwd=blueprint_dir)

        # We need to keep the name short to avoid hitting limits with certificates.
        uid = datetime.datetime.now().strftime("%m%d") + "-"
        uid = uid + uuid.uuid4().hex[0:3]

        name = name.format(uid=uid)
        logging.info("Using name %s", name)

        email = util.run(["gcloud", "config", "get-value", "account"])

        logging.info(f"Using email {email}")

        def set_values(pairs, subdir):
            """Run `kpt cfg set <subdir> <key> <value>` for every pair."""
            for k, v in pairs.items():
                util.run(["kpt", "cfg", "set", subdir, k, v],
                         cwd=blueprint_dir)

        values = {
            "name": name,
            "gcloud.core.project": project,
            "gcloud.compute.zone": zone,
            "location": location,
        }

        set_values(values, "./upstream/manifests/gcp")

        values = {
            "name": name,
            "gcloud.core.project": project,
        }

        set_values(values, "./upstream/manifests/stacks/gcp")

        values = {
            "name": name,
            "gcloud.core.project": project,
            "location": location,
            "email": email,
        }

        set_values(values, "./instance")

        # TODO(jlewi): We should add an expiration time; either as a label
        # or as as an annotation.
        # GCP labels can only take as input alphanumeric characters, hyphens, and
        # underscores. Replace not valid characters with hyphens.
        # TODO(jlewi): We are assuming all blueprints created by
        # create_kf_from_gcp_blueprint.py are auto-deployed. How could
        # we inject appropriate labels when creating auto-deploy jobs?
        labels = {}

        if labels_file:
            logging.info(f"Reading labels from file: {labels_file}")

            pattern = re.compile("([^=]+)=(.+)")
            with open(labels_file) as f:
                for line in f:
                    m = pattern.match(line)
                    if not m:
                        logging.info(
                            f"Skipping line {line} it doesn't match pattern "
                            f"{pattern.pattern}")
                        # Really skip it; falling through would call
                        # m.group on None and crash on e.g. a blank line.
                        continue
                    labels[m.group(1)] = m.group(2)
        else:
            logging.info("No labels file provided.")

        kustomization_file = os.path.join(blueprint_dir, "instance",
                                          "gcp_config", "kustomization.yaml")

        add_common_labels(kustomization_file, labels)

        oauth_info = get_oauth(project, oauth_file)

        # Environment for make: the current environment plus the OAuth values.
        env = {}
        env.update(os.environ)
        env.update(oauth_info)

        # To work around various bugs in our manifests that can be fixed by
        # retrying we see if a particular error occurs and then retry.
        # As these issues are fixed we should remove the retries.
        retryable_errors = [
            # TODO(https://github.com/kubeflow/manifests/issues/1149):
            # Once this is fixed we should be able to remove this.
            re.compile(".*no matches for kind \"Application\" in version "
                       "\"app.k8s.io/v1beta1\""),
            # TODO(https://github.com/kubeflow/gcp-blueprints/issues/43):
            # Remove this once the underlying issue is fixed.
            # Raw string so that "\." is a regex escape rather than an
            # invalid string escape sequence.
            re.compile(r".*webhook\.cert-manager\.io.*"),
        ]

        # The total time to wait needs to take into account the actual time
        # it takes to run otherwise we won't retry.
        total_time = datetime.timedelta(minutes=30)

        def is_retryable_exception(exception):
            """Return True if we should retry False otherwise"""

            if not isinstance(exception, subprocess.CalledProcessError):
                return False

            # Treat missing output as "no match" instead of raising TypeError.
            output = exception.output or ""
            for m in retryable_errors:
                if m.search(output):
                    logging.warning(
                        "make apply failed with retryable error. The "
                        f"output matched regex: {m.pattern}")
                    return True

            return False

        @retrying.retry(stop_max_delay=total_time.total_seconds() * 1000,
                        retry_on_exception=is_retryable_exception)
        def run_apply():
            util.run(["make", "apply"], cwd=blueprint_dir, env=env)

        run_apply()