def run_notebook_test(notebook_path, parameters=None):
    """Execute a notebook, convert the result to HTML and upload it to GCS.

    The destination is built from the OUTPUT_GCS environment variable, a
    per-run subdirectory and the file name "notebook.html". OUTPUT_GCS must
    be set; os.path.join raises TypeError when it is missing.

    Args:
      notebook_path: Path of the notebook; forwarded to execute_notebook.
      parameters: (Optional) Parameters forwarded to execute_notebook.
    """
    # Ensure workload identity is ready.
    # TODO(jlewi): Need to skip this when not running on GCP.
    gcp_util.get_gcp_credentials()

    output_path = execute_notebook(notebook_path, parameters=parameters)

    logging.info(f"Reading notebook {output_path}")
    # Notebook files are UTF-8 encoded JSON. Be explicit so that reading does
    # not depend on the locale's default encoding.
    with open(output_path, "r", encoding="utf-8") as hf:
        actual_output = hf.read()

    logging.info("Converting notebook to html")
    nb = nbformat.reads(actual_output, as_version=4)
    html_exporter = nbconvert.HTMLExporter()
    (html_output, _) = html_exporter.from_notebook_node(nb)

    gcs_path = os.getenv("OUTPUT_GCS")

    # Per https://github.com/kubeflow/testing/issues/715
    # we need to add some uniqueness to the name since different test runs
    # will use the same OUTPUT_GCS directory.
    subdir = datetime.datetime.now().strftime("%Y%m%d-%H%M")
    subdir = subdir + "-" + uuid.uuid4().hex[0:4]

    gcs_path = os.path.join(gcs_path, subdir, "notebook.html")

    logging.info(f"Uploading notebook to {gcs_path}")
    _upload_notebook_html(html_output, gcs_path)
def run_notebook_test(notebook_path, parameters=None):
    """Execute a notebook, convert the result to HTML and upload it to GCS.

    The HTML is uploaded to the location given by the OUTPUT_GCS environment
    variable.

    Args:
      notebook_path: Path of the notebook; forwarded to execute_notebook.
      parameters: (Optional) Parameters forwarded to execute_notebook.
    """
    # Ensure workload identity is ready.
    # TODO(jlewi): Need to skip this when not running on GCP.
    gcp_util.get_gcp_credentials()

    output_path = execute_notebook(notebook_path, parameters=parameters)

    logging.info(f"Reading notebook {output_path}")
    # Notebook files are UTF-8 encoded JSON. Be explicit so that reading does
    # not depend on the locale's default encoding.
    with open(output_path, "r", encoding="utf-8") as hf:
        actual_output = hf.read()

    logging.info("Converting notebook to html")
    nb = nbformat.reads(actual_output, as_version=4)
    html_exporter = nbconvert.HTMLExporter()
    (html_output, _) = html_exporter.from_notebook_node(nb)

    gcs_path = os.getenv("OUTPUT_GCS")
    logging.info(f"Uploading notebook to {gcs_path}")
    _upload_notebook_html(html_output, gcs_path)
def run(self, period=datetime.timedelta(minutes=5)):
    """Reconcile in an endless loop, sleeping `period` between passes.

    Args:
      period: datetime.timedelta to wait after each reconcile pass.

    Raises:
      RuntimeError: If GCP credentials could not be obtained up front.
    """
    # Fail before entering the loop if credentials are unavailable.
    credentials = gcp_util.get_gcp_credentials()
    if not credentials:
        raise RuntimeError("Could not get GCP application default credentials")

    # This loop never exits on its own; it only stops if something raises.
    while True:
        self._reconcile()
        logging.info(f"Wait {period}(HH:MM:SS) before reconciling; ")
        pause_seconds = period.total_seconds()
        time.sleep(pause_seconds)
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Command line entry point: obtain kfctl and deploy Kubeflow with it.

    Steps, in order:
      1. Parse the command line flags.
      2. Wait for GCP credentials and print the gcloud configuration.
      3. Download the OAuth client info from GCS (--oauth_file).
      4. Build kfctl from source (--kubeflow_repo) or fetch/use an existing
         binary (--kfctl_path).
      5. Compute the deployment name and labels, deploy with
         deploy_with_kfctl_go and grant access to --extra_users.

    Raises:
      ValueError: If --kubeflow_repo and --kfctl_path are both set or both
        empty.
    """
    logging.basicConfig(level=logging.INFO,
                        format=('%(levelname)s|%(asctime)s'
                                '|%(pathname)s|%(lineno)d| %(message)s'),
                        datefmt='%Y-%m-%dT%H:%M:%S',
                        )
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--project", default="kubeflow-ci-deployment", type=str,
        help=("The project."))

    parser.add_argument(
        "--zone", default="us-east1-d", type=str,
        help=("The zone to deploy in."))

    parser.add_argument(
        "--oauth_file",
        default=("gs://kubeflow-ci-deployment_kf-data/"
                 "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
        type=str,
        help=("The file containing the OAuth client ID & secret "
              "for IAP."))

    # TODO(jlewi): Should rename this argument to something like kfctl_src
    # We should try to do it in a backwards compatible way.
    parser.add_argument(
        "--kubeflow_repo",
        default="/src/kubeflow/kubeflow",
        type=str,
        help=("Path to the source for kfctl. Should be the directory "
              "containing the Makefile to build kfctl"))

    parser.add_argument(
        "--kfctl_path",
        default="",
        type=str,
        help=("Path to kfctl; can be a URL."))

    parser.add_argument(
        "--kfctl_config",
        default=("https://raw.githubusercontent.com/kubeflow/manifests"
                 "/master/kfdef/kfctl_gcp_iap.yaml"),
        type=str,
        help=("Path to the kfctl config to use"))

    parser.add_argument(
        "--apps_dir",
        default=os.getcwd(),
        type=str,
        help=("Directory to store kubeflow apps."))

    parser.add_argument(
        "--name",
        type=str,
        default="kf-vmaster-{uid}",
        help=("Name for the deployment. This can be a python format string "
              "with the variable uid. Uid will automatically be substituted "
              "for a unique value based on the time."))

    parser.add_argument(
        "--email",
        type=str,
        default="",
        help=("(Optional). Email of the person to create the default profile "
              "for. If not specificied uses the gcloud config value."))

    parser.add_argument(
        "--extra_users",
        type=str,
        default="",
        help=("Comma separated list of additional users to grant access. "
              "Should be in the form user:[email protected] or "
              "serviceAccount:[email protected]"))

    parser.add_argument(
        "--labels",
        type=str,
        default="",
        help=("Comma separated list of extra labels; e.g "
              "--labels=k1=v1,k2=v2"))

    parser.add_argument("--setup_project", dest="setup_project",
                        action="store_true", help="Setup the project")
    parser.add_argument("--no-setup_project", dest="setup_project",
                        action="store_false",
                        help="Do not setup the project")
    parser.set_defaults(setup_project=True)

    parser.add_argument("--use_self_cert", dest="use_self_cert",
                        action="store_true",
                        help="Use a self signed certificate")
    parser.add_argument("--no-use_self_cert", dest="use_self_cert",
                        action="store_false",
                        help="Do not use a self signed certificate")
    parser.set_defaults(use_self_cert=True)

    args = parser.parse_args()

    util.maybe_activate_service_account()

    # Wait for credentials to deal with workload identity issues
    gcp_util.get_gcp_credentials()

    # Wrap gcloud commands in retry loop to deal with metadata; workload
    # identity issues.
    @retrying.retry(stop_max_delay=5*60*1000, wait_exponential_max=10000)
    def _gcloud_list():
        # For debugging purposes output the command
        util.run(["gcloud", "config", "list"])
        util.run(["gcloud", "auth", "list"])

    _gcloud_list()

    bucket, blob_path = util.split_gcs_uri(args.oauth_file)

    client = storage.Client(project=args.project)
    bucket = client.get_bucket(bucket)

    blob = bucket.get_blob(blob_path)
    contents = blob.download_as_string()

    # safe_load only builds plain Python objects; yaml.load without an
    # explicit Loader is deprecated and rejected by newer PyYAML releases.
    oauth_info = yaml.safe_load(contents)

    # Exactly one of the two ways of obtaining kfctl must be selected.
    if bool(args.kubeflow_repo) == bool(args.kfctl_path):
        raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path "
                         "neeeds to be set.")

    git_describe = ""
    if args.kubeflow_repo:
        git_describe = util.run(
            ["git", "describe", "--tags", "--always", "--dirty"],
            cwd=args.kubeflow_repo).strip("'")

        kfctl_path = build_kfctl_go(args)
    else:
        if args.kfctl_path.startswith("http"):
            temp_dir = tempfile.mkdtemp()
            filename = "kfctl"

            zipped = False
            if args.kfctl_path.endswith(".tar.gz"):
                zipped = True
                filename = filename + ".tar.gz"

            util.run(["curl", "-L", "-o", filename, args.kfctl_path],
                     cwd=temp_dir)
            if zipped:
                util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)

            kfctl_path = os.path.join(temp_dir, "kfctl")
            logging.info("Changing permissions on %s", kfctl_path)
            os.chmod(kfctl_path, 0o777)
        else:
            kfctl_path = args.kfctl_path

        # No source checkout to describe; use the binary's version instead.
        git_describe = util.run([kfctl_path, "version"])

    logging.info("kfctl path set to %s", kfctl_path)

    # We need to keep the name short to avoid hitting limits with certificates.
    uid = datetime.datetime.now().strftime("%m%d") + "-"
    uid = uid + uuid.uuid4().hex[0:3]

    args.name = args.name.format(uid=uid)
    logging.info("Using name %s", args.name)

    app_dir = os.path.join(args.apps_dir, args.name)

    if not os.path.exists(args.apps_dir):
        os.makedirs(args.apps_dir)

    # Environment for the deployment: the current environment plus the
    # key/value pairs loaded from the OAuth file.
    env = {}
    env.update(os.environ)
    env.update(oauth_info)

    # GCP labels can only take as input alphanumeric characters, hyphens, and
    # underscores. Replace not valid characters with hyphens.
    labels = {"kfctl-git": git_describe,
              "purpose": "kf-test-cluster",
              "auto-deploy": "true"}

    for k, v in labels.items():
        val = v.lower().replace("\"", "")
        val = re.sub(r"[^a-z0-9\-_]", "-", val)
        labels[k] = val

    # Labels passed on the command line are added as given (not sanitized).
    if args.labels:
        logging.info("Parsing labels %s", args.labels)
        for pair in args.labels.split(","):
            pieces = pair.split("=")
            if len(pieces) != 2:
                logging.error("Skipping pair %s; not of the form key=value",
                              pair)
                continue
            key = pieces[0].strip()
            value = pieces[1].strip()

            labels[key] = value

    logging.info("labels: %s", labels)
    deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
    add_extra_users(args.project, args.extra_users)
def deploy(blueprint_dir, management_context, name="kf-vbp-{uid}",
           project="kubeflow-ci-deployment",
           location="us-central1-c", zone="us-central1-c",
           labels_file=None,
           oauth_file=DEFAULT_OAUTH_FILE
           ):  # pylint: disable=too-many-arguments
    """Deploy the blueprint.

    Fetches the blueprint packages, sets the kpt values, adds the labels to
    the instance kustomization and runs "make apply", retrying on a set of
    known transient errors.

    Args:
      blueprint_dir: The directory where
        https://github.com/kubeflow/gcp-blueprints/tree/master/kubeflow is
        checked out.
      management_context: The name of the management context.
      name: Name for the deployment. This can be a python format string
        with the variable uid. Uid will automatically be substituted
        for a unique value based on the time.
      project: The GCP project where the blueprint should be created.
      location: The zone or region where Kubeflow should be deployed.
      zone: The zone to use for disks must be in the same region as
        location when using a regional cluster and must be location when
        location is zone.
      labels_file: (Optional): Path to a file containing additional labels
        to add to the deployment, one key=value pair per line. Lines that
        are not of that form are skipped.
      oauth_file: The file containing the OAuth client ID & secret for IAP.

    Raises:
      subprocess.CalledProcessError: If "make get-pkg" fails with an error
        that is not known to be ignorable.
    """
    # Wait for credentials to deal with workload identity issues
    gcp_util.get_gcp_credentials()

    try:
        util.run(["make", "get-pkg"], cwd=blueprint_dir)
    except subprocess.CalledProcessError as e:
        if re.search(
                ".*resources must be annotated with config.kubernetes.io/"
                "index.*", e.output):
            logging.warning(
                f"make get-pkg returned error: {e.output}; ignoring "
                "and continuing")
        elif re.search(".*already exists.*", e.output):
            logging.warning(
                "The package directory already exists; continuing")
        else:
            logging.error(f"Command exited with error: {e.output}")
            raise

    util.run(
        ["kpt", "cfg", "set", "instance", "mgmt-ctxt", management_context],
        cwd=blueprint_dir)

    # We need to keep the name short to avoid hitting limits with certificates.
    uid = datetime.datetime.now().strftime("%m%d") + "-"
    uid = uid + uuid.uuid4().hex[0:3]

    name = name.format(uid=uid)
    logging.info("Using name %s", name)

    email = util.run(["gcloud", "config", "get-value", "account"])
    logging.info(f"Using email {email}")

    values = {
        "name": name,
        "gcloud.core.project": project,
        "gcloud.compute.zone": zone,
        "location": location,
    }

    def set_values(pairs, subdir):
        """Run "kpt cfg set" in subdir for every key/value in pairs."""
        for k, v in pairs.items():
            util.run(["kpt", "cfg", "set", subdir, k, v], cwd=blueprint_dir)

    set_values(values, "./upstream/manifests/gcp")

    values = {
        "name": name,
        "gcloud.core.project": project,
    }
    set_values(values, "./upstream/manifests/stacks/gcp")

    values = {
        "name": name,
        "gcloud.core.project": project,
        "location": location,
        "email": email,
    }
    set_values(values, "./instance")

    # TODO(jlewi): We should add an expiration time; either as a label
    # or as as an annotation.
    # GCP labels can only take as input alphanumeric characters, hyphens, and
    # underscores. Replace not valid characters with hyphens.
    # TODO(jlewi): We are assuming all blueprints created by
    # create_kf_from_gcp_blueprint.py are auto-deployed. How could
    # we inject appropriate labels when creating auto-deploy jobs?
    labels = {}

    if labels_file:
        logging.info(f"Reading labels from file: {labels_file}")

        pattern = re.compile("([^=]+)=(.+)")
        with open(labels_file) as f:
            for line in f:
                m = pattern.match(line)
                if not m:
                    logging.info(
                        f"Skipping line {line} it doesn't match pattern "
                        f"{pattern.pattern}")
                    # Without this, m.group() below would be called on None.
                    continue
                labels[m.group(1)] = m.group(2)
    else:
        logging.info("No labels file provided.")

    kustomization_file = os.path.join(blueprint_dir, "instance", "gcp_config",
                                      "kustomization.yaml")

    add_common_labels(kustomization_file, labels)

    oauth_info = get_oauth(project, oauth_file)

    # Environment for "make apply": the current environment plus the
    # key/value pairs returned by get_oauth.
    env = {}
    env.update(os.environ)
    env.update(oauth_info)

    # To work around various bugs in our manifests that can be fixed by
    # retrying we see if a particular error occurs and then retry.
    # As these issues are fixed we should remove the retries.
    retryable_errors = [
        # TODO(https://github.com/kubeflow/manifests/issues/1149):
        # Once this is fixed we should be able to remove this.
        re.compile(".*no matches for kind \"Application\" in version "
                   "\"app.k8s.io/v1beta1\""),

        # TODO(https://github.com/kubeflow/gcp-blueprints/issues/43):
        # Remove this once the underlying issue is fixed.
        # Raw string so that \. reaches the regex engine unchanged.
        re.compile(r".*webhook\.cert-manager\.io.*"),
    ]

    # The total time to wait needs to take into account the actual time
    # it takes to run otherwise we won't retry.
    total_time = datetime.timedelta(minutes=30)

    def is_retryable_exception(exception):
        """Return True if we should retry False otherwise"""

        if not isinstance(exception, subprocess.CalledProcessError):
            return False

        for m in retryable_errors:
            if m.search(exception.output):
                logging.warning(
                    "make apply failed with retryable error. The "
                    f"output matched regex: {m.pattern}")
                return True

        return False

    @retrying.retry(stop_max_delay=total_time.total_seconds() * 1000,
                    retry_on_exception=is_retryable_exception)
    def run_apply():
        util.run(["make", "apply"], cwd=blueprint_dir, env=env)

    run_apply()