def testSplitGcsUri(self):
  bucket, path = util.split_gcs_uri("gs://some-bucket/some/path")
  self.assertEqual("some-bucket", bucket)
  self.assertEqual("some/path", path)

  bucket, path = util.split_gcs_uri("gs://some-bucket")
  self.assertEqual("some-bucket", bucket)
  self.assertEqual("", path)
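# For context, a minimal sketch of what util.split_gcs_uri and
# util.to_gcs_uri could look like, inferred from the test above
# (hypothetical bodies; the real helpers live in the util module).
import re


def split_gcs_uri(gcs_uri):
  """Split a GCS URI into (bucket, path); path is "" for bucket-only URIs."""
  m = re.match(r"gs://([^/]+)/?(.*)", gcs_uri)
  if not m:
    raise ValueError("{0} is not a valid GCS URI".format(gcs_uri))
  return m.group(1), m.group(2)


def to_gcs_uri(bucket, path):
  """Join a bucket name and object path back into a gs:// URI."""
  return "gs://{0}/{1}".format(bucket, path.lstrip("/"))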
def check_no_errors(gcs_client, artifacts_dir):
  """Check that all the XML files exist and there were no errors.

  Args:
    gcs_client: The GCS client.
    artifacts_dir: The directory where artifacts should be stored.

  Returns:
    True if there were no errors and False otherwise.
  """
  bucket_name, prefix = util.split_gcs_uri(artifacts_dir)
  bucket = gcs_client.get_bucket(bucket_name)

  no_errors = True
  for b in bucket.list_blobs(prefix=os.path.join(prefix, "junit")):
    full_path = util.to_gcs_uri(b.bucket, b.path)
    if not os.path.splitext(b.path)[-1] == ".xml":
      logging.info("Skipping %s; not an xml file", full_path)
      continue
    logging.info("Checking %s", full_path)
    xml_contents = b.download_as_string()

    if test_util.get_num_failures(xml_contents) > 0:
      logging.info("Test failures in %s", full_path)
      no_errors = False

  return no_errors
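# A minimal sketch of what test_util.get_num_failures might do
# (hypothetical body): parse the junit XML and read the "failures"
# attribute from the testsuite element.
from xml.etree import ElementTree


def get_num_failures(xml_contents):
  root = ElementTree.fromstring(xml_contents)
  # Handle both a bare <testsuite> root and a <testsuites> wrapper.
  suite = root if root.tag == "testsuite" else root.find("testsuite")
  if suite is None:
    return 0
  return int(suite.get("failures", 0))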
def check_no_errors(gcs_client, artifacts_dir):
  """Check that all the XML files exist and there were no errors.

  Args:
    gcs_client: The GCS client.
    artifacts_dir: The directory where artifacts should be stored.

  Returns:
    True if there were no errors and False otherwise.
  """
  bucket_name, prefix = util.split_gcs_uri(artifacts_dir)
  bucket = gcs_client.get_bucket(bucket_name)
  no_errors = True

  # Get a list of actual junit files.
  actual_junit = _get_actual_junit_files(bucket, prefix)

  for f in actual_junit:
    full_path = os.path.join(artifacts_dir, f)
    logging.info("Checking %s", full_path)
    b = bucket.blob(os.path.join(prefix, f))

    xml_contents = b.download_as_string()

    if test_util.get_num_failures(xml_contents) > 0:
      logging.info("Test failures in %s", full_path)
      no_errors = False

  return no_errors
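# A plausible sketch of the _get_actual_junit_files helper referenced
# above (hypothetical body; assumes junit files live under <prefix>/junit
# and that only .xml files count).
import os


def _get_actual_junit_files(bucket, prefix):
  actual_junit = set()
  for b in bucket.list_blobs(prefix=os.path.join(prefix, "junit")):
    if os.path.splitext(b.name)[-1] != ".xml":
      continue
    # Store paths relative to the prefix, e.g. "junit/junit_mytest.xml".
    actual_junit.add(os.path.relpath(b.name, prefix))
  return actual_junit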
def upload_file_to_gcs(source, target):
  gcs_client = storage.Client()
  bucket_name, path = util.split_gcs_uri(target)

  bucket = gcs_client.get_bucket(bucket_name)

  logging.info("Uploading file %s to %s.", source, target)
  blob = bucket.blob(path)
  blob.upload_from_filename(source)
def upload_to_gcs(contents, target):
  gcs_client = storage.Client()

  bucket_name, path = util.split_gcs_uri(target)
  bucket = gcs_client.get_bucket(bucket_name)

  logging.info("Writing %s", target)
  blob = bucket.blob(path)
  blob.upload_from_string(contents)
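# Example usage of the two upload helpers (illustrative only; the bucket
# and paths below are made up):
upload_file_to_gcs("/tmp/results.xml",
                   "gs://some-bucket/artifacts/junit/results.xml")
upload_to_gcs("hello world", "gs://some-bucket/artifacts/hello.txt")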
def _load_oauth_file(self, oauth_file, admin_project):
  bucket_name, blob_path = util.split_gcs_uri(oauth_file)
  client = storage.Client(project=admin_project)
  bucket = client.get_bucket(bucket_name)
  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()

  # Use safe_load; calling yaml.load without an explicit Loader is unsafe
  # and deprecated.
  return yaml.safe_load(contents)
def _upload_notebook_html(content, target):
  gcs_client = storage.Client()
  bucket_name, path = util.split_gcs_uri(target)
  bucket = gcs_client.get_bucket(bucket_name)

  logging.info("Uploading notebook to %s.", target)
  blob = bucket.blob(path)
  # Need to set the content type so that if we browse in GCS we end up
  # rendering as html.
  blob.upload_from_string(content, content_type="text/html")
def get_oauth(project, oauth_file):
  """Get the OAuth information."""
  bucket_name, blob_path = util.split_gcs_uri(oauth_file)
  client = storage.Client(project=project)
  bucket = client.get_bucket(bucket_name)
  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)
  return oauth_info
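# The OAuth file is assumed to be a small YAML document whose keys are
# injected into the environment for kfctl (for GCP IAP deployments kfctl
# reads CLIENT_ID and CLIENT_SECRET). An illustrative file might contain:
#
#   CLIENT_ID: <iap-oauth-client-id>
#   CLIENT_SECRET: <iap-oauth-client-secret>
#
# which is why the main() functions below simply do env.update(oauth_info).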
def create_junit_xml_file(test_cases, output_path, gcs_client=None):
  """Create a JUnit XML file.

  The junit schema is specified here:
  https://www.ibm.com/support/knowledgecenter/en/SSQ2R2_9.5.0/com.ibm.rsar.analysis.codereview.cobol.doc/topics/cac_useresults_junit.html

  Args:
    test_cases: TestSuite or list of test case objects.
    output_path: Path to write the XML.
    gcs_client: GCS client to use if output is GCS.
  """
  t = create_xml(test_cases)
  logging.info("Creating %s", output_path)
  if output_path.startswith("gs://"):
    b = six.StringIO()
    t.write(b)

    bucket_name, path = util.split_gcs_uri(output_path)
    bucket = gcs_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    blob.upload_from_string(b.getvalue())
  else:
    dir_name = os.path.dirname(output_path)
    if not os.path.exists(dir_name):
      logging.info("Creating directory %s", dir_name)
      try:
        os.makedirs(dir_name)
      except OSError as e:
        if e.errno == errno.EEXIST:
          # The path already exists. This is probably a race condition with
          # some other test creating the directory; we can just continue.
          pass
        else:
          raise
    t.write(output_path)
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument(
      "--project", default="kubeflow-ci-deployment", type=str,
      help="The project.")

  parser.add_argument(
      "--zone", default="us-east1-d", type=str,
      help="The zone to deploy in.")

  parser.add_argument(
      "--oauth_file",
      default=("gs://kubeflow-ci-deployment_kf-data/"
               "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")

  # TODO(jlewi): Should rename this argument to something like kfctl_src.
  # We should try to do it in a backwards compatible way.
  parser.add_argument(
      "--kubeflow_repo", default="/src/kubeflow/kubeflow", type=str,
      help=("Path to the source for kfctl. Should be the directory "
            "containing the Makefile to build kfctl."))

  parser.add_argument(
      "--kfctl_path", default="", type=str,
      help="Path to kfctl; can be a URL.")

  parser.add_argument(
      "--kfctl_config",
      default=("https://raw.githubusercontent.com/kubeflow/manifests"
               "/master/kfdef/kfctl_gcp_iap.yaml"),
      type=str,
      help="Path to the kfctl config to use.")

  parser.add_argument(
      "--apps_dir", default=os.getcwd(), type=str,
      help="Directory to store kubeflow apps.")

  parser.add_argument(
      "--name", type=str, default="kf-vmaster-{uid}",
      help=("Name for the deployment. This can be a python format string "
            "with the variable uid. Uid will automatically be substituted "
            "for a unique value based on the time."))

  parser.add_argument(
      "--email", type=str, default="",
      help=("(Optional.) Email of the person to create the default profile "
            "for. If not specified, uses the gcloud config value."))

  parser.add_argument(
      "--extra_users", type=str, default="",
      help=("Comma separated list of additional users to grant access. "
            "Should be in the form user:[email protected] or "
            "serviceAccount:[email protected]."))

  parser.add_argument(
      "--labels", type=str, default="",
      help=("Comma separated list of extra labels; e.g. "
            "--labels=k1=v1,k2=v2."))

  parser.add_argument("--setup_project", dest="setup_project",
                      action="store_true", help="Setup the project.")
  parser.add_argument("--no-setup_project", dest="setup_project",
                      action="store_false",
                      help="Do not setup the project.")
  parser.set_defaults(setup_project=True)

  parser.add_argument("--use_self_cert", dest="use_self_cert",
                      action="store_true",
                      help="Use a self-signed certificate.")
  parser.add_argument("--no-use_self_cert", dest="use_self_cert",
                      action="store_false",
                      help="Do not use a self-signed certificate.")
  parser.set_defaults(use_self_cert=True)

  args = parser.parse_args()

  util.maybe_activate_service_account()

  # Wait for credentials to deal with workload identity issues.
  gcp_util.get_gcp_credentials()

  # Wrap gcloud commands in a retry loop to deal with metadata / workload
  # identity issues.
  @retrying.retry(stop_max_delay=5*60*1000, wait_exponential_max=10000)
  def _gcloud_list():
    # For debugging purposes output the command.
    util.run(["gcloud", "config", "list"])
    util.run(["gcloud", "auth", "list"])
  _gcloud_list()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  if args.kubeflow_repo and args.kfctl_path:
    raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path needs "
                     "to be set.")

  if not args.kubeflow_repo and not args.kfctl_path:
    raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path needs "
                     "to be set.")

  git_describe = ""
  if args.kubeflow_repo:
    git_describe = util.run(["git", "describe", "--tags", "--always",
                             "--dirty"],
                            cwd=args.kubeflow_repo).strip("'")

    kfctl_path = build_kfctl_go(args)
  else:
    if args.kfctl_path.startswith("http"):
      temp_dir = tempfile.mkdtemp()
      filename = "kfctl"
      zipped = False
      if args.kfctl_path.endswith(".tar.gz"):
        zipped = True
        filename = filename + ".tar.gz"

      util.run(["curl", "-L", "-o", filename, args.kfctl_path],
               cwd=temp_dir)
      if zipped:
        util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)

      kfctl_path = os.path.join(temp_dir, "kfctl")
      logging.info("Changing permissions on %s", kfctl_path)
      os.chmod(kfctl_path, 0o777)
    else:
      kfctl_path = args.kfctl_path
    git_describe = util.run([kfctl_path, "version"])

  logging.info("kfctl path set to %s", kfctl_path)

  # We need to keep the name short to avoid hitting limits with certificates.
  uid = datetime.datetime.now().strftime("%m%d") + "-"
  uid = uid + uuid.uuid4().hex[0:3]

  args.name = args.name.format(uid=uid)
  logging.info("Using name %s", args.name)

  app_dir = os.path.join(args.apps_dir, args.name)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # GCP labels can only take alphanumeric characters, hyphens, and
  # underscores as input. Replace invalid characters with hyphens.
  labels = {"kfctl-git": git_describe,
            "purpose": "kf-test-cluster",
            "auto-deploy": "true"}

  for k, v in labels.items():
    val = v.lower().replace("\"", "")
    val = re.sub(r"[^a-z0-9\-_]", "-", val)
    labels[k] = val

  if args.labels:
    logging.info("Parsing labels %s", args.labels)
    for pair in args.labels.split(","):
      pieces = pair.split("=")
      if len(pieces) != 2:
        logging.error("Skipping pair %s; not of the form key=value", pair)
        continue
      key = pieces[0].strip()
      value = pieces[1].strip()
      labels[key] = value
  logging.info("labels: %s", labels)

  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
  add_extra_users(args.project, args.extra_users)
def run_papermill_job(notebook_path, name, namespace,  # pylint: disable=too-many-branches,too-many-statements
                      repos, image):
  """Generate a K8s job to run a notebook using papermill.

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job.
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries to infer based
      on PROW environment variables.
    image: The docker image to run the notebook in.
  """

  util.maybe_activate_service_account()

  with open("job.yaml") as hf:
    job = yaml.safe_load(hf)

  if notebook_path.startswith("/"):
    raise ValueError(
        "notebook_path={0} should not start with /".format(notebook_path))

  # We need to checkout the correct version of the code in presubmits and
  # postsubmits. We check the prow environment variables to get the
  # appropriate values. See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
    repos = argo_build_util.get_repo_from_prow_env()

  if not repos:
    raise ValueError("Could not get repos from the prow environment "
                     "variables and --repos isn't explicitly set")

  repos += ",kubeflow/testing@HEAD"

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
      "/usr/local/bin/checkout_repos.sh",
      "--repos=" + repos,
      "--src_dir=/src",
      "--depth=all",
  ]

  job["spec"]["template"]["spec"]["containers"][0]["image"] = image

  full_notebook_path = os.path.join("/src", notebook_path)
  job["spec"]["template"]["spec"]["containers"][0]["command"] = [
      "python3", "-m", "kubeflow.examples.notebook_tests.execute_notebook",
      "--notebook_path", full_notebook_path]

  job["spec"]["template"]["spec"]["containers"][0][
      "workingDir"] = os.path.dirname(full_notebook_path)

  # The prow bucket to use for results/artifacts.
  prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

  if os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
    # Running under prow.
    prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
    logging.info("Prow artifacts dir: %s", prow_dir)
    prow_dir = os.path.join(prow_dir, "artifacts")

    if os.getenv("TEST_TARGET_NAME"):
      prow_dir = os.path.join(
          prow_dir, os.getenv("TEST_TARGET_NAME").lstrip("/"))
    prow_bucket, prow_path = util.split_gcs_uri(prow_dir)
  else:
    prow_path = "notebook-test" + datetime.datetime.now().strftime("%H%M%S")
    prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
    prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

  prow_path = os.path.join(prow_path, name + ".html")
  output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

  job["spec"]["template"]["spec"]["containers"][0]["env"] = [
      {"name": "OUTPUT_GCS", "value": output_gcs},
      {"name": "PYTHONPATH",
       "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"},
  ]

  logging.info("Notebook will be written to %s", output_gcs)
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("notebook-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  final_job = util.wait_for_job(api_client, namespace, name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  # Download the notebook html to the artifacts directory.
  logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

  storage_client = storage.Client()
  bucket = storage_client.get_bucket(NB_BUCKET)
  blob = bucket.get_blob(prow_path)

  destination_bucket = storage_client.get_bucket(prow_bucket)
  bucket.copy_blob(blob, destination_bucket)

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1} did not complete".format(
        namespace, name))

  last_condition = final_job.status.conditions[-1]
  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
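# Illustrative invocation of run_papermill_job (the values below are made
# up; in CI this is typically driven by a CLI wrapper rather than called
# directly):
run_papermill_job(
    notebook_path="kubeflow/examples/mnist/mnist.ipynb",
    name="",  # Empty: a unique name is generated from the timestamp.
    namespace="default",
    repos="",  # Empty: inferred from the prow environment variables.
    image="gcr.io/kubeflow-ci/test-worker:latest")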
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument("--project",
                      default="kubeflow-ci-deployment",
                      type=str,
                      help="The project.")
  parser.add_argument("--zone",
                      default="us-east1-d",
                      type=str,
                      help="The zone to deploy in.")
  parser.add_argument(
      "--oauth_file",
      default=("gs://kubeflow-ci-deployment_kf-data/"
               "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")
  parser.add_argument("--kubeflow_repo",
                      default="/home/jlewi/git_kubeflow",
                      type=str,
                      help="Path to the Kubeflow repo to use.")
  parser.add_argument(
      "--kfctl_config",
      default=("https://raw.githubusercontent.com/kubeflow/kubeflow/master"
               "/bootstrap/config/kfctl_gcp_iap.yaml"),
      type=str,
      help="Path to the kfctl config to use.")
  parser.add_argument("--apps_dir",
                      default=os.getcwd(),
                      type=str,
                      help="Directory to store kubeflow apps.")
  parser.add_argument("--name",
                      type=str,
                      default="",
                      help="Name for the deployment.")
  parser.add_argument("--snapshot_file",
                      default="",
                      type=str,
                      help=("A json file containing information about the "
                            "snapshot to use."))
  parser.add_argument("--job_name",
                      default="",
                      type=str,
                      help="Pod name running the job.")
  args = parser.parse_args()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(
      ["git", "describe", "--tags", "--always", "--dirty"],
      cwd=args.kubeflow_repo).strip("'")

  if args.snapshot_file:
    logging.info("Loading info from snapshot file %s", args.snapshot_file)
    with open(args.snapshot_file) as hf:
      snapshot_info = json.load(hf)
      name = snapshot_info["name"]
  else:
    name = args.name
    # No snapshot file was provided; default to an empty repo list so the
    # self-signed-certificate check below is skipped rather than raising
    # a NameError.
    snapshot_info = {"repos": []}

  kfctl_path = build_kfctl_go(args)

  app_dir = os.path.join(args.apps_dir, name)
  # Clean up the previous deployment. We attempt to run "kfctl delete all"
  # but we don't depend on it succeeding because the app directory might
  # not be up to date.
  if os.path.exists(app_dir):
    try:
      util.run([kfctl_path, "delete", "all", "--delete_storage"],
               cwd=app_dir)
    except subprocess.CalledProcessError as e:
      logging.error("kfctl delete all failed; %s", e)

    if os.path.exists(app_dir):
      shutil.rmtree(app_dir)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  # Delete the deployment beforehand. If we don't, the update action might
  # fail when resource permissions/requirements change. It's cleaner to
  # delete and re-create it.
  delete_deployment = os.path.join(args.kubeflow_repo, "scripts", "gke",
                                   "delete_deployment.sh")

  util.run([
      delete_deployment, "--project=" + args.project,
      "--deployment=" + name, "--zone=" + args.zone
  ], cwd=args.apps_dir)

  # The delete script doesn't delete the storage deployment by design.
  delete_storage_deployment(args.project, name + "-storage")

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  labels = {
      "GIT_LABEL": git_describe,
      "PURPOSE": "kf-test-cluster",
  }

  label_args = []
  for k, v in labels.items():
    # Labels can only take alphanumeric characters, hyphens, and
    # underscores as input. Replace invalid characters with hyphens.
    val = v.lower().replace("\"", "")
    val = re.sub(r"[^a-z0-9\-_]", "-", val)
    label_args.append("{key}={val}".format(key=k.lower(), val=val))

  endpoint = "{name}.endpoints.{project}.cloud.goog".format(
      name=name, project=args.project)

  # Fire-and-forget attempt to undelete endpoint services. Deleting an
  # endpoint service is a soft delete; it is only purged after 30 days.
  # If a deployment tries to re-use an endpoint that is in soft-deletion,
  # it will fail, so we need to undelete it so that the endpoint
  # controller can complete its job.
  try:
    util.run([
        "gcloud", "endpoints", "services", "undelete", endpoint,
        "--verbosity=info", "--project=" + args.project
    ])
  except subprocess.CalledProcessError as e:
    logging.info("endpoint undeletion failed: %s", e)

  deploy_with_kfctl_go(kfctl_path, args, app_dir, env)

  create_info_file(args, app_dir, git_describe)

  logging.info("Annotating cluster with labels: %s", str(label_args))

  # Set labels on the deployment.
  util.run([
      "gcloud", "--project", args.project, "deployment-manager",
      "deployments", "update", name, "--update-labels",
      ",".join(label_args)
  ], cwd=app_dir)

  # Set labels on the cluster as well. Labels on the deployment are not
  # shown in Pantheon; it's easier for users if the cluster also has them.
  util.run([
      "gcloud", "container", "clusters", "update", name, "--project",
      args.project, "--zone", args.zone, "--update-labels",
      ",".join(label_args)
  ], cwd=app_dir)

  # To work around Let's Encrypt certificate quota issues, create a
  # self-signed certificate.
  kubeflow_branch = None
  for repo in snapshot_info["repos"]:
    if repo["repo"] == "kubeflow":
      kubeflow_branch = repo["branch"]

  logging.info("kubeflow branch %s", kubeflow_branch)

  if kubeflow_branch == "v0.6-branch":
    logging.info("Creating a self signed certificate")
    util.run(["kubectl", "config", "use-context", name])
    tls_endpoint = "--host={0}.endpoints.{1}.cloud.goog".format(
        name, args.project)

    cert_dir = tempfile.mkdtemp()
    util.run(["kube-rsa", tls_endpoint], cwd=cert_dir)
    util.run([
        "kubectl", "-n", "kubeflow", "create", "secret", "tls",
        "envoy-ingress-tls", "--cert=ca.pem", "--key=ca-key.pem"
    ], cwd=cert_dir)
    shutil.rmtree(cert_dir)
  else:
    # Starting with 0.7 we are moving to managed GKE certificates, so we
    # can't just generate a self-signed certificate.
    # TODO(jlewi): If we still hit Let's Encrypt quota issues then we can
    # fix this by generating new hostnames.
    logging.info("Not creating a self signed certificate")
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--base_name",
      default="kf-v0-4",
      type=str,
      help="The base name for the deployment, typically kf-vX-Y or kf-vmaster.")
  parser.add_argument("--project",
                      default="kubeflow-ci",
                      type=str,
                      help="The project.")
  parser.add_argument("--zone",
                      default="us-east1-d",
                      type=str,
                      help="The zone to deploy in.")
  parser.add_argument(
      "--oauth_file",
      default="gs://kubeflow-ci_kf-data/kf-iap-oauth.kubeflow-ci.yaml",
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")
  parser.add_argument("--kubeflow_repo",
                      default="/home/jlewi/git_kubeflow",
                      type=str,
                      help="Path to the Kubeflow repo to use.")
  parser.add_argument("--apps_dir",
                      default=os.getcwd(),
                      type=str,
                      help="Directory to store kubeflow apps.")
  parser.add_argument(
      "--deployment_worker_cluster",
      default="kubeflow-testing",
      type=str,
      help="Name of the cluster the deployment cronjob workers use.")
  parser.add_argument("--cluster_num",
                      default="",
                      type=int,
                      help="Number of the cluster to deploy to.")
  parser.add_argument("--timestamp",
                      default="",
                      type=str,
                      help="Timestamp at which the deployment takes the snapshot.")
  parser.add_argument("--job_name",
                      default="",
                      type=str,
                      help="Pod name running the job.")
  args = parser.parse_args()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(
      ["git", "describe", "--tags", "--always", "--dirty"],
      cwd=args.kubeflow_repo).strip("'")

  # TODO(https://github.com/kubeflow/testing/issues/95): We want to cycle
  # between N different names, e.g.
  # kf-vX-Y-n00, kf-vX-Y-n01, ... kf-vX-Y-n05.
  # The reason to reuse names is that for IAP we need to manually set the
  # redirect URIs, so we want to cycle between a set of known endpoints.
  # We should add logic to automatically recycle deployments; i.e. we
  # should find the oldest one and reuse that.
  num = args.cluster_num
  name = "{0}-n{1:02d}".format(args.base_name, num)

  # Clean up the previous deployment. We are not able to run
  # "kfctl delete all" since we can't guarantee that the apps config in
  # the repository is up to date.
  util.run(["rm", "-rf", name], cwd=args.apps_dir)

  # Delete the deployment beforehand. If we don't, the update action might
  # fail when resource permissions/requirements change. It's cleaner to
  # delete and re-create it.
  delete_deployment = os.path.join(args.kubeflow_repo, "scripts", "gke",
                                   "delete_deployment.sh")
  util.run([
      delete_deployment, "--project=" + args.project,
      "--deployment=" + name, "--zone=" + args.zone
  ], cwd=args.apps_dir)

  # Create a dummy kubeconfig in the cronjob worker.
util.run([ "gcloud", "container", "clusters", "get-credentials", args.deployment_worker_cluster, "--zone", args.zone, "--project", args.project ], cwd=args.apps_dir) app_dir = os.path.join(args.apps_dir, name) kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh") ks_app_dir = os.path.join(app_dir, "ks_app") util.run([ kfctl, "init", name, "--project", args.project, "--zone", args.zone, "--platform", "gcp", "--skipInitProject", "true" ], cwd=args.apps_dir) labels = {} with open(os.path.join(app_dir, "kf_app.yaml"), "w") as hf: app = { "labels": { "GIT_LABEL": git_describe, "PURPOSE": "kf-test-cluster", }, } if args.timestamp: app["labels"]["SNAPSHOT_TIMESTAMP"] = args.timestamp if args.job_name: app["labels"]["DEPLOYMENT_JOB"] = args.job_name labels = app.get("labels", {}) yaml.dump(app, hf) label_args = [] for k, v in labels.items(): # labels can only take as input alphanumeric characters, hyphens, and # underscores. Replace not valid characters with hyphens. val = v.lower().replace("\"", "") val = re.sub(r"[^a-z0-9\-_]", "-", val) label_args.append("{key}={val}".format(key=k.lower(), val=val)) util.run([kfctl, "generate", "all"], cwd=app_dir) util.run(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir) env = {} env.update(os.environ) env.update(oauth_info) # kfctl apply all might break during cronjob invocation when depending # components are not ready. Make it retry several times should be enough. kfctl_apply_with_retry(kfctl, app_dir, env) logging.info("Annotating cluster with labels: %s", str(label_args)) util.run([ "gcloud", "container", "clusters", "update", name, "--zone", args.zone, "--update-labels", ",".join(label_args) ], cwd=app_dir)
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--base_name",
      default="kf-v0-4",
      type=str,
      help="The base name for the deployment, typically kf-vX-Y or kf-vmaster.")
  parser.add_argument("--project",
                      default="kubeflow-ci",
                      type=str,
                      help="The project.")
  parser.add_argument("--zone",
                      default="us-east1-d",
                      type=str,
                      help="The zone to deploy in.")
  parser.add_argument(
      "--oauth_file",
      default="gs://kubeflow-ci_kf-data/kf-iap-oauth.kubeflow-ci.yaml",
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")
  parser.add_argument("--kubeflow_repo",
                      default="/home/jlewi/git_kubeflow",
                      type=str,
                      help="Path to the Kubeflow repo to use.")
  parser.add_argument("--apps_dir",
                      default=os.getcwd(),
                      type=str,
                      help="Directory to store kubeflow apps.")
  parser.add_argument(
      "--deployment_worker_cluster",
      default="kubeflow-testing",
      type=str,
      help="Name of the cluster the deployment cronjob workers use.")
  parser.add_argument("--cluster_num",
                      default="",
                      type=int,
                      help="Number of the cluster to deploy to.")
  args = parser.parse_args()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(
      ["git", "describe", "--tags", "--always", "--dirty"],
      cwd=args.kubeflow_repo).strip("'")

  # TODO(https://github.com/kubeflow/testing/issues/95): We want to cycle
  # between N different names, e.g.
  # kf-vX-Y-n00, kf-vX-Y-n01, ... kf-vX-Y-n05.
  # The reason to reuse names is that for IAP we need to manually set the
  # redirect URIs, so we want to cycle between a set of known endpoints.
  # We should add logic to automatically recycle deployments; i.e. we
  # should find the oldest one and reuse that.
  num = args.cluster_num
  name = "{0}-n{1:02d}".format(args.base_name, num)

  # Clean up the previous deployment. We are not able to run
  # "kfctl delete all" since we can't guarantee that the apps config in
  # the repository is up to date.
  util.run(["rm", "-rf", name], cwd=args.apps_dir)

  # TODO(gabrielwen): https://github.com/kubeflow/testing/issues/295
  # 1. Is deployment deletion still needed?
  # 2. If it is, figure out the permission setup for it.
  # 3. Should use
  # https://github.com/kubeflow/kubeflow/blob/master/scripts/gke/delete_deployment.sh

  # Create a dummy kubeconfig in the cronjob worker.
  util.run([
      "gcloud", "container", "clusters", "get-credentials",
      args.deployment_worker_cluster, "--zone", args.zone, "--project",
      args.project
  ], cwd=args.apps_dir)

  app_dir = os.path.join(args.apps_dir, name)
  kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh")

  util.run([
      kfctl, "init", name, "--project", args.project, "--zone", args.zone,
      "--platform", "gcp", "--skipInitProject", "true"
  ], cwd=args.apps_dir)

  with open(os.path.join(app_dir, "kf_app.yaml"), "w") as hf:
    app = {
        "labels": {
            "GIT_LABEL": git_describe,
            "PURPOSE": "kf-test-cluster",
            "CREATOR": getpass.getuser(),
        },
    }
    yaml.dump(app, hf)

  util.run([kfctl, "generate", "all"], cwd=app_dir)

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # kfctl apply all might break during a cronjob invocation when the
  # components it depends on are not ready. Retrying several times should
  # be enough.
  kfctl_apply_with_retry(kfctl, app_dir, env)
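# A plausible sketch of kfctl_apply_with_retry (hypothetical body; assumes
# the module-level util and retrying imports), mirroring the retry pattern
# used for the gcloud commands elsewhere in these scripts:
@retrying.retry(stop_max_attempt_number=3, wait_exponential_max=10000)
def _kfctl_apply_all(kfctl, app_dir, env):
  util.run([kfctl, "apply", "all"], cwd=app_dir, env=env)


def kfctl_apply_with_retry(kfctl, app_dir, env):
  _kfctl_apply_all(kfctl, app_dir, env)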
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument("--project",
                      default="kubeflow-ci",
                      type=str,
                      help="The project.")
  parser.add_argument("--zone",
                      default="us-east1-d",
                      type=str,
                      help="The zone to deploy in.")
  parser.add_argument(
      "--oauth_file",
      default="gs://kubeflow-ci_kf-data/kf-iap-oauth.kubeflow-ci.yaml",
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")
  parser.add_argument("--kubeflow_repo",
                      default="/home/jlewi/git_kubeflow",
                      type=str,
                      help="Path to the Kubeflow repo to use.")
  parser.add_argument("--apps_dir",
                      default=os.getcwd(),
                      type=str,
                      help="Directory to store kubeflow apps.")
  parser.add_argument("--name",
                      default="",
                      type=str,
                      help="Name for the deployment.")
  parser.add_argument("--snapshot_file",
                      default="",
                      type=str,
                      help=("A json file containing information about the "
                            "snapshot to use."))
  parser.add_argument("--timestamp",
                      default="",
                      type=str,
                      help="Timestamp at which the deployment takes the snapshot.")
  parser.add_argument("--job_name",
                      default="",
                      type=str,
                      help="Pod name running the job.")
  args = parser.parse_args()

  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(
      ["git", "describe", "--tags", "--always", "--dirty"],
      cwd=args.kubeflow_repo).strip("'")

  timestamp = args.timestamp
  if args.snapshot_file:
    logging.info("Loading info from snapshot file %s", args.snapshot_file)
    with open(args.snapshot_file) as hf:
      snapshot_info = json.load(hf)
      name = snapshot_info["name"]
      timestamp = snapshot_info.get("timestamp", "")
  else:
    name = args.name

  # Clean up the previous deployment. We are not able to run
  # "kfctl delete all" since we can't guarantee that the apps config in
  # the repository is up to date.
  util.run(["rm", "-rf", name], cwd=args.apps_dir)

  # Delete the deployment beforehand. If we don't, the update action might
  # fail when resource permissions/requirements change. It's cleaner to
  # delete and re-create it.
  delete_deployment = os.path.join(args.kubeflow_repo, "scripts", "gke",
                                   "delete_deployment.sh")
  util.run([
      delete_deployment, "--project=" + args.project,
      "--deployment=" + name, "--zone=" + args.zone
  ], cwd=args.apps_dir)

  # The delete script doesn't delete the storage deployment by design.
  delete_storage_deployment(args.project, name + "-storage")

  app_dir = os.path.join(args.apps_dir, name)
  kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh")
  ks_app_dir = os.path.join(app_dir, "ks_app")

  util.run([
      kfctl, "init", name, "--project", args.project, "--zone", args.zone,
      "--platform", "gcp", "--skipInitProject", "true"
  ], cwd=args.apps_dir)

  labels = {}
  with open(os.path.join(app_dir, "kf_app.yaml"), "w") as hf:
    app = {
        "labels": {
            "GIT_LABEL": git_describe,
            "PURPOSE": "kf-test-cluster",
        },
    }
    if timestamp:
      app["labels"]["SNAPSHOT_TIMESTAMP"] = timestamp
    if args.job_name:
      app["labels"]["DEPLOYMENT_JOB"] = args.job_name
    labels = app.get("labels", {})
    yaml.dump(app, hf)

  label_args = []
  for k, v in labels.items():
    # Labels can only take alphanumeric characters, hyphens, and
    # underscores as input. Replace invalid characters with hyphens.
    val = v.lower().replace("\"", "")
    val = re.sub(r"[^a-z0-9\-_]", "-", val)
    label_args.append("{key}={val}".format(key=k.lower(), val=val))

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # We need to apply the platform before generating the k8s resources
  # because ksonnet needs a cluster to exist.
  # kfctl apply might break during a cronjob invocation when the
  # components it depends on are not ready. Retrying several times should
  # be enough.
  run_with_retry([kfctl, "generate", "platform"], cwd=app_dir, env=env)
  run_with_retry([kfctl, "apply", "platform"], cwd=app_dir, env=env)
  run_with_retry([kfctl, "generate", "k8s"], cwd=app_dir, env=env)
  run_with_retry([kfctl, "apply", "k8s"], cwd=app_dir, env=env)
  run_with_retry(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir,
                 env=env)

  logging.info("Annotating cluster with labels: %s", str(label_args))

  # Set labels on the deployment.
  util.run([
      "gcloud", "--project", args.project, "deployment-manager",
      "deployments", "update", name, "--update-labels",
      ",".join(label_args)
  ], cwd=app_dir)

  # To work around Let's Encrypt certificate quota issues, create a
  # self-signed certificate.
  util.run([
      "gcloud", "container", "clusters", "get-credentials", name, "--zone",
      args.zone, "--project", args.project
  ])
  tls_endpoint = "--host=%s.endpoints.kubeflow-ci.cloud.goog" % name
  util.run(["kube-rsa", tls_endpoint])
  util.run([
      "kubectl", "-n", "kubeflow", "create", "secret", "tls",
      "envoy-ingress-tls", "--cert=ca.pem", "--key=ca-key.pem"
  ])
def main():  # pylint: disable=too-many-locals,too-many-statements
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      "--base_name", default="kf-v0-4", type=str,
      help="The base name for the deployment, typically kf-vX-Y or kf-vmaster.")

  parser.add_argument(
      "--project", default="kubeflow-ci", type=str, help="The project.")

  parser.add_argument(
      "--zone", default="us-east1-d", type=str,
      help="The zone to deploy in.")

  parser.add_argument(
      "--oauth_file",
      default="gs://kubeflow-ci_kf-data/kf-iap-oauth.kubeflow-ci.yaml",
      type=str,
      help="The file containing the OAuth client ID & secret for IAP.")

  parser.add_argument(
      "--kubeflow_repo", default="/home/jlewi/git_kubeflow", type=str,
      help="Path to the Kubeflow repo to use.")

  parser.add_argument(
      "--apps_dir", default=os.getcwd(), type=str,
      help="Directory to store kubeflow apps.")

  args = parser.parse_args()
  bucket_name, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(
      ["git", "describe", "--tags", "--always", "--dirty"],
      cwd=args.kubeflow_repo).strip("'")

  # TODO(https://github.com/kubeflow/testing/issues/95): We want to cycle
  # between N different names, e.g.
  # kf-vX-Y-n00, kf-vX-Y-n01, ... kf-vX-Y-n05.
  # The reason to reuse names is that for IAP we need to manually set the
  # redirect URIs, so we want to cycle between a set of known endpoints.
  # We should add logic to automatically recycle deployments; i.e. we
  # should find the oldest one and reuse that.
  num = 0
  name = "{0}-n{1:02d}".format(args.base_name, num)

  app_dir = os.path.join(args.apps_dir, name)
  kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh")
  util.run([kfctl, "init", name, "--project", args.project, "--zone",
            args.zone, "--platform", "gcp", "--skipInitProject", "true"],
           cwd=args.apps_dir)

  with open(os.path.join(app_dir, "kf_app.yaml"), "w") as hf:
    app = {
        "labels": {
            "GIT_LABEL": git_describe,
            "PURPOSE": "kf-test-cluster",
            "CREATOR": getpass.getuser(),
        },
    }
    yaml.dump(app, hf)

  util.run([kfctl, "generate", "all"], cwd=app_dir)

  env = {}
  env.update(os.environ)
  env.update(oauth_info)
  util.run([kfctl, "apply", "all"], cwd=app_dir, env=env)