def main():  # pylint: disable=too-many-locals,too-many-statements
  """Entry point: parse CLI arguments and run the selected subcommand.

  Configures INFO-level logging plus a per-run log file under the artifacts
  directory (so logs can be uploaded to gubernator), then dispatches to the
  handler bound to the chosen subparser via wrap_test.
  """
  logging.getLogger().setLevel(logging.INFO)

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
    "--test_dir",
    default="",
    type=str,
    help="Directory to use for all the test files. If not set a temporary "
    "directory is created.")

  parser.add_argument(
    "--artifacts_dir",
    default="",
    type=str,
    help="Directory to use for artifacts that should be preserved after "
    "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
    "--as_gcloud_user",
    dest="as_gcloud_user",
    action="store_true",
    help=("Impersonate the user corresponding to the gcloud "
          "command with kubectl and ks."))
  parser.add_argument(
    "--no-as_gcloud_user", dest="as_gcloud_user", action="store_false")
  parser.set_defaults(as_gcloud_user=False)

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument(
    "--project", default=None, type=str, help="The project to use.")

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument(
    "--namespace", default=None, type=str, help=("The namespace to use."))

  parser.add_argument(
    "--github_token",
    default=None,
    type=str,
    help=("The GitHub API token to use. This is needed since ksonnet uses the "
          "GitHub API and without it we get rate limited. For more info see: "
          "https://github.com/ksonnet/ksonnet/blob/master/docs"
          "/troubleshooting.md. Can also be set using environment variable "
          "GITHUB_TOKEN."))

  parser.add_argument(
    "--deploy_name", default="", type=str, help="The name of the deployment.")

  parser.add_argument(
    "--workflow_name", default="", type=str, help="The name of the workflow.")

  subparsers = parser.add_subparsers()

  parser_teardown = subparsers.add_parser(
    "teardown", help="teardown the test infrastructure.")
  parser_teardown.set_defaults(func=teardown)

  parser_tf_serving = subparsers.add_parser(
    "deploy_model", help="Deploy a TF serving model.")
  parser_tf_serving.set_defaults(func=deploy_model)
  parser_tf_serving.add_argument(
    "--params",
    default="",
    type=str,
    help=("Comma separated list of parameters to set on the model."))

  parser_pytorch_job = subparsers.add_parser(
    "deploy_pytorchjob", help="Deploy a pytorch-job")
  parser_pytorch_job.set_defaults(func=deploy_pytorchjob)
  parser_pytorch_job.add_argument(
    "--params",
    default="",
    type=str,
    help=("Comma separated list of parameters to set on the model."))

  parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")
  parser_argo_job.set_defaults(func=deploy_argo)

  parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")
  parser_katib_test.set_defaults(func=test_katib)

  parser_minikube = subparsers.add_parser(
    "deploy_minikube", help="Setup a K8s cluster on minikube.")
  parser_minikube.set_defaults(func=deploy_minikube)
  parser_minikube.add_argument(
    "--vm_name", required=True, type=str, help="The name of the VM to use.")
  parser_minikube.add_argument(
    "--zone",
    default="us-east1-d",
    type=str,
    help="The zone for the cluster.")

  parser_teardown_minikube = subparsers.add_parser(
    "teardown_minikube", help="Delete the VM running minikube.")
  parser_teardown_minikube.set_defaults(func=teardown_minikube)
  parser_teardown_minikube.add_argument(
    "--zone",
    default="us-east1-d",
    type=str,
    help="The zone for the cluster.")
  parser_teardown_minikube.add_argument(
    "--vm_name", required=True, type=str, help="The name of the VM to use.")

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")
    now = datetime.datetime.now()
    label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]
    # Create a temporary directory for this test run
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  # The log file name encodes the subcommand and deployment so parallel runs
  # writing into the same artifacts_dir do not collide.
  test_log = os.path.join(
    args.artifacts_dir, "logs",
    "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")
  try:
    os.makedirs(os.path.dirname(test_log))
  except OSError as exc:  # Python >2.5
    # Tolerate the directory already existing; re-raise anything else.
    if exc.errno == errno.EEXIST and os.path.isdir(os.path.dirname(test_log)):
      pass
    else:
      raise

  # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(
    fmt=("%(levelname)s|%(asctime)s"
         "|%(pathname)s|%(lineno)d| %(message)s"),
    datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  util.run([ks, "version"])

  util.maybe_activate_service_account()

  # Removed an unused local that expanded
  # kube_config.KUBE_CONFIG_DEFAULT_LOCATION; the value was never read.

  # Print out the config to help debugging.
  output = util.run_and_output(["gcloud", "config", "config-helper"])
  logging.info("gcloud config: \n%s", output)
  wrap_test(args)
def get_gcp_identity():
  """Return the account gcloud is currently configured to use.

  Returns:
    The output of `gcloud config get-value account` (whitespace included,
    as returned by util.run_and_output).
  """
  account = util.run_and_output(["gcloud", "config", "get-value", "account"])
  logging.info("Current GCP account: %s", account)
  return account
def setup_kubeflow(args):
  """Setup Kubeflow.

  Deploys the kubeflow-core ksonnet component to an existing GKE cluster,
  waits for the TfJob operator deployment to come up, and records the
  outcome as a junit test case.

  Args:
    args: Command line arguments that control the setup process.
  """
  project = args.project
  cluster_name = args.cluster
  zone = args.zone

  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  test_case = test_util.TestCase()
  try:
    start_time = time.time()

    component = "core"
    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion": args.tf_job_version,
    }

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)

    ks_deploy(args.test_app_dir, component, params, account=account)

    # Verify that the TfJob operator is actually deployed.
    deployment_for_version = {
      "v1alpha2": "tf-job-operator-v1alpha2",
      "v1beta1": "tf-job-operator-v1beta1",
    }
    if args.tf_job_version not in deployment_for_version:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    tf_job_deployment_name = deployment_for_version[args.tf_job_version]

    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct
    # one.
    try:
      util.wait_for_deployment(api_client, args.namespace,
                               tf_job_deployment_name)
    finally:
      # Run kubectl describe to get useful information about the deployment.
      # This will help troubleshoot any errors.
      util.run([
        "kubectl", "-n", args.namespace, "describe", "deploy",
        tf_job_deployment_name
      ])
      util.run([
        "kubectl", "-n", args.namespace, "describe", "pods", "-l",
        "name=tf-job-operator"
      ])

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    test_case.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    test_case.failure = e.message
    raise
  finally:
    test_case.time = time.time() - start_time
    test_case.name = "kubeflow-deploy"
    test_case.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
def setup_cluster(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster (enabling Kubernetes Alpha when accelerators are
  requested), points kubectl at it, grants the active gcloud account
  cluster-admin, and records the outcome as a junit test case.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    # Each spec is "<type>=<count>"; split only on the first "=".
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount": accelerator_count,
        "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  # NOTE(review): api_client is not referenced in the visible remainder of
  # this function; the source span below was redacted, so it may have been
  # used there — confirm against version control.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    # Grant the active account cluster-admin so later steps can create
    # cluster-scoped resources.
    # NOTE(review): the span between "--user=" and the except clause was
    # redacted ("******") in the source; reconstructed by parallel with
    # setup() — confirm against version control.
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user=" + account
    ])
  except subprocess.CalledProcessError as e:
    # NOTE(review): e.output may be None (hence the "or") — presumably also
    # str not bytes in this project's util.run wrapper; verify.
    t.failure = "setup-cluster failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    # Always record timing and emit the junit XML, pass or fail.
    t.time = time.time() - start
    t.name = "setup-cluster"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster, points kubectl at it, grants the active gcloud
  account cluster-admin, deploys the kubeflow-core component, waits for
  the TfJob operator deployment, and records the outcome as a junit test
  case.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    # Each spec is "<type>=<count>"; split only on the first "=".
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount": accelerator_count,
        "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()
    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion": args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    # NOTE(review): the span between "--user=" and the "v1alpha1" check was
    # redacted ("******") in the source; the clusterrolebinding argv close
    # and the ks_deploy call below are reconstructed by parallel with
    # setup_kubeflow() and setup_cluster() — confirm against version
    # control.
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user=" + account
    ])

    ks_deploy(args.test_app_dir, component, params, account=account)

    # Verify that the TfJob operator is actually deployed.
    if args.tf_job_version == "v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct.
    util.wait_for_deployment(api_client, args.namespace,
                             tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    # NOTE(review): e.output may be None (hence the "or") — presumably also
    # str not bytes in this project's util.run wrapper; verify.
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    # Always record timing and emit the junit XML, pass or fail.
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)