Example #1
def main():  # pylint: disable=too-many-locals,too-many-statements
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

    parser.add_argument(
        "--test_dir",
        default="",
        type=str,
        help="Directory to use for all the test files. If not set a temporary "
        "directory is created.")

    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")

    parser.add_argument(
        "--as_gcloud_user",
        dest="as_gcloud_user",
        action="store_true",
        help=("Impersonate the user corresponding to the gcloud "
              "command with kubectl and ks."))
    parser.add_argument("--no-as_gcloud_user",
                        dest="as_gcloud_user",
                        action="store_false")
    parser.set_defaults(as_gcloud_user=False)

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--project",
                        default=None,
                        type=str,
                        help="The project to use.")

    # TODO(jlewi): This should not be a global flag.
    parser.add_argument("--namespace",
                        default=None,
                        type=str,
                        help=("The namespace to use."))

    parser.add_argument(
        "--github_token",
        default=None,
        type=str,
        help=("The GitHub API token to use. This is needed since ksonnet "
              "uses the GitHub API and without it we get rate limited. For "
              "more info see: "
              "https://github.com/ksonnet/ksonnet/blob/master/docs"
              "/troubleshooting.md. Can also be set using the environment "
              "variable GITHUB_TOKEN."))

    parser.add_argument("--deploy_name",
                        default="",
                        type=str,
                        help="The name of the deployment.")

    parser.add_argument("--workflow_name",
                        default="",
                        type=str,
                        help="The name of the workflow.")

    subparsers = parser.add_subparsers()

    parser_teardown = subparsers.add_parser(
        "teardown", help="teardown the test infrastructure.")

    parser_teardown.set_defaults(func=teardown)

    parser_tf_serving = subparsers.add_parser(
        "deploy_model", help="Deploy a TF serving model.")

    parser_tf_serving.set_defaults(func=deploy_model)

    parser_tf_serving.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_pytorch_job = subparsers.add_parser("deploy_pytorchjob",
                                               help="Deploy a pytorch-job")

    parser_pytorch_job.set_defaults(func=deploy_pytorchjob)

    parser_pytorch_job.add_argument(
        "--params",
        default="",
        type=str,
        help=("Comma separated list of parameters to set on the model."))

    parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")

    parser_argo_job.set_defaults(func=deploy_argo)

    parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")

    parser_katib_test.set_defaults(func=test_katib)

    parser_minikube = subparsers.add_parser(
        "deploy_minikube", help="Setup a K8s cluster on minikube.")

    parser_minikube.set_defaults(func=deploy_minikube)

    parser_minikube.add_argument("--vm_name",
                                 required=True,
                                 type=str,
                                 help="The name of the VM to use.")

    parser_minikube.add_argument("--zone",
                                 default="us-east1-d",
                                 type=str,
                                 help="The zone for the cluster.")

    parser_teardown_minikube = subparsers.add_parser(
        "teardown_minikube", help="Delete the VM running minikube.")

    parser_teardown_minikube.set_defaults(func=teardown_minikube)

    parser_teardown_minikube.add_argument("--zone",
                                          default="us-east1-d",
                                          type=str,
                                          help="The zone for the cluster.")

    parser_teardown_minikube.add_argument("--vm_name",
                                          required=True,
                                          type=str,
                                          help="The name of the VM to use.")

    args = parser.parse_args()

    if not args.test_dir:
        logging.info("--test_dir not set; using a temporary directory.")

        now = datetime.datetime.now()
        label = "test_deploy-" + now.strftime(
            "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

        # Create a temporary directory for this test run
        args.test_dir = os.path.join(tempfile.gettempdir(), label)

    if not args.artifacts_dir:
        args.artifacts_dir = args.test_dir

    test_log = os.path.join(
        args.artifacts_dir, "logs",
        "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")

    try:
        os.makedirs(os.path.dirname(test_log))
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(
                os.path.dirname(test_log)):
            pass
        else:
            raise

    # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
    # Setup a logging file handler. This way we can upload the log outputs
    # to gubernator.
    root_logger = logging.getLogger()

    file_handler = logging.FileHandler(test_log)
    root_logger.addHandler(file_handler)
    # We need to explicitly set the formatter because it will not pick up
    # the BasicConfig.
    formatter = logging.Formatter(
        fmt=("%(levelname)s|%(asctime)s"
             "|%(pathname)s|%(lineno)d| %(message)s"),
        datefmt="%Y-%m-%dT%H:%M:%S")
    file_handler.setFormatter(formatter)
    logging.info("Logging to %s", test_log)
    util.run([ks, "version"])

    util.maybe_activate_service_account()
    # KUBE_CONFIG_DEFAULT_LOCATION typically resolves to ~/.kube/config.
    config_file = os.path.expanduser(kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

    # Print out the config to help debugging.
    output = util.run_and_output(["gcloud", "config", "config-helper"])
    logging.info("gcloud config: \n%s", output)
    wrap_test(args)
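
Each subcommand binds its handler via set_defaults(func=...), and args.func is later used to name the log file, so wrap_test(args) presumably dispatches to args.func. A hedged usage sketch, assuming the script is saved as test_deploy.py (the file name and all flag values are illustrative):

python test_deploy.py --project=my-project deploy_model --params=name=mnist
python test_deploy.py deploy_minikube --vm_name=e2e-vm --zone=us-east1-d
python test_deploy.py teardown_minikube --vm_name=e2e-vm --zone=us-east1-d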
Example #2
def get_gcp_identity():
    identity = util.run_and_output(
        ["gcloud", "config", "get-value", "account"])
    logging.info("Current GCP account: %s", identity)
    return identity
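
A hedged usage sketch: the helper wraps "gcloud config get-value account", so callers may want to strip the trailing newline and handle the case where no account is configured:

identity = get_gcp_identity().strip()
if not identity:
    logging.warning("No gcloud account configured; run 'gcloud auth login'.")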
Example #3
def setup_kubeflow(args):
    """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
    project = args.project
    cluster_name = args.cluster
    zone = args.zone

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        params = {
            "tfJobImage": args.image,
            "name": "kubeflow-core",
            "namespace": args.namespace,
            "tfJobVersion": args.tf_job_version,
        }

        component = "core"

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)

        ks_deploy(args.test_app_dir, component, params, account=account)

        # Verify that the TfJob operator is actually deployed.
        if args.tf_job_version == "v1alpha2":
            tf_job_deployment_name = "tf-job-operator-v1alpha2"
        elif args.tf_job_version == "v1beta1":
            tf_job_deployment_name = "tf-job-operator-v1beta1"
        else:
            raise ValueError("Unrecognized value for tf_job_version %s" %
                             args.tf_job_version)
        logging.info("Verifying TfJob deployment %s started.",
                     tf_job_deployment_name)

        # TODO(jlewi): We should verify the image of the operator is the correct
        # one.
        try:
            util.wait_for_deployment(api_client, args.namespace,
                                     tf_job_deployment_name)
        finally:
            # Run kubectl describe to get useful information about the deployment.
            # This will help troubleshoot any errors.
            util.run([
                "kubectl", "-n", args.namespace, "describe", "deploy",
                tf_job_deployment_name
            ])
            util.run([
                "kubectl", "-n", args.namespace, "describe", "pods", "-l",
                "name=tf-job-operator"
            ])

    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
    except subprocess.CalledProcessError as e:
        t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "kubeflow-deploy"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
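
ks_deploy is defined outside this excerpt. A minimal sketch of what such a helper might do with the standard ksonnet CLI, assuming util.run forwards cwd to subprocess; the function body, the env name, and ignoring account are assumptions, not the real implementation:

def ks_deploy_sketch(app_dir, component, params, account=None, env="default"):
    # Hypothetical stand-in for ks_deploy: set each parameter on the
    # component, then apply the component to the environment. The real
    # helper may use account for impersonation; this sketch ignores it.
    for name, value in params.items():
        util.run(["ks", "param", "set", component, name, value], cwd=app_dir)
    util.run(["ks", "apply", env, "-c", component], cwd=app_dir)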
Example #4
def setup_cluster(args):
    """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
    gke = discovery.build("container", "v1")

    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    machine_type = "n1-standard-8"

    cluster_request = {
        "cluster": {
            "name": cluster_name,
            "description": "A GKE cluster for TF.",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType":
                machine_type,
                "oauthScopes": [
                    "https://www.googleapis.com/auth/cloud-platform",
                ],
            },
        }
    }

    if args.accelerators:
        # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
        cluster_request["cluster"]["enableKubernetesAlpha"] = True

        cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
        for accelerator_spec in args.accelerators:
            accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
            cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
                "acceleratorCount":
                accelerator_count,
                "acceleratorType":
                accelerator_type,
            })

    util.create_cluster(gke, project, zone, cluster_request)

    util.configure_kubectl(project, zone, cluster_name)

    util.load_kube_config()
    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    try:
        start = time.time()

        account = util.run_and_output(
            ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
        logging.info("Using GCP account %s", account)
        util.run([
            "kubectl", "create", "clusterrolebinding", "default-admin",
            "--clusterrole=cluster-admin", "--user="******"setup-cluster failed;\n" + (e.output or "")
        raise
    except util.TimeoutError as e:
        t.failure = e.message
        raise
    finally:
        t.time = time.time() - start
        t.name = "setup-cluster"
        t.class_name = "GKE"
        gcs_client = storage.Client(project=args.project)
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
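
The accelerator parsing above expects each --accelerators entry in the form type=count. Distilled into a self-contained sketch (the function name is ours, not from the source):

def parse_accelerators(specs):
    # Turn ["nvidia-tesla-k80=1", ...] into GKE nodeConfig accelerator entries.
    configs = []
    for spec in specs:
        accelerator_type, accelerator_count = spec.split("=", 1)
        configs.append({
            "acceleratorType": accelerator_type,
            "acceleratorCount": accelerator_count,
        })
    return configs

assert parse_accelerators(["nvidia-tesla-k80=1"]) == [
    {"acceleratorType": "nvidia-tesla-k80", "acceleratorCount": "1"}]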
Example #5
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
    "cluster": {
      "name": cluster_name,
      "description": "A GKE cluster for TF.",
      "initialNodeCount": 1,
      "nodeConfig": {
        "machineType": machine_type,
        "oauthScopes": [
          "https://www.googleapis.com/auth/cloud-platform",
        ],
      },
    }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
        "acceleratorCount":
        accelerator_count,
        "acceleratorType":
        accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
      "tfJobImage": args.image,
      "name": "kubeflow-core",
      "namespace": args.namespace,
      "tfJobVersion":  args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
      ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
      "kubectl", "create", "clusterrolebinding", "default-admin",
      "--clusterrole=cluster-admin", "--user="******"v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
        "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct
    # one.
    util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
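
Examples 3 through 5 share the same reporting scaffold: time the step, record any failure on a test_util.TestCase, and always write JUnit XML in the finally block so the result is captured even when the step raises. Distilled into a sketch (do_step, junit_path, and gcs_client are placeholders):

t = test_util.TestCase()
t.name = "my-step"
t.class_name = "GKE"
start = time.time()
try:
    do_step()  # placeholder for the actual setup work
except subprocess.CalledProcessError as e:
    t.failure = "my-step failed;\n" + (e.output or "")
    raise
finally:
    t.time = time.time() - start
    test_util.create_junit_xml_file([t], junit_path, gcs_client)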