Example #1
def test(args):
    """Run the tests."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)

    t = test_util.TestCase()
    try:
        start = time.time()
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        t.failure = "helm test failed;\n" + (e.output or "")
        # Reraise the exception so that the prow job will fail and the test
        # is marked as a failure.
        # TODO(jlewi): It would be better to do this holistically; e.g. by
        # processing all the junit xml files and checking for any failures. This
        # should be more tractable when we migrate off Airflow to Argo.
        raise
    finally:
        t.time = time.time() - start
        t.name = "e2e-test"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
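All of these examples report results through a test_util module that is not shown on this page. Below is a rough sketch of the contract the examples rely on, assuming a plain JUnit XML layout and an optional GCS upload; everything here is inferred from the call sites, not from the actual module:

import xml.etree.ElementTree as ET


class TestCase(object):
    """Hypothetical sketch of the fields the examples set on a test case."""

    def __init__(self, class_name="", name=""):
        self.class_name = class_name
        self.name = name
        self.time = 0
        self.failure = None  # Failure message; None means the case passed.


def create_junit_xml_file(test_cases, output_path, gcs_client=None):
    """Hypothetical sketch: write the cases as junit XML, locally or to GCS."""
    suite = ET.Element("testsuite", tests=str(len(test_cases)))
    for c in test_cases:
        attrs = {"classname": c.class_name, "name": c.name, "time": str(c.time)}
        case = ET.SubElement(suite, "testcase", **attrs)
        if c.failure:
            ET.SubElement(case, "failure", message=c.failure)
    contents = ET.tostring(suite)
    if output_path.startswith("gs://") and gcs_client:
        bucket_name, path = output_path[len("gs://"):].split("/", 1)
        gcs_client.get_bucket(bucket_name).blob(path).upload_from_string(
            contents)
    else:
        with open(output_path, "wb") as hf:
            hf.write(contents)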
Example #2
def teardown_cluster(dag_run=None, ti=None, **_kwargs):
    conf = dag_run.conf
    if not conf:
        conf = {}

    dryrun = bool(conf.get("dryrun", False))

    cluster = ti.xcom_pull("setup_cluster", key="cluster")

    gcs_path = run_path(dag_run.dag_id, dag_run.run_id)

    artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
    logging.info("artifacts_path %s", artifacts_path)

    junit_path = os.path.join(artifacts_path, "junit_teardown.xml")
    logging.info("junit_path %s", junit_path)
    ti.xcom_push(key="cluster", value=cluster)

    args = ["python", "-m", "py.deploy", "teardown"]
    args.append("--cluster=" + cluster)
    args.append("--junit_path=" + junit_path)
    args.append("--project=" + GCB_PROJECT)

    # We want subprocess output to bypass logging module otherwise multiline
    # output is squashed together.
    util.run(args, use_print=True, dryrun=dryrun)
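util.run appears in every example but is defined in the project's util module. A minimal sketch consistent with how it is called here (a list command, optional cwd/env, use_print to bypass the logging module, dryrun to log instead of execute); the real helper almost certainly does more:

import logging
import subprocess


def run(command, cwd=None, env=None, use_print=False, dryrun=False):
    """Hypothetical sketch of util.run: run a command, surface its output."""
    if dryrun:
        logging.info("dryrun: %s", " ".join(command))
        return ""
    output = subprocess.check_output(
        command, cwd=cwd, env=env, stderr=subprocess.STDOUT).decode("utf-8")
    if use_print:
        # Print directly so multiline output isn't squashed by the logging
        # module.
        print(output)
    else:
        logging.info("Output of %s:\n%s", " ".join(command), output)
    return output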
Example #3
def build_container(use_gcb, src_dir, test_dir, project):
    """Build the CRD container.

  Args:
    use_gcb: Boolean indicating whether to build the image with GCB or Docker.
    src_dir: The directory containing the source.
    test_dir: Scratch directory for runner.py.
    project: Project to use.
  Returns:
    image: The URI of the newly built image.
  """
    # Build and push the image
    # We use Google Container Builder because Prow currently doesn't allow using
    # docker build.
    if use_gcb:
        gcb_arg = "--gcb"
    else:
        gcb_arg = "--no-gcb"

    build_info_file = os.path.join(test_dir, "build_info.yaml")
    util.run([
        "./images/tf_operator/build_and_push.py", gcb_arg,
        "--project=" + project, "--registry=gcr.io/mlkube-testing",
        "--output=" + build_info_file
    ],
             cwd=src_dir)

    with open(build_info_file) as hf:
        build_info = yaml.safe_load(hf)

    return build_info["image"]
Example #4
    def run_py_checks(dag_run=None, ti=None, **_kwargs):
        """Run some of the python checks."""

        conf = dag_run.conf
        if not conf:
            conf = {}

        dryrun = bool(conf.get("dryrun", False))

        src_dir = ti.xcom_pull(None, key="src_dir")
        logging.info("src_dir %s", src_dir)

        gcs_path = run_path(dag_run.dag_id, dag_run.run_id)

        artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
        logging.info("artifacts_path %s", artifacts_path)

        junit_path = os.path.join(artifacts_path,
                                  "junit_pychecks{0}.xml".format(command))
        logging.info("junit_path %s", junit_path)

        args = ["python", "-m", "py.py_checks", command]
        args.append("--src_dir=" + src_dir)
        args.append("--junit_path=" + junit_path)
        args.append("--project=" + GCB_PROJECT)

        # We want subprocess output to bypass logging module otherwise multiline
        # output is squashed together.
        util.run(args, use_print=True, dryrun=dryrun)
Example #5
def run(ti, *extra_args, **kwargs):
    # Set the PYTHONPATH
    env = kwargs.get("env", os.environ)
    env = env.copy()

    python_path = set(env.get("PYTHONPATH", "").split(":"))

    # Ensure BOOTSTRAP_DIR isn't on the PYTHONPATH, as that could cause
    # unexpected issues by pulling in the version baked into the container.
    if BOOTSTRAP_DIR in python_path:
        logging.info("Removing %s from PYTHONPATH", BOOTSTRAP_DIR)
        python_path.remove(BOOTSTRAP_DIR)

    src_dir = ti.xcom_pull(None, key="src_dir")

    if not src_dir:
        src_dir = BOOTSTRAP_DIR

    python_path.add(src_dir)

    env["PYTHONPATH"] = ":".join(python_path)

    # We need to delay the import of util because for all steps (except the
    # clone step) we want to use the version checked out from GitHub.
    # But Airflow needs to be able to import the module e2e_tests_daga.py.
    from py import util

    kwargs["env"] = env

    # Printing out the file location of util should help us debug issues
    # with the path.
    logging.info("Using util located at %s", util.__file__)
    util.run(*extra_args, **kwargs)
Example #6
def run_gpu_test(dag_run=None, ti=None, **_kwargs):
    conf = dag_run.conf
    if not conf:
        conf = {}

    cluster = ti.xcom_pull(None, key="cluster")

    src_dir = ti.xcom_pull(None, key="src_dir")

    logging.info("conf=%s", conf)
    artifacts_path = conf.get("ARTIFACTS_PATH",
                              run_path(dag_run.dag_id, dag_run.run_id))
    logging.info("artifacts_path %s", artifacts_path)
    # I think we can only have one underscore in the name for gubernator to
    # work.
    junit_path = os.path.join(artifacts_path, "junit_gpu-tests.xml")
    logging.info("junit_path %s", junit_path)
    ti.xcom_push(key="cluster", value=cluster)

    spec = os.path.join(src_dir, "examples/tf_job_gpu.yaml")
    args = ["python", "-m", "py.test_runner", "test"]
    args.append("--spec=" + spec)
    args.append("--zone=" + ZONE)
    args.append("--cluster=" + cluster)
    args.append("--junit_path=" + junit_path)
    args.append("--project=" + GCB_PROJECT)
    # tf_job_gpu.yaml has the image tag hardcoded so the tag doesn't matter.
    # TODO(jlewi): The example should be a template and we should rebuild and
    # and use the newly built sample container.
    args.append("--image_tag=notag")

    # We want subprocess output to bypass logging module otherwise multiline
    # output is squashed together.
    util.run(args, use_print=True)
Example #7
def run_tests(args):
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): Perhaps we should get a list of submodules and exclude
    # them automatically?
    dir_excludes = ["kubeflow_testing", "vendor"]
    includes = ["*_test.py"]
    test_cases = []

    env = os.environ.copy()
    # TODO(jlewi): Once we switch to using Argo I think we can stop setting
    # the PYTHONPATH here and just inheriting it from the environment.
    # When we use ARGO each step will run in its own pod and we can set the
    # PYTHONPATH environment variable as needed for that pod.
    env["PYTHONPATH"] = (args.src_dir + ":" +
                         os.path.join(args.src_dir, "kubeflow_testing", "py"))

    num_failed = 0
    for root, dirs, files in os.walk(args.src_dir, topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        dirs[:] = [d for d in dirs if d not in dir_excludes]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)

                test_case = test_util.TestCase()
                test_case.class_name = "pytest"
                test_case.name = full_path[len(args.src_dir):]
                start_time = time.time()
                test_cases.append(test_case)
                try:
                    util.run(["python", full_path], cwd=args.src_dir, env=env)
                except subprocess.CalledProcessError:
                    test_case.failure = "{0} failed.".format(test_case.name)
                    num_failed += 1
                finally:
                    test_case.time = time.time() - start_time

    if num_failed:
        logging.error("%s tests failed.", num_failed)
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
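In the source repo a function like run_tests is dispatched from a small argparse entry point. A hedged sketch of that wiring, with flag names matching what run_tests reads (the description string is illustrative):

import argparse


def main():
    parser = argparse.ArgumentParser(description="Run the python unittests.")
    parser.add_argument("--src_dir", required=True,
                        help="Directory containing the source tree to test.")
    parser.add_argument("--junit_path", default="",
                        help="Local or gs:// path for the junit XML file.")
    parser.add_argument("--project", default=None,
                        help="GCP project; needed when --junit_path is gs://.")
    run_tests(parser.parse_args())


if __name__ == "__main__":
    main()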
Example #8
def clone_repo(dag_run=None, ti=None, **_kwargs):  # pylint: disable=too-many-statements
    # Create a temporary directory suitable for checking out and building the
    # code.
    if not dag_run:
        # When running via airflow test dag_run isn't set
        logging.warn("Using fake dag_run")
        dag_run = FakeDagrun()

    logging.info("dag_id: %s", dag_run.dag_id)
    logging.info("run_id: %s", dag_run.run_id)

    conf = dag_run.conf
    if not conf:
        conf = {}
    logging.info("conf=%s", conf)

    # Pick the top level directory to use for this run of the pipeline.
    # This should be a persistent location that is accessible from subsequent
    # tasks; e.g. an NFS share or PD. The environment variable SRC_DIR is used
    # to allow the directory to be specified as part of the deployment.
    run_dir = os.path.join(os.getenv("SRC_DIR", tempfile.gettempdir()),
                           dag_run.dag_id.replace(":", "_"),
                           dag_run.run_id.replace(":", "_"))
    logging.info("Using run_dir %s", run_dir)
    os.makedirs(run_dir)
    logging.info("xcom push: run_dir=%s", run_dir)
    ti.xcom_push(key="run_dir", value=run_dir)

    # Directory where we will clone the src
    src_dir = os.path.join(run_dir, "tensorflow_k8s")
    logging.info("xcom push: src_dir=%s", src_dir)
    ti.xcom_push(key="src_dir", value=src_dir)

    # Make sure pull_number is a string
    pull_number = "{0}".format(conf.get("PULL_NUMBER", ""))
    args = ["python", "-m", "py.release", "clone", "--src_dir=" + src_dir]
    if pull_number:
        commit = conf.get("PULL_PULL_SHA", "")
        args.append("pr")
        args.append("--pr=" + pull_number)
        if commit:
            args.append("--commit=" + commit)
    else:
        commit = conf.get("PULL_BASE_SHA", "")
        args.append("postsubmit")
        if commit:
            args.append("--commit=" + commit)

    util.run(args, use_print=True)
Example #9
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    # TODO(jlewi): In presubmits we probably want to change this so we can
    # pull the changes on a branch. It's not clear whether that's well supported
    # in Ksonnet yet.
    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    # TODO(jlewi): For presubmits how do we pull the package from the desired
    # branch at the desired commit?
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
      with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
        key = json.load(hf)
        apply_command.append("--as=" + key["client_email"])
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
Example #10
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  env["PYTHONPATH"] = args.src_dir

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        # Slice off the src_dir prefix; str.strip removes characters, not a
        # prefix.
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("No lint issues.")


  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)

  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
Example #11
def run_lint(src_dir):
    """Run lint.

  Args:
    src_dir: the directory containing the source.

  Returns:
    success: Boolean indicating success or failure
  """
    try:
        util.run(["./lint.sh"], cwd=src_dir)
    except subprocess.CalledProcessError as e:
        logging.error("Lint checks failed; %s", e)
        return False
    return True
Example #12
def run_lint(args):
    start_time = time.time()
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    dir_excludes = ["vendor"]
    includes = ["*.py"]
    failed_files = []
    rc_file = os.path.join(args.src_dir, ".pylintrc")
    for root, dirs, files in os.walk(args.src_dir, topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        dirs[:] = [d for d in dirs if d not in dir_excludes]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rc_file, full_path],
                             cwd=args.src_dir)
                except subprocess.CalledProcessError:
                    # Slice off the src_dir prefix; str.strip removes
                    # characters, not a prefix.
                    failed_files.append(full_path[len(args.src_dir):])

    if failed_files:
        logging.error("%s files had lint errors.", len(failed_files))
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    test_case = test_util.TestCase()
    test_case.class_name = "pylint"
    test_case.name = "pylint"
    test_case.time = time.time() - start_time
    if failed_files:
        test_case.failure = "Files with lint issues: {0}".format(
            ", ".join(failed_files))

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
Example #13
def create_cluster(gke, name, project, zone):
    """Create the cluster.

  Args:
    gke: Client for the GKE API.
    name: Name for the new cluster.
    project: The project to create the cluster in.
    zone: The zone to create the cluster in.
  """
    cluster_request = {
        "cluster": {
            "name": name,
            "description": "A GKE cluster for testing GPUs with Cloud ML",
            "initialNodeCount": 1,
            "nodeConfig": {
                "machineType": "n1-standard-8",
            },
        }
    }
    request = gke.projects().zones().clusters().create(body=cluster_request,
                                                       projectId=project,
                                                       zone=zone)

    try:
        logging.info("Creating cluster; project=%s, zone=%s, name=%s", project,
                     zone, name)
        response = request.execute()
        logging.info("Response %s", response)
        create_op = wait_for_operation(gke, project, zone, response["name"])
        logging.info("Cluster creation done.\n %s", create_op)

    except errors.HttpError as e:
        logging.error("Exception occured creating cluster: %s, status: %s", e,
                      e.resp["status"])
        # Status appears to be a string.
        if e.resp["status"] == '409':
            # TODO(jlewi): What should we do if the cluster already exists?
            pass
        else:
            raise

    logging.info("Configuring kubectl")
    util.run([
        "gcloud", "--project=" + project, "container", "clusters",
        "--zone=" + zone, "get-credentials", name
    ])
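wait_for_operation is referenced above but not shown. With the discovery-based client used here, a plausible polling loop over the GKE zonal operations endpoint would be (the polling interval is an assumption):

import time


def wait_for_operation(gke, project, zone, op_id, polling_interval=5):
    """Hypothetical sketch: poll a GKE zonal operation until it completes."""
    while True:
        op = gke.projects().zones().operations().get(
            projectId=project, zone=zone, operationId=op_id).execute()
        if op.get("status") == "DONE":
            return op
        time.sleep(polling_interval)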
Example #14
def test(args):
    """Run the tests."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)

    t = test_util.TestCase()
    try:
        start = time.time()
        util.run(["helm", "test", "tf-job"])
    except subprocess.CalledProcessError as e:
        t.failure = "helm test failed;\n" + e.output
    finally:
        t.time = time.time() - start
        t.name = "e2e-test"
        t.class_name = "GKE"
        test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
Example #15
def _push_image(image, latest_image):
    if "gcr.io" in image:
        util.run(["gcloud", "docker", "--", "push", image])
        logging.info("Pushed image: %s", image)

        util.run(["gcloud", "docker", "--", "push", latest_image])
        logging.info("Pushed image: %s", latest_image)

    else:
        util.run(["docker", "push", image])
        logging.info("Pushed image: %s", image)

        util.run(["docker", "push", latest_image])
        logging.info("Pushed image: %s", latest_image)
Example #16
def deploy_and_test(image, test_dir):
    """Deploy and test the CRD.

  Args:
    image: The Docker image for the CRD to use.
    test_dir: The directory where test outputs should be written.

  Returns:
    success: Boolean indicating success or failure
  """

    target = os.path.join("github.com", GO_REPO_OWNER, GO_REPO_NAME,
                          "test-infra", "helm-test")
    util.run(["go", "install", target])

    binary = os.path.join(os.getenv("GOPATH"), "bin", "helm-test")
    try:
        util.run([binary, "--image=" + image, "--output_dir=" + test_dir])
    except subprocess.CalledProcessError as e:
        logging.error("helm-test failed; %s", e)
        return False
    return True
Example #17
def setup_cluster(dag_run=None, ti=None, **_kwargs):
    conf = dag_run.conf
    if not conf:
        conf = {}

    dryrun = bool(conf.get("dryrun", False))

    chart = ti.xcom_pull("build_images", key="helm_chart")

    now = datetime.now()
    cluster = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    logging.info("conf=%s", conf)
    artifacts_path = conf.get("ARTIFACTS_PATH",
                              run_path(dag_run.dag_id, dag_run.run_id))
    logging.info("artifacts_path %s", artifacts_path)

    # Gubernator only recognizes XML files whose name matches
    # junit_[^_]*.xml, which is why it's "setupcluster" and not "setup_cluster".
    junit_path = os.path.join(artifacts_path, "junit_setupcluster.xml")
    logging.info("junit_path %s", junit_path)

    args = ["python", "-m", "py.deploy", "setup"]
    args.append("--cluster=" + cluster)
    args.append("--junit_path=" + junit_path)
    args.append("--project=" + GCB_PROJECT)
    args.append("--chart=" + chart)
    args.append("--zone=" + ZONE)
    args.append("--accelerator=nvidia-tesla-k80=1")
    # We want subprocess output to bypass logging module otherwise multiline
    # output is squashed together.
    util.run(args, use_print=True, dryrun=dryrun)

    values = {
        "cluster": cluster,
    }
    for k, v in six.iteritems(values):
        logging.info("xcom push: %s=%s", k, v)
        ti.xcom_push(key=k, value=v)
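run_path, used throughout these DAG steps to derive a per-run GCS prefix, is defined elsewhere. A sketch consistent with how it is called; the bucket constant is an assumption, and the ":"-to-"_" replacement mirrors how clone_repo sanitizes run_id:

import os

# Assumed bucket for test artifacts; the real constant lives elsewhere.
GCS_BASE = "gs://example-test-artifacts"


def run_path(dag_id, run_id):
    """Hypothetical sketch: GCS prefix for one DAG run's artifacts."""
    return os.path.join(GCS_BASE, dag_id.replace(":", "_"),
                        run_id.replace(":", "_"))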
Example #18
def copy_artifacts(args):
  """Sync artifacts to GCS."""
  job_name = os.getenv("JOB_NAME")

  # GCS layout is defined here:
  # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
  pull_number = os.getenv("PULL_NUMBER")

  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")

  if pull_number:
    output = _get_pr_gcs_dir(args.bucket)

  elif repo_owner:
    # It is a postsubmit job
    output = ("gs://{bucket}/logs/{owner}_{repo}/"
              "{job}/{build}").format(
                  owner=repo_owner, repo=repo_name,
                  job=job_name,
                  build=os.getenv("BUILD_NUMBER"))
  else:
    # It's a periodic job
    output = ("gs://{bucket}/logs/{job}/{build}").format(
        bucket=args.bucket,
        job=job_name,
        build=os.getenv("BUILD_NUMBER"))


  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account", "--key-file=" +
              os.getenv("GOOGLE_APPLICATION_CREDENTIALS")])

  util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])
Example #19
def copy_artifacts(args):
    """Sync artifacts to GCS."""
    job_name = os.getenv("JOB_NAME")

    # GCS layout is defined here:
    # https://github.com/kubernetes/test-infra/tree/master/gubernator#job-artifact-gcs-layout
    pull_number = os.getenv("PULL_NUMBER")

    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")

    output = get_gcs_dir(args.bucket)

    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        logging.info(
            "GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
            "to use service account.")
        # Since a service account is set tell gcloud to use it.
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        ])

    util.run(["gsutil", "-m", "rsync", "-r", args.artifacts_dir, output])
Example #20
def ks_deploy(app_dir, component, params, env=None, account=None):
    """Deploy the specified ksonnet component.

  Args:
    app_dir: The ksonnet directory
    component: Name of the component to deploy.
    params: A dictionary of parameters to set; can be empty but should not be
      None.
    env: (Optional) The environment to use, if none is specified a new one
      is created.
    account: (Optional) The account to use.

  Raises:
    ValueError: If input arguments aren't valid.
  """
    if not component:
        raise ValueError("component can't be None.")

    # TODO(jlewi): It might be better if the test creates the app and uses
    # the latest stable release of the ksonnet configs. That however will cause
    # problems when we make changes to the TFJob operator that require changes
    # to the ksonnet configs. One advantage of checking in the app is that
    # we can modify the files in vendor if needed so that changes to the code
    # and config can be submitted in the same pr.
    now = datetime.datetime.now()
    if not env:
        env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    logging.info("Using app directory: %s", app_dir)

    util.run(["ks", "env", "add", env], cwd=app_dir)

    for k, v in params.items():
        util.run(["ks", "param", "set", "--env=" + env, component, k, v],
                 cwd=app_dir)

    apply_command = ["ks", "apply", env, "-c", component]
    if account:
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)
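A hypothetical invocation, to make the parameter handling concrete; the app directory, component name, parameter, and service account are illustrative, not from the source:

ks_deploy("/tmp/kubeflow-test/app", "kubeflow-core",
          {"namespace": "e2e-test"},
          account="test-sa@my-project.iam.gserviceaccount.com")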
Example #21
def run_test(args):
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

        logging.info("Created job %s in namespaces %s", name, namespace)
        results = tf_job_client.wait_for_job(
            api_client,
            namespace,
            name,
            status_callback=tf_job_client.log_status)

        if results.get("status", {}).get("state", {}).lower() != "succeeded":
            t.failure = "Job {0} in namespace {1} in state {2}".format(
                name, namespace,
                results.get("status", {}).get("state", None))

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check tensorboard is created if its part of the job spec.
        #  2. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to nail down this exception we print out
        # more information about it.
        logging.error("There was a problem running the job; Exception %s", e)
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want to mark the test as
        # failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
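tf_job_client.wait_for_job isn't shown on this page. Against the Kubernetes Python client it could be a polling loop over the TFJob custom resource, roughly as below; the group, version, plural, and terminal states are assumptions about the CRD:

import time

from kubernetes import client as k8s_client


def wait_for_job(api_client, namespace, name, status_callback=None,
                 polling_interval=30):
    """Hypothetical sketch: poll a TFJob custom resource until it finishes."""
    crd_api = k8s_client.CustomObjectsApi(api_client)
    while True:
        # Group/version/plural are assumptions about the TFJob CRD.
        results = crd_api.get_namespaced_custom_object(
            "kubeflow.org", "v1alpha1", namespace, "tfjobs", name)
        if status_callback:
            status_callback(results)
        state = results.get("status", {}).get("state", "")
        if state.lower() in ("succeeded", "failed"):
            return results
        time.sleep(polling_interval)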
Example #22
def build_images(dag_run=None, ti=None, **_kwargs):  # pylint: disable=too-many-statements
    """
  Args:
    dag_run: A DagRun object. This is passed in as a result of setting
      provide_context to true for the operator.
    ti: The Airflow TaskInstance, also provided via provide_context; used
      to publish build info to xcom.
  """
    # Create a temporary directory suitable for checking out and building the
    # code.
    if not dag_run:
        # When running via airflow test dag_run isn't set
        logging.warn("Using fake dag_run")
        dag_run = FakeDagrun()

    logging.info("dag_id: %s", dag_run.dag_id)
    logging.info("run_id: %s", dag_run.run_id)

    run_dir = ti.xcom_pull(None, key="run_dir")
    logging.info("Using run_dir=%s", run_dir)

    src_dir = ti.xcom_pull(None, key="src_dir")
    logging.info("Using src_dir=%s", src_dir)

    gcs_path = run_path(dag_run.dag_id, dag_run.run_id)
    logging.info("gcs_path %s", gcs_path)

    conf = dag_run.conf
    if not conf:
        conf = {}
    logging.info("conf=%s", conf)
    artifacts_path = conf.get("ARTIFACTS_PATH", gcs_path)
    logging.info("artifacts_path %s", artifacts_path)

    # We use a GOPATH that is specific to this run because we don't want
    # interference from different runs.
    newenv = os.environ.copy()
    newenv["GOPATH"] = os.path.join(run_dir, "go")

    # Make sure pull_number is a string
    pull_number = "{0}".format(conf.get("PULL_NUMBER", ""))
    args = ["python", "-m", "py.release", "build", "--src_dir=" + src_dir]

    dryrun = bool(conf.get("dryrun", False))

    build_info_file = os.path.join(gcs_path, "build_info.yaml")
    args.append("--build_info_path=" + build_info_file)
    args.append("--releases_path=" + gcs_path)
    args.append("--project=" + GCB_PROJECT)
    # We want subprocess output to bypass logging module otherwise multiline
    # output is squashed together.
    util.run(args, use_print=True, dryrun=dryrun, env=newenv)

    # Read the output yaml and publish relevant values to xcom.
    if not dryrun:
        gcs_client = storage.Client(project=GCB_PROJECT)
        logging.info("Reading %s", build_info_file)
        bucket_name, build_path = util.split_gcs_uri(build_info_file)
        bucket = gcs_client.get_bucket(bucket_name)
        blob = bucket.blob(build_path)
        contents = blob.download_as_string()
        build_info = yaml.safe_load(contents)
    else:
        build_info = {
            "image": "gcr.io/dryrun/dryrun:latest",
            "commit": "1234abcd",
            "helm_chart": "gs://dryrun/dryrun.latest.",
        }
    for k, v in six.iteritems(build_info):
        logging.info("xcom push: %s=%s", k, v)
        ti.xcom_push(key=k, value=v)
Example #23
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    salt = uuid.uuid4().hex[0:4]

    # Create a new environment for this run
    env = "test-env-{0}".format(salt)

    util.run(["ks", "env", "add", env], cwd=args.app_dir)

    name = None
    namespace = None
    for pair in args.params.split(","):
        k, v = pair.split("=", 1)
        if k == "name":
            name = v

        if k == "namespace":
            namespace = v
        util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
                 cwd=args.app_dir)

    if not name:
        raise ValueError("name must be provided as a parameter.")

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    t.name = os.path.basename(name)

    if not namespace:
        raise ValueError("namespace must be provided as a parameter.")

    start = time.time()

    try:
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                status_callback=tf_job_client.log_status)

            if results.get("status", {}).get("state",
                                             {}).lower() != "succeeded":
                t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                    trial, name, namespace,
                    results.get("status", {}).get("state", None))
                logging.error(t.failure)
                break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            for replica in results.get("spec", {}).get("replicaSpecs", []):
                num_expected += replica.get("replicas", 0)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
                    trial, name, namespace, ", ".join(creation_failures))
                logging.error(t.failure)
                break
            pod_labels = get_labels(name, runtime_id)
            pod_selector = to_selector(pod_labels)

            wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

            tf_job_client.delete_tf_job(api_client, namespace, name)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.error(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to nail down this exception we print out
        # more information about it.
        logging.error("There was a problem running the job; Exception %s", e)
        logging.error(
            "There was a problem running the job; Exception "
            "message: %s", e.message)
        logging.error("Exception type: %s", e.__class__)
        logging.error("Exception args: %s", e.args)
        # We want to catch all exceptions because we want to mark the test as
        # failed.
        t.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, e.message))
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
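get_events, used above to count pod and service create events, is a helper from the same test module. A plausible sketch using a field selector on the involved object's UID (parse_events would then inspect each event's reason and message, which is not reproduced here):

from kubernetes import client as k8s_client


def get_events(api_client, namespace, uid):
    """Hypothetical sketch: list the events targeting the object with uid."""
    core_api = k8s_client.CoreV1Api(api_client)
    events = core_api.list_namespaced_event(
        namespace, field_selector="involvedObject.uid={0}".format(uid))
    return events.items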
Example #24
def build_operator_image(root_dir,
                         registry,
                         project=None,
                         should_push=True,
                         version_tag=None):
    """Build the main docker image for the TFJob CRD.
  Args:
    root_dir: Root directory of the repository.
    registry: The registry to use.
    project: If set it will be built using GCB.
    should_push: Whether to push the image to the registry. Default is True.
    version_tag: Optional tag for the version. If not specified derive
      the tag from the git hash.
  Returns:
    build_info: Dictionary containing information about the build.
  """
    context_dir = tempfile.mkdtemp(prefix="tmpTFJobCrdContext")
    logging.info("context_dir: %s", context_dir)
    if not os.path.exists(context_dir):
        os.makedirs(context_dir)

    # Build the go binaries
    go_path = os.environ["GOPATH"]
    commit = build_and_push_image.GetGitHash(root_dir)

    targets = [
        "github.com/kubeflow/tf-operator/cmd/tf-operator",
        "github.com/kubeflow/tf-operator/test/e2e",
        "github.com/kubeflow/tf-operator/dashboard/backend",
    ]
    for t in targets:
        if t == "github.com/kubeflow/tf-operator/cmd/tf-operator":
            util.run([
                "go", "install", "-ldflags",
                "-X github.com/kubeflow/tf-operator/version.GitSHA={}".format(
                    commit), t
            ])
        util.run(["go", "install", t])

    # Dashboard's frontend:
    # Resolving dashboard's front-end dependencies
    util.run(
        ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "install"])
    # Building dashboard's front-end
    util.run(
        ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "build"])

    # If the release is not done from a Linux machine
    # we need to grab the artifacts from /bin/linux_amd64.
    bin_path = "bin"
    if platform.system() != "Linux":
        bin_path += "/linux_amd64"

    # List of paths to copy relative to root.
    sources = [
        "build/images/tf_operator/Dockerfile",
        "examples/tf_sample/tf_sample/tf_smoke.py",
        os.path.join(go_path, bin_path, "tf-operator"),
        os.path.join(go_path, bin_path, "e2e"),
        os.path.join(go_path, bin_path, "backend"), "dashboard/frontend/build"
    ]

    for s in sources:
        src_path = os.path.join(root_dir, s)
        dest_path = os.path.join(context_dir, os.path.basename(s))
        if os.path.exists(dest_path):
            os.unlink(dest_path)
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dest_path)
        else:
            shutil.copyfile(src_path, dest_path)

    image_base = registry + "/tf_operator"

    if not version_tag:
        logging.info("No version tag specified; computing tag automatically.")
        n = datetime.datetime.now()
        version_tag = n.strftime("v%Y%m%d") + "-" + commit
    logging.info("Using version tag: %s", version_tag)
    image = image_base + ":" + version_tag
    latest_image = image_base + ":latest"

    if project:
        util.run([
            "gcloud", "container", "builds", "submit", context_dir,
            "--tag=" + image, "--project=" + project
        ])

        # Add the latest tag.
        util.run([
            "gcloud", "container", "images", "add-tag", "--quiet", image,
            latest_image
        ])

    else:
        util.run(["docker", "build", "-t", image, context_dir])
        logging.info("Built image: %s", image)

        util.run(["docker", "tag", image, latest_image])

        if should_push:
            _push_image(image, latest_image)

    output = {
        "image": image,
        "commit": commit,
    }
    return output
Example #25
def main():  # pylint: disable=too-many-locals
    logging.getLogger().setLevel(logging.INFO)
    # create the top-level parser
    parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

    parser.add_argument(
        "--test_dir",
        default="",
        type=str,
        help="Directory to use for all the test files. If not set a temporary "
        "directory is created.")

    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")

    parser.add_argument("--project",
                        default=None,
                        type=str,
                        help="The project to use.")

    parser.add_argument(
        "--cluster",
        default=None,
        type=str,
        help=("The name of the cluster. If not set assumes the "
              "script is running in a cluster and uses that cluster."))

    parser.add_argument("--zone",
                        default="us-east1-d",
                        type=str,
                        help="The zone for the cluster.")

    parser.add_argument(
        "--github_token",
        default=None,
        type=str,
        help=
        ("The GitHub API token to use. This is needed since ksonnet uses the "
         "GitHub API and without it we get rate limited. For more info see: "
         "https://github.com/ksonnet/ksonnet/blob/master/docs"
         "/troubleshooting.md"))

    args = parser.parse_args()

    if not args.test_dir:
        logging.info("--test_dir not set; using a temporary directory.")

        now = datetime.datetime.now()
        label = "test_deploy-" + now.strftime(
            "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

        # Create a temporary directory for this test run
        args.test_dir = os.path.join(tempfile.gettempdir(), label)

    if not args.artifacts_dir:
        args.artifacts_dir = args.test_dir
    # Set up a logging file handler so we can upload the log outputs
    # to gubernator.
    root_logger = logging.getLogger()

    test_log = os.path.join(args.artifacts_dir, "logs", "test_deploy.log.txt")
    if not os.path.exists(os.path.dirname(test_log)):
        os.makedirs(os.path.dirname(test_log))

    file_handler = logging.FileHandler(test_log)
    root_logger.addHandler(file_handler)
    # We need to explicitly set the formatter because it will not pick up
    # the BasicConfig.
    formatter = logging.Formatter(
        fmt=("%(levelname)s|%(asctime)s"
             "|%(pathname)s|%(lineno)d| %(message)s"),
        datefmt="%Y-%m-%dT%H:%M:%S")
    file_handler.setFormatter(formatter)
    logging.info("Logging to %s", test_log)

    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        logging.info(
            "GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
            "to use service account.")
        # Since a service account is set tell gcloud to use it.
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        ])
    setup(args)
Example #26
def build_and_push_artifacts(go_dir,
                             src_dir,
                             registry,
                             publish_path=None,
                             gcb_project=None,
                             build_info_path=None):
    """Build and push the artifacts.

  Args:
    go_dir: The GOPATH directory
    src_dir: The root directory where we checked out the repo.
    registry: Docker registry to use.
    publish_path: (Optional) The GCS path where artifacts should be published.
       Set to none to only build locally.
    gcb_project: The project to use with GCB to build docker images.
      If set to none uses docker to build.
    build_info_path: (Optional): GCS location to write YAML file containing
      information about the build.
  """
    # Update the GOPATH to the temporary directory.
    env = os.environ.copy()
    if go_dir:
        env["GOPATH"] = go_dir

    bin_dir = os.path.join(src_dir, "bin")
    if not os.path.exists(bin_dir):
        os.makedirs(bin_dir)

    build_info = build_operator_image(src_dir, registry, project=gcb_project)

    # Copy the chart to a temporary directory because we will modify some
    # of its YAML files.
    chart_build_dir = tempfile.mkdtemp(prefix="tmpTFJobChartBuild")
    shutil.copytree(os.path.join(src_dir, "tf-job-operator-chart"),
                    os.path.join(chart_build_dir, "tf-job-operator-chart"))
    version = build_info["image"].split(":")[-1]
    values_file = os.path.join(chart_build_dir, "tf-job-operator-chart",
                               "values.yaml")
    update_values(values_file, build_info["image"])

    chart_file = os.path.join(chart_build_dir, "tf-job-operator-chart",
                              "Chart.yaml")
    update_chart(chart_file, version)

    # Delete any existing matches because we assume there is only 1 below.
    matches = glob.glob(os.path.join(bin_dir, "tf-job-operator-chart*.tgz"))
    for m in matches:
        logging.info("Delete previous build: %s", m)
        os.unlink(m)

    util.run([
        "helm", "package", "--save=false", "--destination=" + bin_dir,
        "./tf-job-operator-chart"
    ],
             cwd=chart_build_dir)

    matches = glob.glob(os.path.join(bin_dir, "tf-job-operator-chart*.tgz"))

    if len(matches) != 1:
        raise ValueError(
            "Expected 1 chart archive to match but found {0}".format(matches))

    chart_archive = matches[0]

    release_path = version

    targets = [
        os.path.join(release_path, os.path.basename(chart_archive)),
        "latest/tf-job-operator-chart-latest.tgz",
    ]

    if publish_path:
        gcs_client = storage.Client(project=gcb_project)
        bucket_name, base_path = util.split_gcs_uri(publish_path)
        bucket = gcs_client.get_bucket(bucket_name)
        for t in targets:
            blob = bucket.blob(os.path.join(base_path, t))
            gcs_path = util.to_gcs_uri(bucket_name, blob.name)
            if not t.startswith("latest"):
                build_info["helm_chart"] = gcs_path
            if blob.exists() and not t.startswith("latest"):
                logging.warn("%s already exists", gcs_path)
                continue
            logging.info("Uploading %s to %s.", chart_archive, gcs_path)
            blob.upload_from_filename(chart_archive)

        create_latest(bucket, build_info["commit"],
                      util.to_gcs_uri(bucket_name, targets[0]))

    # Always write to the bin dir.
    paths = [os.path.join(bin_dir, "build_info.yaml")]

    if build_info_path:
        paths.append(build_info_path)

    write_build_info(build_info, paths, project=gcb_project)
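update_values and update_chart rewrite the copied chart in place before packaging. Minimal sketches, assuming standard Helm values.yaml/Chart.yaml keys (the image key name is an assumption):

import yaml


def update_values(values_file, image):
    """Hypothetical sketch: point the chart's values.yaml at the new image."""
    with open(values_file) as hf:
        values = yaml.safe_load(hf)
    values["image"] = image  # The key name is an assumption.
    with open(values_file, "w") as hf:
        yaml.safe_dump(values, hf)


def update_chart(chart_file, version):
    """Hypothetical sketch: stamp the packaged chart's version."""
    with open(chart_file) as hf:
        chart = yaml.safe_load(hf)
    chart["version"] = version
    with open(chart_file, "w") as hf:
        yaml.safe_dump(chart, hf)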
Example #27
def build_operator_image(root_dir, registry, project=None, should_push=True):
    """Build the main docker image for the TFJob CRD.
  Args:
    root_dir: Root directory of the repository.
    registry: The registry to use.
    project: If set it will be built using GCB.
    should_push: Whether to push the image to the registry. Default is True.
  Returns:
    build_info: Dictionary containing information about the build.
  """
    context_dir = tempfile.mkdtemp(prefix="tmpTFJobCrdContext")
    logging.info("context_dir: %s", context_dir)
    if not os.path.exists(context_dir):
        os.makedirs(context_dir)

    # Build the go binaries
    go_path = os.environ["GOPATH"]
    commit = build_and_push_image.GetGitHash(root_dir)

    targets = [
        "github.com/tensorflow/k8s/cmd/tf_operator",
        "github.com/tensorflow/k8s/test/e2e",
        "github.com/tensorflow/k8s/dashboard/backend",
    ]
    for t in targets:
        if t == "github.com/tensorflow/k8s/cmd/tf_operator":
            util.run([
                "go", "install", "-ldflags",
                "-X github.com/tensorflow/k8s/version.GitSHA={}".format(
                    commit), t
            ])
        util.run(["go", "install", t])

    # Dashboard's frontend:
    # Resolving dashboard's front-end dependencies
    util.run(
        ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "install"])
    # Building dashboard's front-end
    util.run(
        ["yarn", "--cwd", "{}/dashboard/frontend".format(root_dir), "build"])

    # List of paths to copy relative to root.
    sources = [
        "build/images/tf_operator/Dockerfile",
        os.path.join(go_path, "bin/tf_operator"),
        os.path.join(go_path, "bin/e2e"),
        os.path.join(go_path, "bin/backend"), "dashboard/frontend/build"
    ]

    for s in sources:
        src_path = os.path.join(root_dir, s)
        dest_path = os.path.join(context_dir, os.path.basename(s))
        if os.path.exists(dest_path):
            os.unlink(dest_path)
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dest_path)
        else:
            shutil.copyfile(src_path, dest_path)

    image_base = registry + "/tf_operator"

    n = datetime.datetime.now()
    image = (image_base + ":" + n.strftime("v%Y%m%d") + "-" + commit)
    latest_image = image_base + ":latest"

    if project:
        util.run([
            "gcloud", "container", "builds", "submit", context_dir,
            "--tag=" + image, "--project=" + project
        ])

        # Add the latest tag.
        util.run([
            "gcloud", "container", "images", "add-tag", "--quiet", image,
            latest_image
        ])

    else:
        util.run(["docker", "build", "-t", image, context_dir])
        logging.info("Built image: %s", image)

        util.run(["docker", "tag", image, latest_image])

        if should_push:
            util.run(["gcloud", "docker", "--", "push", image])
            logging.info("Pushed image: %s", image)

            util.run(["gcloud", "docker", "--", "push", latest_image])
            logging.info("Pushed image: %s", latest_image)

    output = {
        "image": image,
        "commit": commit,
    }
    return output
Example #28
def setup(args):
    """Test deploying Kubeflow."""
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    now = datetime.datetime.now()
    run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = run_label

    def run():
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        source = os.path.join(args.test_dir, "src", "kubeflow")
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)

    main_case = test_util.TestCase()
    main_case.class_name = "KubeFlow"
    main_case.name = "deploy-kubeflow"
    try:
        test_util.wrap_test(run, main_case)
    finally:
        # Delete the namespace
        logging.info("Deleting namespace %s", namespace_name)

        # We report teardown as a separate test case because this will help
        # us track down issues with garbage collecting namespaces.
        teardown = test_util.TestCase(main_case.class_name, "teardown")

        def run_teardown():
            core_api = k8s_client.CoreV1Api(api_client)
            core_api.delete_namespace(namespace_name, {})

        try:
            test_util.wrap_test(run_teardown, teardown)
        except Exception as e:  # pylint: disable-msg=broad-except
            logging.error("There was a problem deleting namespace: %s; %s",
                          namespace_name, e.message)
        junit_path = os.path.join(args.artifacts_dir,
                                  "junit_kubeflow-deploy.xml")
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #29
def run_lint(args):
    start_time = time.time()
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): Perhaps we should get a list of submodules and exclude
    # them automatically?
    dir_excludes = [
        "dashboard/frontend/node_modules",
        "kubeflow_testing",
        "test/test-app",
        "vendor",
    ]
    full_dir_excludes = [
        os.path.join(os.path.abspath(args.src_dir), f) for f in dir_excludes
    ]
    includes = ["*.py"]
    failed_files = []
    rc_file = os.path.join(args.src_dir, ".pylintrc")
    for root, dirs, files in os.walk(os.path.abspath(args.src_dir),
                                     topdown=True):
        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        exclude = False
        for e in full_dir_excludes:
            if root.startswith(e):
                exclude = True
                break
        if exclude:
            continue

        dirs[:] = [d for d in dirs]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rc_file, full_path],
                             cwd=args.src_dir)
                except subprocess.CalledProcessError:
                    failed_files.append(full_path[len(args.src_dir):])

    if failed_files:
        failed_files.sort()
        logging.error("%s files had lint errors:\n%s", len(failed_files),
                      "\n".join(failed_files))
    else:
        logging.info("No lint issues.")

    if not args.junit_path:
        logging.info("No --junit_path.")
        return

    test_case = test_util.TestCase()
    test_case.class_name = "pylint"
    test_case.name = "pylint"
    test_case.time = time.time() - start_time
    if failed_files:
        test_case.failure = "Files with lint issues: {0}".format(
            ", ".join(failed_files))

    gcs_client = None
    if args.junit_path.startswith("gs://"):
        gcs_client = storage.Client(project=args.project)

    test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
Example #30
    def run():
        namespace = _setup_test(api_client, namespace_name)
        logging.info("Using namespace: %s", namespace)
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

        # Initialize a ksonnet app.
        app_name = "kubeflow-test"
        util.run([
            "ks",
            "init",
            app_name,
        ], cwd=args.test_dir, use_print=True)

        app_dir = os.path.join(args.test_dir, app_name)

        kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
        util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
                 cwd=app_dir)

        # Install required packages
        packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

        for p in packages:
            util.run(["ks", "pkg", "install", p], cwd=app_dir)

        # Delete the vendor directory and replace with a symlink to the src
        # so that we use the code at the desired commit.
        target_dir = os.path.join(app_dir, "vendor", "kubeflow")

        logging.info("Deleting %s", target_dir)
        shutil.rmtree(target_dir)

        source = os.path.join(args.test_dir, "src", "kubeflow")
        logging.info("Creating link %s -> %s", target_dir, source)
        os.symlink(source, target_dir)

        # Deploy Kubeflow
        util.run([
            "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name
        ],
                 cwd=app_dir)

        # TODO(jlewi): For reasons I don't understand even though we ran
        # configure_kubectl above, if we don't rerun it we get rbac errors
        # when we do ks apply; I think because we aren't using the proper service
        # account. This might have something to do with the way ksonnet gets
        # its credentials; maybe we need to configure credentials after calling
        # ks init?
        if args.cluster:
            util.configure_kubectl(args.project, args.zone, args.cluster)

        apply_command = [
            "ks",
            "apply",
            "default",
            "-c",
            "kubeflow-core",
        ]

        util.run(apply_command, cwd=app_dir)

        # Verify that the TfJob operator is actually deployed.
        tf_job_deployment_name = "tf-job-operator"
        logging.info("Verifying TfJob controller started.")
        util.wait_for_deployment(api_client, namespace.metadata.name,
                                 tf_job_deployment_name)

        # Verify that JupyterHub is actually deployed.
        jupyter_name = "tf-hub"
        logging.info("Verifying TfHub started.")
        util.wait_for_statefulset(api_client, namespace.metadata.name,
                                  jupyter_name)