Beispiel #1
0
    def _create_checkout_task(self, task_template):
        """Build the Argo task that checks out the source repositories.

        Args:
          task_template: Base task template dict to derive the step from.

        Returns:
          A task dict configured to run checkout_repos.sh against
          self.src_root_dir.
        """
        # Determine which repo@ref to check out; fall back to HEAD of
        # kubeflow/testing when the Prow env vars aren't available.
        main_repo = argo_build_util.get_repo_from_prow_env()
        if not main_repo:
            logging.info("Prow environment variables for repo not set")
            main_repo = "kubeflow/testing@HEAD"
        logging.info("Main repository: %s", main_repo)

        task = argo_build_util.deep_copy(task_template)
        task["name"] = "checkout"
        task["container"]["command"] = [
            "/usr/local/bin/checkout_repos.sh",
            "--repos=" + ",".join([main_repo]),
            "--src_dir=" + self.src_root_dir,
        ]
        return task
Beispiel #2
0
def run_papermill_job(
        notebook_path,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image):
    """Generate a K8s job to run a notebook using papermill

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job. If None a unique time/uuid based name is
      generated.
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries
      to infer based on PROW environment variables
    image: The docker image to run the notebook in.

  Raises:
    ValueError: If notebook_path starts with "/" or no repos can be
      determined.
    RuntimeError: If the job fails or never completes.
  """

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        # safe_load: the job spec is plain YAML; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary Python objects.
        job = yaml.safe_load(hf)

    if notebook_path.startswith("/"):
        raise ValueError(
            "notebook_path={0} should not start with /".format(notebook_path))

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    if not repos:
        raise ValueError("Could not get repos from prow environment variable "
                         "and --repos isn't explicitly set")

    # Always check out kubeflow/testing as well; the job relies on its
    # helper scripts and python packages (see PYTHONPATH below).
    repos += ",kubeflow/testing@HEAD"

    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    job["spec"]["template"]["spec"]["containers"][0]["image"] = image

    full_notebook_path = os.path.join("/src", notebook_path)
    job["spec"]["template"]["spec"]["containers"][0]["command"] = [
        "python3", "-m", "kubeflow.examples.notebook_tests.execute_notebook",
        "--notebook_path", full_notebook_path
    ]

    job["spec"]["template"]["spec"]["containers"][0][
        "workingDir"] = os.path.dirname(full_notebook_path)

    # Fix: assign the job name *before* it is used to build the output path
    # below; previously a None name raised a TypeError at
    # os.path.join(prow_path, name + ".html").
    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("notebook-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
    name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # The prow bucket to use for results/artifacts
    prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

    if os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
        # Running under prow
        prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
        logging.info("Prow artifacts dir: %s", prow_dir)
        prow_dir = os.path.join(prow_dir, "artifacts")

        if os.getenv("TEST_TARGET_NAME"):
            prow_dir = os.path.join(prow_dir,
                                    os.getenv("TEST_TARGET_NAME").lstrip("/"))
        prow_bucket, prow_path = util.split_gcs_uri(prow_dir)

    else:
        # Not running under prow; use a time/uuid suffixed path so concurrent
        # local runs don't clobber each other.
        prow_path = "notebook-test" + datetime.datetime.now().strftime(
            "%H%M%S")
        prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
        prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

    prow_path = os.path.join(prow_path, name + ".html")
    output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"
        },
    ]

    logging.info("Notebook will be written to %s", output_gcs)
    util.load_kube_config(persist_config=False)

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    # Download notebook html to artifacts
    logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(NB_BUCKET)
    # NOTE(review): get_blob returns None if the notebook was never written
    # (e.g. the job crashed early); copy_blob would then raise — the failure
    # is still surfaced by the condition checks below.
    blob = bucket.get_blob(prow_path)

    destination_bucket = storage_client.get_bucket(prow_bucket)
    bucket.copy_blob(blob, destination_bucket)

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Beispiel #3
0
    def build(self):
        """Assemble the Argo workflow for the kfctl E2E test.

        Wires up the E2E DAG: checkout -> kfctl-build-deploy ->
        kubeflow-is-ready -> tests sub-dag, plus an optional
        endpoint-is-ready step and a create-pr-symlink step, then builds
        the exit dag and labels every task template.

        Returns:
          The populated workflow dict (also stored on self.workflow).
        """
        self.workflow = self._build_workflow()
        task_template = self._build_task_template()

        #**************************************************************************
        # Checkout

        # create the checkout step
        main_repo = argo_build_util.get_repo_from_prow_env()
        if not main_repo:
            logging.info("Prow environment variables for repo not set")
            main_repo = MAIN_REPO + "@HEAD"
        logging.info("Main repository: %s", main_repo)
        repos = [main_repo]

        repos.extend(EXTRA_REPOS)

        checkout = argo_build_util.deep_copy(task_template)

        checkout["name"] = "checkout"
        checkout["container"]["command"] = [
            "/usr/local/bin/checkout_repos.sh", "--repos=" + ",".join(repos),
            "--src_dir=" + self.src_root_dir
        ]

        argo_build_util.add_task_to_dag(self.workflow, E2E_DAG_NAME, checkout,
                                        [])

        # Change the working directory for all subsequent steps so the
        # pytest invocations below resolve their test files.
        task_template["container"]["workingDir"] = os.path.join(
            self.kfctl_pytest_dir)

        #**************************************************************************
        # Run build_kfctl and deploy kubeflow

        step_name = "kfctl-build-deploy"
        command = [
            "pytest",
            "kfctl_go_test.py",
            # I think -s mean stdout/stderr will print out to aid in debugging.
            # Failures still appear to be captured and stored in the junit file.
            "-s",
            "--config_path=" + self.config_path,
            "--build_and_apply=" + str(self.build_and_apply),
            # Increase the log level so that info level log statements show up.
            # TODO(https://github.com/kubeflow/testing/issues/372): If we
            # set a unique artifacts dir for each workflow with the proper
            # prefix that should work.
            "--log-cli-level=info",
            "--junitxml=" + self.artifacts_dir + "/junit_kfctl-build-test" +
            self.config_name + ".xml",
            # TODO(jlewi) Test suite name needs to be unique based on parameters.
            #
            "-o",
            "junit_suite_name=test_kfctl_go_deploy_" + self.config_name,
            "--app_path=" + self.app_dir,
        ]

        dependences = [checkout["name"]]
        build_kfctl = self._build_step(step_name, self.workflow, E2E_DAG_NAME,
                                       task_template, command, dependences)

        #**************************************************************************
        # Wait for Kubeflow to be ready
        step_name = "kubeflow-is-ready"
        command = [
            "pytest",
            "kf_is_ready_test.py",
            # I think -s mean stdout/stderr will print out to aid in debugging.
            # Failures still appear to be captured and stored in the junit file.
            "-s",
            # TODO(jlewi): We should update kf_is_ready_test to take the config
            # path and then based on the KfDef spec kf_is_ready_test should
            # figure out what to do.
            "--use_basic_auth={0}".format(self.use_basic_auth),
            # TODO(jlewi): We should be using ISTIO always so can we stop
            # setting this
            "--use_istio=true",
            # Increase the log level so that info level log statements show up.
            "--log-cli-level=info",
            "--junitxml=" + os.path.join(
                self.artifacts_dir,
                "junit_kfctl-is-ready-test-" + self.config_name + ".xml"),
            # Test suite name needs to be unique based on parameters
            "-o",
            "junit_suite_name=test_kf_is_ready_" + self.config_name,
            "--app_path=" + self.app_dir,
        ]

        dependences = [build_kfctl["name"]]
        kf_is_ready = self._build_step(step_name, self.workflow, E2E_DAG_NAME,
                                       task_template, command, dependences)

        #**************************************************************************
        # Wait for endpoint to be ready
        if self.test_endpoint:
            step_name = "endpoint-is-ready"
            command = [
                "pytest",
                "endpoint_ready_test.py",
                # I think -s mean stdout/stderr will print out to aid in debugging.
                # Failures still appear to be captured and stored in the junit file.
                "-s",
                # Increase the log level so that info level log statements show up.
                "--log-cli-level=info",
                # Test timeout in seconds.
                "--timeout=1800",
                "--junitxml=" + self.artifacts_dir +
                "/junit_endpoint-is-ready-test-" + self.config_name + ".xml",
                # Test suite name needs to be unique based on parameters
                "-o",
                "junit_suite_name=test_endpoint_is_ready_" + self.config_name,
                "--app_path=" + self.app_dir,
                "--app_name=" + self.app_name,
            ]

            dependencies = [build_kfctl["name"]]
            # NOTE(review): the returned step is unused beyond registration
            # in the DAG by _build_step.
            endpoint_ready = self._build_step(step_name, self.workflow,
                                              E2E_DAG_NAME, task_template,
                                              command, dependencies)

        self._build_tests_dag()

        # Add a task to run the dag
        dependencies = [kf_is_ready["name"]]
        argo_build_util.add_task_only_to_dag(self.workflow, E2E_DAG_NAME,
                                             TESTS_DAG_NAME, TESTS_DAG_NAME,
                                             dependencies)

        #***************************************************************************
        # create_pr_symlink
        #***************************************************************************
        # TODO(jlewi): run_e2e_workflow.py should probably create the PR symlink
        step_name = "create-pr-symlink"
        command = [
            "python",
            "-m",
            "kubeflow.testing.prow_artifacts",
            "--artifacts_dir=" + self.output_dir,
            "create_pr_symlink",
            "--bucket=" + self.bucket,
        ]

        dependences = [checkout["name"]]
        symlink = self._build_step(step_name, self.workflow, E2E_DAG_NAME,
                                   task_template, command, dependences)

        self._build_exit_dag()

        # Set the labels on all templates
        self.workflow = argo_build_util.set_task_template_labels(self.workflow)

        return self.workflow
  def __init__(self, name=None, namespace=None,
               config_path=("https://raw.githubusercontent.com/kubeflow"
                            "/manifests/master/kfdef/kfctl_gcp_iap.yaml"),
               bucket=None,
               test_endpoint=False,
               use_basic_auth=False,
               build_and_apply=False,
               test_target_name=None,
               kf_app_name=None, delete_kf=True,
               extra_repos="",
               **kwargs):
    """Initialize a builder.

    Args:
      name: Name for the workflow. Callers must supply it; it is used to
        construct test_dir and the artifacts directory (a None name would
        fail the string concatenation below).
      namespace: Namespace for the workflow.
      config_path: Path to the KFDef spec file.
      bucket: The bucket to upload artifacts to. If not set use default determined by prow_artifacts.py.
      test_endpoint: Whether to test the endpoint is ready.
      use_basic_auth: Whether to use basic_auth.
      build_and_apply: Whether kfctl should build and apply; stored for use
        by the workflow steps.
      test_target_name: (Optional) Name to use as the test target to group
        tests.
      kf_app_name: (Optional) Name to use for the Kubeflow deployment.
        If not set a unique name is assigned. Only set this if you want to
        reuse an existing deployment across runs.
      delete_kf: (Optional) Whether to run the step that deletes Kubeflow.
        Set to False if you want to leave the deployment up for some reason.
      extra_repos: (Optional) Comma separated repo names with commits, in the
        format <repo_owner>/<repo_name>@<commit>, used to override the
        default repo branches.
      **kwargs: Not used by this initializer; accepted so subclasses can
        forward extra arguments.

    Raises:
      ValueError: If the resulting app_name is longer than 20 characters.
    """
    self.name = name
    self.namespace = namespace
    self.bucket = bucket
    self.config_path = config_path
    self.build_and_apply = build_and_apply
    #****************************************************************************
    # Define directory locations
    #****************************************************************************
    # mount_path is the directory where the volume to store the test data
    # should be mounted.
    self.mount_path = "/mnt/" + "test-data-volume"
    # test_dir is the root directory for all data for a particular test run.
    self.test_dir = self.mount_path + "/" + self.name
    # output_dir is the directory to sync to GCS to contain the output for this
    # job.
    self.output_dir = self.test_dir + "/output"

    # We prefix the artifacts directory with junit because
    # that's what spyglass/prow requires. This ensures multiple
    # instances of a workflow triggered by the same prow job
    # don't end up clobbering each other
    self.artifacts_dir = self.output_dir + "/artifacts/junit_{0}".format(name)

    # source directory where all repos should be checked out
    self.src_root_dir = self.test_dir + "/src"
    # The directory containing the kubeflow/kfctl repo
    self.src_dir = self.src_root_dir + "/kubeflow/kfctl"
    self.kubeflow_dir = self.src_root_dir + "/kubeflow/kubeflow"

    # Directory in kubeflow/kfctl containing the pytest files.
    self.kfctl_pytest_dir = os.path.join(self.src_dir, "py/kubeflow/kfctl/testing/pytests")

    # Top level directories for python testing code in kfctl.
    self.kfctl_py = os.path.join(self.src_dir, "py")

    # Build a string of key value pairs that can be passed to various test
    # steps to allow them to do substitution into different values.
    values = {
      "srcrootdir": self.src_root_dir,
    }

    value_pairs = ["{0}={1}".format(k,v) for k,v in values.items()]
    self.values_str = ",".join(value_pairs)

    # The directory within the kubeflow_testing submodule containing
    # py scripts to use.
    self.kubeflow_testing_py = self.src_root_dir + "/kubeflow/testing/py"

    self.tf_operator_root = os.path.join(self.src_root_dir,
                                         "kubeflow/tf-operator")
    self.tf_operator_py = os.path.join(self.tf_operator_root, "py")

    self.go_path = self.test_dir

    # Name for the Kubeflow app.
    # This needs to be unique for each test run because it is
    # used to name GCP resources
    # TODO(jlewi): Might be good to include pull number or build id in the name
    # Not sure if being non-deterministic is a good idea.
    # A better approach might be to hash the workflow name to generate a unique
    # name dependent on the workflow name. We know there will be one workflow
    # per cluster.
    self.uuid = uuid.uuid4().hex[0:4]

    # Config name is the name of the config file. This is used to give junit
    # files unique names.
    self.config_name = os.path.splitext(os.path.basename(config_path))[0]

    # The class name to label junit files.
    # We want to be able to group related tests in test grid.
    # Test grid allows grouping by target which corresponds to the classname
    # attribute in junit files.
    # So we set an environment variable to the desired class name.
    # The pytest modules can then look at this environment variable to
    # explicitly override the classname.
    # The classname should be unique for each run so it should take into
    # account the different parameters
    if test_target_name:
      self.test_target_name = test_target_name
    else:
      self.test_target_name = self.config_name

    # app_name is the name of the Kubeflow deployment.
    # This needs to be unique per run since we name GCP resources with it.
    self.app_name = kf_app_name
    if not self.app_name:
      self.app_name = "kfctl-" +  self.uuid

    self.delete_kf = delete_kf

    # GCP service accounts can only be max 30 characters. Service account names
    # are generated by taking the app_name and appending suffixes like "user"
    # and "admin"
    if len(self.app_name) > 20:
      # Fix: the two string fragments previously joined as "willlikely"
      # (missing space).
      raise ValueError(("app_name {0} is longer than 20 characters; this will "
                        "likely exceed GCP naming restrictions.").format(
                          self.app_name))
    # Directory for the KF app.
    self.app_dir = os.path.join(self.test_dir, self.app_name)
    self.use_basic_auth = use_basic_auth

    # The name space we create KF artifacts in; e.g. TFJob and notebooks.
    # TODO(jlewi): These should no longer be running the system namespace but
    # should move into the namespace associated with the default profile.
    self.steps_namespace = "kubeflow"
    self.test_endpoint = test_endpoint

    self.kfctl_path = os.path.join(self.src_dir, "bin/kfctl")

    # Fetch the main repo from Prow environment.
    self.main_repo = argo_build_util.get_repo_from_prow_env()

    # extra_repos is a list of comma separated repo names with commits,
    # in the format <repo_owner>/<repo_name>@<commit>,
    # e.g. "kubeflow/tf-operator@12345,kubeflow/manifests@23456".
    # This will be used to override the default repo branches.
    self.extra_repos = []
    if extra_repos:
      self.extra_repos = extra_repos.split(',')

    # Keep track of step names that subclasses might want to list as dependencies
    self._run_tests_step_name = None
    self._test_endpoint_step_name = None
    self._test_endpoint_template_name = None
Beispiel #5
0
  def build(self):
    """Build the Argo workflow for the kubeflow/testing unit-test suite.

    DAG shape: checkout -> make-artifacts-dir -> {py-test, py-lint};
    create-pr-symlink depends on checkout; copy-artifacts runs in the
    exit-handler dag. All task templates are labeled before returning.

    Returns:
      The populated workflow dict.
    """
    workflow = self._build_workflow()
    task_template = self._build_task_template()

    #**************************************************************************
    # Checkout

    # create the checkout step
    main_repo = argo_build_util.get_repo_from_prow_env()
    if not main_repo:
      logging.info("Prow environment variables for repo not set")
      main_repo = "kubeflow/testing@HEAD"
    logging.info("Main repository: %s", main_repo)
    repos = [main_repo]

    checkout = argo_build_util.deep_copy(task_template)

    checkout["name"] = "checkout"
    checkout["container"]["command"] = ["/usr/local/bin/checkout_repos.sh",
                                        "--repos=" + ",".join(repos),
                                        "--src_dir=" + self.src_root_dir]

    argo_build_util.add_task_to_dag(workflow, E2E_DAG_NAME, checkout, [])

    #**************************************************************************
    # Make dir
    # pytest was failing trying to call makedirs. My suspicion is its
    # because the two steps ended up trying to create the directory at the
    # same time and clashing. So we create a separate step to do it.
    mkdir_step = argo_build_util.deep_copy(task_template)

    mkdir_step["name"] = "make-artifacts-dir"
    mkdir_step["container"]["command"] = ["mkdir",
                                          "-p",
                                          self.artifacts_dir]


    argo_build_util.add_task_to_dag(workflow, E2E_DAG_NAME, mkdir_step,
                                    [checkout["name"]])

    #**************************************************************************
    # Run python unittests
    py_tests = argo_build_util.deep_copy(task_template)

    py_tests["name"] = "py-test"
    py_tests["container"]["command"] = ["python",
                                        "-m",
                                        "kubeflow.testing.test_py_checks",
                                        "--artifacts_dir=" + self.artifacts_dir,
                                        # TODO(jlewi): Should we be searching
                                        # the entire py/kubeflow/testing tree?
                                        # Fix: join with a path separator;
                                        # plain "+" concatenation produced
                                        # ".../pykubeflow/tests".
                                        "--src_dir=" + os.path.join(
                                          self.kubeflow_testing_py,
                                          "kubeflow/tests")]


    argo_build_util.add_task_to_dag(workflow, E2E_DAG_NAME, py_tests,
                                    [mkdir_step["name"]])


    #***************************************************************************
    # py lint
    #***************************************************************************
    py_lint = argo_build_util.deep_copy(task_template)

    py_lint["name"] = "py-lint"
    py_lint["container"]["command"] = ["pytest",
                                       "test_py_lint.py",
                                       # I think -s mean stdout/stderr will
                                       # print out to aid in debugging.
                                       # Failures still appear to be captured
                                       # and stored in the junit file.
                                       "-s",
                                       "--src_dir=" + self.kubeflow_testing_py,
                                       "--rcfile=" + os.path.join(
                                         self.testing_src_dir, ".pylintrc"),
                                       # Test timeout in seconds.
                                       "--timeout=500",
                                       "--junitxml=" + self.artifacts_dir +
                                       "/junit_py-lint.xml"]

    py_lint_step = argo_build_util.add_task_to_dag(workflow, E2E_DAG_NAME,
                                                   py_lint,
                                                   [mkdir_step["name"]])

    py_lint_step["container"]["workingDir"] = os.path.join(
      self.testing_src_dir, "py/kubeflow/testing")

    #*****************************************************************************
    # create_pr_symlink
    #****************************************************************************
    # TODO(jlewi): run_e2e_workflow.py should probably create the PR symlink
    symlink = argo_build_util.deep_copy(task_template)

    symlink["name"] = "create-pr-symlink"
    symlink["container"]["command"] = ["python",
                                       "-m",
                                       "kubeflow.testing.prow_artifacts",
                                       "--artifacts_dir=" + self.output_dir,
                                       "create_pr_symlink",
                                       ]

    if self.bucket:
      symlink["container"]["command"].append("--bucket=" + self.bucket)

    argo_build_util.add_task_to_dag(workflow, E2E_DAG_NAME, symlink,
                                    [checkout["name"]])

    #*****************************************************************************
    # Exit handler workflow
    #*****************************************************************************
    copy_artifacts = argo_build_util.deep_copy(task_template)

    copy_artifacts["name"] = "copy-artifacts"
    copy_artifacts["container"]["command"] = ["python",
                                              "-m",
                                              "kubeflow.testing.prow_artifacts",
                                              "--artifacts_dir=" +
                                              self.output_dir,
                                              "copy_artifacts"]

    if self.bucket:
      copy_artifacts["container"]["command"].append("--bucket=" + self.bucket)


    argo_build_util.add_task_to_dag(workflow, EXIT_DAG_NAME, copy_artifacts, [])


    # Set the labels on all templates
    workflow = argo_build_util.set_task_template_labels(workflow)

    return workflow
Beispiel #6
0
def test_xgboost_synthetic(
        record_xml_attribute,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image,
        notebook_artifacts_dir):
    """Generate a K8s Job running the xgboost_synthetic notebook and submit it.

    Args:
      record_xml_attribute: pytest fixture used to set the junit test name.
      name: Name for the K8s job; if None a unique name is generated.
      namespace: Namespace to run the job in.
      repos: Comma separated repos to check out; if None or empty, inferred
        from the Prow environment variables.
      image: Docker image to run the notebook in.
      notebook_artifacts_dir: Local directory to download the rendered
        notebook HTML into.

    Raises:
      ValueError: If no repos can be determined.
      RuntimeError: If the job fails or never completes.
    """
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        # safe_load: the job spec is plain YAML; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary Python objects.
        job = yaml.safe_load(hf)

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits, so infer the repos from the Prow
    # environment variables when not passed explicitly. See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    if not repos:
        # Fail with a clear error instead of the TypeError the string
        # concatenation below would otherwise raise on None.
        raise ValueError("Could not get repos from prow environment variable "
                         "and --repos isn't explicitly set")

    repos += ",kubeflow/testing@HEAD"
    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    nb_bucket = "kubeflow-ci-deployment"
    # NOTE(review): assumes JOB_TYPE and HOSTNAME env vars are set (true under
    # Prow) — os.path.join raises TypeError if either is None; confirm for
    # local runs.
    nb_path = os.path.join("xgboost_synthetic_testing", os.getenv("JOB_TYPE"),
                           os.getenv("HOSTNAME"), "notebook.html")
    output_gcs = util.to_gcs_uri(nb_bucket, nb_path)
    logging.info("Tested notebook will be outputed to: %s", output_gcs)
    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py"
        },
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
    ]
    job["spec"]["template"]["spec"]["containers"][0]["image"] = image
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    # Download notebook html to artifacts
    notebook_artifacts_path = os.path.join(notebook_artifacts_dir,
                                           "notebook.html")
    logging.info("Writing notebook artifact to: %s", notebook_artifacts_path)
    os.makedirs(notebook_artifacts_dir, exist_ok=True)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(nb_bucket)
    blob = bucket.get_blob(nb_path)
    blob.download_to_filename(notebook_artifacts_path)

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
    def build(self):
        """Assemble the Argo workflow that tests the auto-deployed cluster.

        DAG shape: checkout -> get-credentials -> tests sub-dag, with
        create-pr-symlink hanging off checkout; then builds the exit dag
        and labels all templates.

        Returns:
          The populated workflow dict (also stored on self.workflow).
        """
        self.workflow = self._build_workflow()
        task_template = self._build_task_template()

        # **************************************************************************
        # Checkout

        # create the checkout step
        main_repo = argo_build_util.get_repo_from_prow_env()
        if not main_repo:
            logging.info("Prow environment variables for repo not set")
            main_repo = MAIN_REPO + "@HEAD"
        logging.info("Main repository: %s", main_repo)
        repos = [main_repo]

        repos.extend(EXTRA_REPOS)

        #***************************************************************************
        # Checkout the code
        checkout = argo_build_util.deep_copy(task_template)

        checkout["name"] = "checkout"
        checkout["container"]["command"] = [
            "/usr/local/bin/checkout_repos.sh", "--repos=" + ",".join(repos),
            "--src_dir=" + self.src_root_dir
        ]

        argo_build_util.add_task_to_dag(self.workflow, E2E_DAG_NAME, checkout,
                                        [])

        #***************************************************************************
        # Get credentials for the latest auto-deployed cluster

        credentials = argo_build_util.deep_copy(task_template)

        credentials["name"] = "get-credentials"
        credentials["container"]["command"] = [
            "python3",
            "-m",
            "kubeflow.testing."
            "get_kf_testing_cluster",
            "get-credentials",
        ]

        dependencies = [checkout["name"]]
        argo_build_util.add_task_to_dag(self.workflow, E2E_DAG_NAME,
                                        credentials, dependencies)

        #**************************************************************************
        # Run a dag of tests
        self._build_tests_dag()

        # Add a task to run the dag
        dependencies = [credentials["name"]]
        argo_build_util.add_task_only_to_dag(self.workflow, E2E_DAG_NAME,
                                             TESTS_DAG_NAME, TESTS_DAG_NAME,
                                             dependencies)

        # **************************************************************************
        # create_pr_symlink
        # ***************************************************************************
        # TODO(jlewi): run_e2e_workflow.py should probably create the PR symlink
        step_name = "create-pr-symlink"
        command = [
            "python", "-m", "kubeflow.testing.prow_artifacts",
            "--artifacts_dir=" + self.output_dir, "create_pr_symlink"
        ]

        if self.bucket:
            # Fix: pass the bucket via the --bucket flag like the sibling
            # builders do; appending the bare value produced an invalid
            # positional argument.
            command.append("--bucket=" + self.bucket)

        dependencies = [checkout["name"]]
        self._build_step(step_name, self.workflow, E2E_DAG_NAME, task_template,
                         command, dependencies)

        self._build_exit_dag()

        # Set the labels on all templates
        self.workflow = argo_build_util.set_task_template_labels(self.workflow)

        return self.workflow
Beispiel #8
0
def test_xgboost_synthetic(
        record_xml_attribute,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image):
    """Generate a K8s Job running the xgboost_synthetic notebook and submit it.

    Args:
      record_xml_attribute: pytest fixture used to set the junit test name.
      name: Name for the K8s job; if None a unique name is generated.
      namespace: Namespace to run the job in.
      repos: Comma separated repos to check out; if None or empty, inferred
        from the Prow environment variables.
      image: Docker image to run the notebook in.

    Raises:
      ValueError: If no repos can be determined.
      RuntimeError: If the job fails or never completes.
    """
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        # safe_load: the job spec is plain YAML; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary Python objects.
        job = yaml.safe_load(hf)

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits, so infer the repos from the Prow
    # environment variables when not passed explicitly. See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    if not repos:
        # Fail with a clear error instead of the TypeError the string
        # concatenation below would otherwise raise on None.
        raise ValueError("Could not get repos from prow environment variable "
                         "and --repos isn't explicitly set")

    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]
    job["spec"]["template"]["spec"]["containers"][0]["image"] = image
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))