Code Example #1
  def run_simple_tfjob(self, component):
    api_client = k8s_client.ApiClient()

    # Setup the ksonnet app
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)

    # Create the TF job
    ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Wait for the job to either be in Running state or a terminal state
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    results = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    if not tf_job_client.job_succeeded(results):
      self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
      logging.error(self.failure)
      return

    # Check for creation failures.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, results)
    if creation_failures:
      # TODO(jlewi): Starting with
      # https://github.com/kubeflow/tf-operator/pull/646 the number of events
      # no longer seems to match the expected count; it looks like events may
      # be getting combined. For now we just log a warning rather than an
      # error.
      logging.warning(creation_failures)

    # Delete the TFJob.
    tf_job_client.delete_tf_job(
      api_client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
Code Example #2
    def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        if shutdown_policy == "worker":
            tf_job_client.terminate_replicas(api_client, self.namespace,
                                             self.name, "worker", 1)
        else:
            tf_job_client.terminate_replicas(api_client, self.namespace,
                                             self.name, "chief", 1)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Code Example #3
    def run_distributed_training_job(self, component):
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # Check for creation failures.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, results)
        if creation_failures:
            logging.warning(creation_failures)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Code Example #4
def test_jupyter(record_xml_attribute, env, namespace):
  """Test the jupyter notebook.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    env: ksonnet environment.
    namespace: namespace to run in.
  """
  util.set_pytest_junit(record_xml_attribute, "jupyter_test")

  app_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if app_credentials:
    logging.info("Activate service account")
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + app_credentials
    ])

  # util.load_kube_config appears to hang on python3
  kube_config.load_kube_config()
  api_client = k8s_client.ApiClient()
  host = api_client.configuration.host
  logging.info("Kubernetes master: %s", host)
  master = host.rsplit("/", 1)[-1]

  this_dir = os.path.dirname(__file__)
  app_dir = os.path.join(this_dir, "test_app")

  ks_cmd = ks_util.get_ksonnet_cmd(app_dir)

  name = "jupyter-test"
  service = "jupyter-test"
  component = "jupyter"
  params = ""
  ks_util.setup_ks_app(app_dir, env, namespace, component, params)

  util.run([ks_cmd, "apply", env, "-c", component], cwd=app_dir)
  conditions = ["Running"]
  results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
                                       namespace, name, conditions)

  logging.info("Result of CRD:\n%s", results)

  # We proxy the request through the APIServer so that we can connect
  # from outside the cluster.
  url = ("https://{master}/api/v1/namespaces/{namespace}/services/{service}:80"
         "/proxy/default/jupyter/lab?").format(
             master=master, namespace=namespace, service=service)
  logging.info("Request: %s", url)
  r = send_request(url, verify=False)

  if r.status_code != requests.codes.OK:
    msg = "Request to {0} exited with status code: {1} and content: {2}".format(
        url, r.status_code, r.content)
    logging.error(msg)
    raise RuntimeError(msg)
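
For illustration, with a hypothetical API server host of 35.192.0.1 and namespace kubeflow-test (both made up), the proxied URL built by this test would look like:

  https://35.192.0.1/api/v1/namespaces/kubeflow-test/services/jupyter-test:80/proxy/default/jupyter/lab?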
Code Example #5
    def test_invalid_tfjob_spec(self):
        api_client = k8s_client.ApiClient()
        component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        logging.info("Wait for conditions Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)

        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        last_condition = results.get("status", {}).get("conditions", [{}])[-1]
        if last_condition.get("type", "").lower() != "failed":
            self.failure = "Job {0} in namespace {1} did not fail; status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        pattern = ".*the spec is invalid.*"
        condition_message = last_condition.get("message", "")
        if not re.match(pattern, condition_message):
            self.failure = "Condition message {0} did not match pattern {1}".format(
                condition_message, pattern)
            logging.error(self.failure)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Code Example #6
File: deploy.py  Project: yunll/tf-operator
def ks_deploy(app_dir, component, params, env=None, account=None):
  """Deploy the specified ksonnet component.

  Args:
    app_dir: The ksonnet directory
    component: Name of the component to deploy.
    params: A dictionary of parameters to set; can be empty but should not be
      None.
    env: (Optional) The environment to use; if none is specified a new one
      is created.
    account: (Optional) The account to use.

  Raises:
    ValueError: If input arguments aren't valid.
  """
  if not component:
    raise ValueError("component can't be None.")

  # TODO(jlewi): It might be better if the test creates the app and uses
  # the latest stable release of the ksonnet configs. That however will cause
  # problems when we make changes to the TFJob operator that require changes
  # to the ksonnet configs. One advantage of checking in the app is that
  # we can modify the files in vendor if needed so that changes to the code
  # and config can be submitted in the same pr.
  now = datetime.datetime.now()
  if not env:
    env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  logging.info("Using app directory: %s", app_dir)

  ks_cmd = ks_util.get_ksonnet_cmd(app_dir)
  logging.info("Using ksonnet cmd: %s", ks_cmd)

  try:
    util.run([ks_cmd, "env", "add", env], cwd=app_dir)
  except subprocess.CalledProcessError as e:
    if not re.search(".*environment.*already exists.*", e.output):
      raise

  for k, v in params.items():
    util.run([ks_cmd, "param", "set", "--env=" + env, component, k, v],
             cwd=app_dir)

  apply_command = [ks_cmd, "apply", env, "-c", component]
  if account:
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)
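
A hypothetical invocation of ks_deploy based on the docstring above; the app directory, component name, and parameter values are illustrative only and not taken from the original project:

  # Deploys the "workflows" component into a freshly created e2e-* environment
  # because no env is passed.
  params = {"name": "tfjob-e2e", "namespace": "kubeflow-test-infra"}
  ks_deploy("/path/to/ks-app", "workflows", params, env=None, account=None)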
Code Example #7
File: tfjob_test.py  Project: ykv001/examples
  def __init__(self, args):
    namespace, name, env = test_runner.parse_runtime_params(args)
    self.app_dir = args.app_dir

    if not self.app_dir:
      self.app_dir = os.path.join(os.path.dirname(__file__), "..",
                                  "ks_app")
      self.app_dir = os.path.abspath(self.app_dir)
      logging.info("--app_dir not set defaulting to: %s", self.app_dir)

    self.env = env
    self.namespace = namespace
    self.params = args.params
    self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    super(TFJobTest, self).__init__(class_name="TFJobTest", name=name)
Code Example #8
    def test_invalid_tfjob_spec(self):
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()
        component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

        # Setup the ksonnet app
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        try:
            util.run([ks_cmd, "apply", self.env, "-c", component],
                     cwd=self.app_dir)
        except subprocess.CalledProcessError as e:
            if "invalid: spec.tfReplicaSpecs: Required value" in e.output:
                logging.info("Created job failed which is expected. Reason %s",
                             e.output)
            else:
                self.failure = "Job {0} in namespace {1} failed because {2}".format(
                    self.name, self.namespace, e.output)
                logging.error(self.failure)
Code Example #9
    def test_tfjob_and_verify_runconfig(self):
        api_client = k8s_client.ApiClient()
        masterHost = api_client.configuration.host
        component = COMPONENT_NAME + "_" + self.tfjob_version

        # Setup the ksonnet app
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        num_ps = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
            "PS", {}).get("replicas", 0)
        num_workers = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
            "Worker", {}).get("replicas", 0)
        verify_runconfig(masterHost, self.namespace, self.name, "chief",
                         num_ps, num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "worker",
                         num_ps, num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "ps", num_ps,
                         num_workers)
        verify_runconfig(masterHost, self.namespace, self.name, "evaluator",
                         num_ps, num_workers)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Code Example #10
File: run_e2e_workflow.py  Project: zabbasi/testing
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the master branch
        common_ancestor = util.run(["git", "merge-base", "HEAD", "master"],
                                   cwd=os.path.join(args.repos_dir, repo_owner,
                                                    repo_name))
        diff_command = ["git", "diff", "--name-only", common_ancestor]
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type == "presubmit" or job_type == "postsubmit":
        changed_files = util.run(diff_command,
                                 cwd=os.path.join(args.repos_dir, repo_owner,
                                                  repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)
    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too large.
        # Workflow name should not be more than 63 characters because it's used
        # as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

        # Print ksonnet version
        util.run([ks_cmd, "version"])

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and not job_type in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if any files
        # modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit, and if
        # the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.", w.name,
                w.include_dirs)
            continue

        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since Prow should
        # already vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)
        # Create a new environment for this run
        env = workflow_name

        util.run(
            [ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
            cwd=w.app_dir)

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "name",
            workflow_name
        ],
                 cwd=w.app_dir)

        # Set the prow environment variables.
        prow_env = []

        names = [
            "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
            "REPO_NAME"
        ]
        names.sort()
        for v in names:
            if not os.getenv(v):
                continue
            prow_env.append("{0}={1}".format(v, os.getenv(v)))

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
            ",".join(prow_env)
        ],
                 cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
            get_namespace(args)
        ],
                 cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
            args.bucket
        ],
                 cwd=w.app_dir)
        if args.release:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "versionTag",
                os.getenv("VERSION_TAG")
            ],
                     cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it easier to verify in
        # the unittest.
        param_names = sorted(w.params.keys())
        for k in param_names:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])
            ],
                     cwd=w.app_dir)

        # For debugging print out the manifest
        util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
        util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.exception("Time out waiting for Workflows %s to finish",
                          ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                         "build-log.txt"))

    return success
Code Example #11
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
    # for a description of the injected environment variables.
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    base_branch_name = os.getenv("PULL_BASE_REF")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the base branch
        cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

        _ = util.run([
            "git", "fetch", "origin",
            base_branch_name + ":refs/remotes/origin/" + base_branch_name
        ],
                     cwd=cloned_repo_dir)

        diff_command = ["git", "diff", "--name-only"]
        diff_branch = "remotes/origin/{}".format(base_branch_name)
        try:
            common_ancestor = util.run(
                ["git", "merge-base", "HEAD", diff_branch],
                cwd=cloned_repo_dir)
            diff_command.append(common_ancestor)
        except subprocess.CalledProcessError as e:
            logging.warning(
                "git merge-base failed; see "
                "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                "will be computed against the current master and "
                "therefore files not changed in the PR might be "
                "considered when determining which tests to trigger")
            diff_command.append(diff_branch)

    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(diff_command,
                                 cwd=os.path.join(args.repos_dir, repo_owner,
                                                  repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)
    workflows = []
    config = {}
    if args.config_file:
        config, new_workflows = parse_config_file(args.config_file,
                                                  args.repos_dir)
        workflows.extend(new_workflows)

    # Add any paths to the python path
    extra_py_paths = []
    for p in config.get("python_paths", []):
        # Assume that python_paths are in the format $REPO_OWNER/$REPO_NAME/path.
        # We need to ensure that the repo is checked out if it is different from
        # the current one and is not kubeflow/testing (which is already checked
        # out).
        segments = p.split("/")
        if ((segments[0] != repo_owner or segments[1] != repo_name)
                and not p.startswith("kubeflow/testing")):
            logging.info("Need to clone %s/%s", segments[0], segments[1])
            util.clone_repo(
                os.path.join(args.repos_dir, segments[0], segments[1]),
                segments[0], segments[1])

        path = os.path.join(args.repos_dir, p)
        extra_py_paths.append(path)

    kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
    if kf_test_path not in extra_py_paths:
        logging.info("Adding %s to extra python paths", kf_test_path)
        extra_py_paths.append(kf_test_path)

    logging.info("Extra python paths: %s", ":".join(extra_py_paths))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:  # pylint: disable=too-many-nested-blocks
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too large.
        # Workflow name should not be more than 63 characters because it's used
        # as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and not job_type in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if any files
        # modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit, and if
        # the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.", w.name,
                w.include_dirs)
            continue

        if job_type == "presubmit":
            # When not running under prow we might not set all environment variables
            if os.getenv("PULL_NUMBER"):
                workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            if os.getenv("PULL_PULL_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

        elif job_type == "postsubmit":
            if os.getenv("PULL_BASE_SHA"):
                workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        # Append the last 4 digits of the build number
        if os.getenv("BUILD_NUMBER"):
            workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging, since Prow should
        # already vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)
        workflow_names.append(workflow_name)

        # check if ks workflow and run
        if w.app_dir:
            ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

            # Print ksonnet version
            util.run([ks_cmd, "version"])

            # Create a new environment for this run
            env = workflow_name

            util.run([
                ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)
            ],
                     cwd=w.app_dir)

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "name",
                workflow_name
            ],
                     cwd=w.app_dir)

            # Set the prow environment variables.
            prow_env = []

            names = [
                "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
                "REPO_NAME"
            ]
            names.sort()
            for v in names:
                if not os.getenv(v):
                    continue
                prow_env.append("{0}={1}".format(v, os.getenv(v)))

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "prow_env", ",".join(prow_env)
            ],
                     cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "namespace",
                get_namespace(args)
            ],
                     cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
                args.bucket
            ],
                     cwd=w.app_dir)
            if args.release:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component,
                    "versionTag",
                    os.getenv("VERSION_TAG")
                ],
                         cwd=w.app_dir)

            # Set any extra params. We do this in alphabetical order to make it easier to verify in
            # the unittest.
            param_names = sorted(w.params.keys())
            for k in param_names:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component, k,
                    "{0}".format(w.params[k])
                ],
                         cwd=w.app_dir)

            # For debugging print out the manifest
            util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
            util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)
        else:
            w.kwargs["name"] = workflow_name
            w.kwargs["namespace"] = get_namespace(args)

            if TEST_TARGET_ARG_NAME not in w.kwargs:
                w.kwargs[TEST_TARGET_ARG_NAME] = w.name
                logging.info(
                    "Workflow %s doesn't set arg %s; defaulting to %s", w.name,
                    TEST_TARGET_ARG_NAME, w.kwargs[TEST_TARGET_ARG_NAME])

            # TODO(https://github.com/kubeflow/testing/issues/467): We shell out
            # to e2e_tool in order to dump the Argo workflow to a file which we
            # then reimport. We do this because importing the py_func module
            # appears to break when we have to dynamically adjust sys.path to
            # insert new paths. Setting PYTHONPATH before launching python,
            # however, appears to work, which is why we shell out to e2e_tool.
            command = [
                "python", "-m", "kubeflow.testing.e2e_tool", "show", w.py_func
            ]
            for k, v in w.kwargs.items():
                # The fire module turns underscores in parameter names into
                # hyphens, so convert them here before passing the args.
                command.append("--{0}={1}".format(k.replace("_", "-"), v))

            with tempfile.NamedTemporaryFile(delete=False) as hf:
                workflow_file = hf.name

            command.append("--output=" + hf.name)
            env = os.environ.copy()
            env["PYTHONPATH"] = ":".join(extra_py_paths)
            util.run(command, env=env)

            with open(workflow_file) as hf:
                wf_result = yaml.safe_load(hf)

            group, version = wf_result['apiVersion'].split('/')
            k8s_co = k8s_client.CustomObjectsApi()
            workflow_name = wf_result["metadata"]["name"]
            py_func_result = k8s_co.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=wf_result["metadata"]["namespace"],
                plural='workflows',
                body=wf_result)
            logging.info("Created workflow:\n%s",
                         yaml.safe_dump(py_func_result))

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    workflow_success = False
    workflow_phase = {}
    workflow_status_yamls = {}
    results = []
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        workflow_success = True
    except util.ExceptionWithWorkflowResults as e:
        # We explicitly log any exceptions so that they will be captured in the
        # build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        results = e.workflow_results
        raise
    finally:
        prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts_dir, "build-log.txt"))

        # Record the phase and status of each workflow.
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            workflow_status_yamls[name] = yaml.safe_dump(
                r, default_flow_style=False)
            if phase != "Succeeded":
                workflow_success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)

        # Upload workflow status to GCS.
        for wf_name, wf_status in workflow_status_yamls.items():
            util.upload_to_gcs(
                wf_status,
                os.path.join(prow_artifacts_dir,
                             '{}.yaml'.format(wf_name)))

        all_tests_success = prow_artifacts.finalize_prow_job(
            args.bucket, workflow_success, workflow_phase, ui_urls)

    return all_tests_success
Code Example #12
    def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)
            return

        # All pods are deleted.
        if clean_pod_policy == "All":
            pod_labels = tf_job_client.get_labels(self.name)
            pod_selector = tf_job_client.to_selector(pod_labels)
            k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                                 pod_selector)
        # Only running pods (PS) are deleted, completed pods are not.
        elif clean_pod_policy == "Running":
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Chief", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Worker", ["Succeeded"])
            pod_labels = tf_job_client.get_labels(self.name, "PS")
            pod_selector = tf_job_client.to_selector(pod_labels)
            k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                                 pod_selector)
        # No pods are deleted.
        elif clean_pod_policy == "None":
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Chief", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "Worker", ["Succeeded"])
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, "PS", ["Running"])

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Code Example #13
    def run_tfjob_with_replica_restart_policy(self, component,
                                              replica_restart_policy,
                                              exit_code):
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        if replica_restart_policy == "Always" and exit_code == 0:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                True)

        elif replica_restart_policy == "Always" and exit_code == 1:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                True)

        elif replica_restart_policy == "OnFailure" and exit_code == 1:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                True)

        elif replica_restart_policy == "OnFailure" and exit_code == 0:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                False)

        elif replica_restart_policy == "Never" and exit_code == 1:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                False)

        elif replica_restart_policy == "Never" and exit_code == 0:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                False)

        elif replica_restart_policy == "ExitCode" and exit_code == 1:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                False)

        else:
            res = tf_job_client.terminate_and_verify_start_time(
                api_client, self.namespace, self.name, "ps", 0, exit_code,
                True)

        if res is False:
            self.failure = "Job {0} in namespace {1} with restart policy {2} failed test \
        with exit_code {3}".format(self.name, self.namespace,
                                   replica_restart_policy, exit_code)
            logging.error(self.failure)
            return

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
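
Every branch of the elif chain above calls terminate_and_verify_start_time with identical arguments apart from the expected-restart flag, so the per-policy expectations could also be written as a lookup table. A minimal, hypothetical refactoring of that block (not part of the original test):

        # (replica_restart_policy, exit_code) -> whether a restart is expected;
        # the default True mirrors the catch-all else branch above.
        expected_restart = {
            ("Always", 0): True,
            ("Always", 1): True,
            ("OnFailure", 1): True,
            ("OnFailure", 0): False,
            ("Never", 1): False,
            ("Never", 0): False,
            ("ExitCode", 1): False,
        }.get((replica_restart_policy, exit_code), True)
        res = tf_job_client.terminate_and_verify_start_time(
            api_client, self.namespace, self.name, "ps", 0, exit_code,
            expected_restart)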
Code Example #14
    def test_pod_names(self):
        api_client = k8s_client.ApiClient()
        component = COMPONENT_NAME + "_" + self.tfjob_version
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                             self.params)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        job_specs = extract_job_specs(
            results.get("spec", {}).get("tfReplicaSpecs", {}))
        expected_pod_names = []
        for replica_type, replica_num in job_specs.items():
            logging.info("job_type = %s, replica = %s", replica_type,
                         replica_num)
            for i in range(replica_num):
                expected_pod_names.append(
                    POD_NAME_FORMAT.format(name=self.name,
                                           replica=replica_type,
                                           index=i))
        expected_pod_names = set(expected_pod_names)
        actual_pod_names = tf_job_client.get_pod_names(api_client,
                                                       self.namespace,
                                                       self.name)

        # We are not able to guarantee that pods selected by namespace and job
        # name belong to this test run only. Therefore we only do a partial
        # check, i.e. make sure the expected set of pod names is contained in
        # the selected pod names.
        if not (expected_pod_names & actual_pod_names) == expected_pod_names:
            msg = "Actual pod names doesn't match. Expected: {0} Actual: {1}".format(
                str(expected_pod_names), str(actual_pod_names))
            logging.error(msg)
            raise RuntimeError(msg)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)
        # Wait for the job to complete.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, results.get("status", {}))
            logging.error(self.failure)

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
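
Since Python sets support subset tests directly, the partial check in Code Example #14 could also be written with issubset; a minimal, hypothetical variant of that block:

        if not expected_pod_names.issubset(actual_pod_names):
            msg = "Expected pod names missing. Expected: {0} Actual: {1}".format(
                str(expected_pod_names), str(actual_pod_names))
            logging.error(msg)
            raise RuntimeError(msg)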