Exemple #1
0
    def download_coco_dataset(node_hostname,
                              namespace="default",
                              pvc_name=None,
                              mirror_base_url=None,
                              client_cert=None):
        """
        Downloads the COCO dataset into a PVC of the cluster

        Args:
            node_hostname: Hostname of the node where the download pod will be executed.
            namespace: Name of the namespace in which the resources will be created.
            pvc_name: Name of the PVC that will be created to store the dataset files.
            mirror_base_url: Optional base URL where to fetch the dataset
            client_cert: Optional path to the client cert to use for accessing the base URL.

        Returns:
            The result of PlaybookRun for the "benchmarking_deploy_coco_dataset" playbook.
        """
        opts = {
            "benchmarking_node_hostname": node_hostname,
            "benchmarking_namespace": namespace,
        }
        if pvc_name is not None:
            # Store the bare value: a trailing comma on this assignment would
            # wrap the name in a 1-tuple instead of passing the string itself.
            opts["benchmarking_coco_dataset_pvc_name"] = pvc_name
            print(f"Using '{pvc_name}' as PVC name.")

        if mirror_base_url is not None:
            opts["benchmarking_coco_dataset_mirror_base_url"] = mirror_base_url
            print(f"Using '{mirror_base_url}' as mirror base URL.")

        if client_cert is not None:
            opts["benchmarking_coco_dataset_client_cert"] = client_cert
            print(f"Using '{client_cert}' as client certificate.")

        return PlaybookRun("benchmarking_deploy_coco_dataset", opts)
Exemple #2
0
    def get_csv_version():
        """
        Fetch the version of the GPU Operator currently installed from OLM.

        The version is stored in the 'ARTIFACT_EXTRA_LOGS_DIR' artifacts directory.

        Returns:
            The result of PlaybookRun for the "gpu_operator_get_csv_version" playbook.
        """
        playbook_name = "gpu_operator_get_csv_version"
        return PlaybookRun(playbook_name)
Exemple #3
0
    def capture_environment():
        """
        Capture the cluster environment.

        This command takes no arguments.

        Returns:
            The result of PlaybookRun for the "cluster_capture_environment" playbook.
        """
        playbook_name = "cluster_capture_environment"
        return PlaybookRun(playbook_name)
Exemple #4
0
    def upgrade_to_image(image):
        """
        Upgrade the cluster to the given image.

        Args:
            image: The image to upgrade the cluster to

        Returns:
            The result of PlaybookRun for the "cluster_upgrade_to_image" playbook.
        """
        upgrade_vars = {"cluster_upgrade_image": image}
        return PlaybookRun("cluster_upgrade_to_image", upgrade_vars)
Exemple #5
0
    def bundle_from_commit(
        git_repo,
        git_ref,
        quay_push_secret,
        quay_image_name,
        tag_uid=None,
        namespace=None,
        with_validator=False,
        with_driver=False,
        publish_to_quay=False
    ):
        """
        Build an image of the GPU Operator from sources (<git repository> <git reference>)
        and push it to quay.io <quay_image_name>:operator_bundle_gpu-operator-<gpu_operator_image_tag_uid>
        using the <quay_push_secret> credentials.

        Example parameters - https://github.com/NVIDIA/gpu-operator.git master /path/to/quay_secret.yaml quay.io/org/image_name

        See 'oc get imagestreamtags -n gpu-operator-ci -oname' for the tag-uid to reuse.

        Args:
            git_repo: Git repository URL to generate bundle of
            git_ref: Git ref to bundle
            quay_push_secret: A Kube Secret YAML file with `.dockerconfigjson` data and type kubernetes.io/dockerconfigjson
            quay_image_name: The quay repo to push to
            tag_uid: Optional image tag suffix to use. A random 8-hex-digit value is generated when omitted.
            namespace: Optional namespace to use to deploy the GPU Operator; only forwarded when given (documented default: nvidia-gpu-operator)
            with_validator: Optional flag to enable building the validator image (default: false)
            with_driver: Optional flag to enable building the driver image (default: false)
            publish_to_quay: Optional flag to publish the full bundle (including images) to Quay.io (default: false)

        Returns:
            The result of PlaybookRun for the "gpu_operator_bundle_from_commit" playbook.
        """
        tag_uid = secrets.token_hex(4) if tag_uid is None else tag_uid

        def as_flag(value):
            # Flags are forwarded as "y" (enabled) or "" (disabled).
            if not value:
                return ""
            if isinstance(value, bool):
                # A bool that reaches this point can only be True.
                return "y"
            # Textual negatives count as disabled; anything else enables the flag.
            if str(value).lower() in ("false", "n", "no"):
                return ""
            return "y"

        driver_flag = as_flag(with_driver)
        validator_flag = as_flag(with_validator)
        publish_flag = as_flag(publish_to_quay)

        bundle_vars = {
            "gpu_operator_git_repo": git_repo,
            "gpu_operator_git_ref": git_ref,
            "gpu_operator_image_tag_uid": tag_uid,
            "gpu_operator_commit_quay_push_secret": quay_push_secret,
            "gpu_operator_commit_quay_image_name": quay_image_name,
            "gpu_operator_with_driver": driver_flag,
            "gpu_operator_with_validator": validator_flag,
            "gpu_operator_publish_to_quay": publish_flag,
        }

        if namespace is not None:
            bundle_vars["gpu_operator_target_namespace"] = namespace

        return PlaybookRun("gpu_operator_bundle_from_commit", bundle_vars)
Exemple #6
0
    def test_in_cluster(pem_key):
        """
        Test a given PEM entitlement key on a cluster.

        Args:
            pem_key: The PEM entitlement key to test

        Returns:
            The result of PlaybookRun for the "entitlement_test_in_cluster" playbook.
        """
        entitlement_vars = {"entitlement_pem": pem_key}
        return PlaybookRun("entitlement_test_in_cluster", entitlement_vars)
Exemple #7
0
    def test_in_podman(pem_key):
        """
        Test a given PEM entitlement key using a podman container.

        Args:
            pem_key: The PEM entitlement key to test

        Returns:
            The result of PlaybookRun for the "entitlement_test_in_podman" playbook.
        """
        entitlement_vars = {"entitlement_pem": pem_key}
        return PlaybookRun("entitlement_test_in_podman", entitlement_vars)
Exemple #8
0
 def deploy_cluster_policy():
     """
     Create the ClusterPolicy from the OLM ClusterServiceVersion.

     Returns:
         The result of PlaybookRun for the "gpu_operator_deploy_from_operatorhub"
         playbook, run with the "pre-deployed" deployment source.
     """
     print("Creating the ClusterPolicy from the CSV")
     policy_vars = {"gpu_operator_deploy_from": "pre-deployed"}
     return PlaybookRun("gpu_operator_deploy_from_operatorhub", policy_vars)
Exemple #9
0
    def deploy_from_bundle(bundle, namespace):
        """
        Deploy the GPU Operator from a bundle.

        Args:
            bundle: Either a bundle OCI image or "master" to deploy the latest bundle
            namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen (except 'openshift-operators'). Suggested namespace is: nvidia-gpu-operator.

        Returns:
            The result of PlaybookRun for the "gpu_operator_deploy_from_operatorhub" playbook.
        """
        bundle_vars = {
            "gpu_operator_deploy_from": "bundle",
            "gpu_operator_target_namespace": namespace,
        }

        if bundle == 'master':
            # No explicit bundle image is forwarded for the master bundle.
            print("Deploying the GPU Operator from OperatorHub using the master bundle")
        else:
            bundle_vars["deploy_bundle_image"] = bundle

        return PlaybookRun("gpu_operator_deploy_from_operatorhub", bundle_vars)
Exemple #10
0
    def deploy_from_operatorhub(channel=None):
        """
        Deploy the NFD operator from OperatorHub.

        Args:
            channel: The operator hub channel to deploy. e.g. 4.7

        Returns:
            The result of PlaybookRun for the "nfd_operator_deploy_from_operatorhub" playbook.
        """
        # The channel is forwarded only when the caller provided one.
        nfd_vars = {} if channel is None else {"nfd_channel": channel}

        return PlaybookRun("nfd_operator_deploy_from_operatorhub", nfd_vars)
Exemple #11
0
    def run_gpu_burn(runtime=None):
        """
        Run the GPU burn on the cluster.

        Args:
            runtime: How long to run the GPU for, in seconds

        Returns:
            The result of PlaybookRun for the "gpu_operator_run_gpu-burn" playbook.
        """
        burn_vars = {}
        if runtime is None:
            # No duration override is forwarded to the playbook.
            return PlaybookRun("gpu_operator_run_gpu-burn", burn_vars)

        burn_vars["gpu_burn_time"] = runtime
        print(f"Running GPU Burn for {runtime} seconds.")
        return PlaybookRun("gpu_operator_run_gpu-burn", burn_vars)
Exemple #12
0
    def set_repo_config(repo_file, dest_dir=None):
        """
        Set the GPU-operator driver yum repo configuration file.

        Args:
            repo_file: Absolute path to the repo file
            dest_dir: The destination dir in the pod to place the repo in

        Returns:
            The result of PlaybookRun for the "gpu_operator_set_repo-config" playbook.
        """
        repo_vars = {"gpu_operator_set_repo_filename": repo_file}

        # The destination directory is forwarded only when explicitly given.
        if dest_dir is not None:
            repo_vars.update({"gpu_operator_set_repo_destdir": dest_dir})

        return PlaybookRun("gpu_operator_set_repo-config", repo_vars)
Exemple #13
0
    def run_e2e_test(git_repo, git_ref):
        """
        Run the e2e test on the given SRO repo and ref.

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to deploy from, e.g. master

        Returns:
            The result of PlaybookRun for the "sro_run_e2e_test" playbook.
        """
        sro_vars = {"sro_git_repo": git_repo, "sro_git_ref": git_ref}
        return PlaybookRun("sro_run_e2e_test", sro_vars)
Exemple #14
0
    def undeploy_from_commit(git_repo, git_ref):
        """
        Undeploy an SRO-operator that was deployed from a commit.

        Args:
            git_repo: The git repository to undeploy, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to undeploy, e.g. master

        Returns:
            The result of PlaybookRun for the "sro_undeploy_custom_commit" playbook.
        """
        sro_vars = {"sro_git_repo": git_repo, "sro_git_ref": git_ref}
        return PlaybookRun("sro_undeploy_custom_commit", sro_vars)
Exemple #15
0
    def test_cluster(no_inspect=False):
        """
        Test the cluster entitlement.

        Args:
            no_inspect: Do not inspect on failure

        Returns:
            The result of PlaybookRun for the "entitlement_test" playbook.
        """
        if not no_inspect:
            # Nothing to override: run the playbook without extra variables.
            return PlaybookRun("entitlement_test", {})

        print("INFO: Inspect on failure disabled.")
        return PlaybookRun("entitlement_test",
                           {"entitlement_inspect_on_failure": "no"})
Exemple #16
0
    def deploy(pem, pem_ca=None):
        """
        Deploy a cluster-wide entitlement key & RHSM config file
        (and optionally a YUM repo certificate) with the help of
        MachineConfig resources.

        Args:
            pem: Entitlement PEM file
            pem_ca: YUM repo certificate

        Returns:
            The result of PlaybookRun for the "entitlement_deploy" playbook.
        """
        entitlement_vars = {"entitlement_pem": pem}

        # The repo CA is optional and forwarded only when provided.
        if pem_ca is not None:
            entitlement_vars.update({"entitlement_repo_ca": pem_ca})

        return PlaybookRun("entitlement_deploy", entitlement_vars)
Exemple #17
0
    def prepare_test_alerts(alert_delay=1, alert_prefix="CI"):
        """
        Prepare test alerts based on the existing GPU Operator alerts.
        Test alerts have a shorter delay than default alerts.

        Args:
          alert_delay: Delay (in minutes) before the alerts fire.
          alert_prefix: Prefix to prepend to the alert names, to distinguish them from the normal alerts.

        Returns:
          The result of PlaybookRun for the "gpu_operator_prepare_test_alerts" playbook.
        """
        alert_vars = {}
        alert_vars["gpu_operator_test_alerts_delay"] = alert_delay
        alert_vars["gpu_operator_test_alerts_prefix"] = alert_prefix

        return PlaybookRun("gpu_operator_prepare_test_alerts", alert_vars)
Exemple #18
0
    def deploy_from_commit(git_repo, git_ref, image_tag=None):
        """
        Deploy the NFD operator from the given git commit.

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift/cluster-nfd-operator.git
            git_ref: The git ref to deploy from, e.g. master
            image_tag: The NFD operator image tag UID.

        Returns:
            The result of PlaybookRun for the "nfd_operator_deploy_custom_commit" playbook.
        """
        nfd_vars = {"nfd_operator_git_repo": git_repo, "nfd_operator_git_ref": git_ref}

        # The image tag is optional and forwarded only when provided.
        if image_tag is not None:
            nfd_vars.update({"nfd_operator_image_tag": image_tag})

        return PlaybookRun("nfd_operator_deploy_custom_commit", nfd_vars)
Exemple #19
0
    def deploy_from_commit(git_repo, git_ref, image_tag=None):
        """
        Deploy the SRO operator from the given git commit.

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to deploy from, e.g. master
            image_tag: The SRO operator image tag UID.

        Returns:
            The result of PlaybookRun for the "sro_deploy_custom_commit" playbook.
        """
        sro_vars = {"sro_git_repo": git_repo, "sro_git_ref": git_ref}

        # The image tag is optional and forwarded only when provided.
        if image_tag is not None:
            sro_vars.update({"sro_image_tag": image_tag})

        return PlaybookRun("sro_deploy_custom_commit", sro_vars)
Exemple #20
0
    def wait_for_alert(alert_name, alert_active: bool):
        """
        Wait for an alert to be active or inactive.

        Args:
            alert_name: The name of the alert to wait for
            alert_active: A boolean telling if the alert should be active or not.
                Accepts a Python bool or the strings "true" / "false".

        Returns:
            The result of PlaybookRun for the "cluster_wait_for_alert" playbook.

        Exits:
            Calls sys.exit(1) after printing an error message when alert_active
            is neither a bool nor one of the strings "true" / "false".
        """

        # The annotation advertises a bool, so real booleans must be accepted;
        # they are normalized to the "true"/"false" strings that are forwarded
        # to the playbook.
        if isinstance(alert_active, bool):
            alert_active = "true" if alert_active else "false"

        if alert_active not in ("true", "false"):
            print(f"Unexpected value for alert_active: '{alert_active}'. Expected a boolean (true|false).")
            sys.exit(1)

        opts = {
            "cluster_wait_for_alert_name": alert_name,
            "cluster_wait_for_alert_active": alert_active,
        }

        return PlaybookRun("cluster_wait_for_alert", opts)
Exemple #21
0
    def run_nvidiadl_ssd(node_hostname, namespace="default", pvc_name=None):
        """
        Run the NVIDIA Deep Learning SSD Detection training benchmark.

        Args:
            node_hostname: Hostname of the node where the ssd benchmark will be executed.
            namespace: Name of the namespace in which the resources will be created.
            pvc_name: Name of the PVC where the coco dataset is stored.

        Returns:
            The result of PlaybookRun for the "benchmarking_run_nvidiadl_ssd" playbook.
        """
        benchmark_vars = {
            "benchmarking_node_hostname": node_hostname,
            "benchmarking_namespace": namespace,
        }

        # The PVC name is optional and forwarded only when provided.
        if pvc_name is not None:
            benchmark_vars["benchmarking_coco_dataset_pvc_name"] = pvc_name
            print(f"Using '{pvc_name}' as PVC where the coco dataset is stored.")

        return PlaybookRun("benchmarking_run_nvidiadl_ssd", benchmark_vars)
Exemple #22
0
    def deploy_from_operatorhub(namespace, version=None, channel=None, installPlan="Manual"):
        """
        Deploy the GPU operator from OperatorHub.

        Args:
            namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen. Suggested namespace is: nvidia-gpu-operator.
            version: The version to deploy. If unspecified, deploys the latest version available in OperatorHub. Run the toolbox gpu_operator list_version_from_operator_hub subcommand to see the available versions.
            channel: Optional channel to deploy from. Only allowed together with a version.
            installPlan: Optional InstallPlan approval mode (Automatic or Manual [default])

        Returns:
            The result of PlaybookRun for the "gpu_operator_deploy_from_operatorhub" playbook.

        Exits:
            Calls sys.exit(1) after printing an error message when a channel is
            given without a version, or when installPlan is not "Manual" or "Automatic".
        """
        approval_modes = ("Manual", "Automatic")
        has_version = version is not None

        hub_vars = {"gpu_operator_target_namespace": namespace}

        if has_version:
            hub_vars["gpu_operator_operatorhub_version"] = version
            print(f"Deploying the GPU Operator from OperatorHub using version '{version}'.")

        if channel is not None:
            # A channel is only meaningful alongside an explicit version.
            if not has_version:
                print("Channel may only be specified if --version is specified")
                sys.exit(1)

            hub_vars["gpu_operator_operatorhub_channel"] = channel
            print(f"Deploying the GPU Operator from OperatorHub using channel '{channel}'.")

        hub_vars["gpu_operator_installplan_approval"] = installPlan
        if installPlan not in approval_modes:
            print(f"InstallPlan can only be Manual or Automatic. Received '{installPlan}'.")
            sys.exit(1)

        print(f"Deploying the GPU Operator from OperatorHub using InstallPlan approval '{installPlan}'.")
        print("Deploying the GPU Operator from OperatorHub using its master bundle.")

        return PlaybookRun("gpu_operator_deploy_from_operatorhub", hub_vars)
Exemple #23
0
    def deploy(ci_command, git_repository, git_reference, tag_uid=None):
        """
        Run a given CI command.

        Args:
            ci_command: The CI command to run, for example "run gpu-ci"
            git_repository: The git repository to run the command from, e.g. https://github.com/openshift-psap/ci-artifacts.git
            git_reference: The git ref to run the command from, e.g. master
            tag_uid: The local CI image tag UID. A random 8-hex-digit value is generated when omitted.

        Returns:
            The result of PlaybookRun for the "local-ci_deploy" playbook.
        """

        image_tag_uid = secrets.token_hex(4) if tag_uid is None else tag_uid

        ci_vars = {
            "local_ci_git_repo": git_repository,
            "local_ci_git_ref": git_reference,
            "local_ci_image_tag_uid": image_tag_uid,
        }

        # The command is handed over through the process environment rather
        # than through the playbook variables.
        os.environ["LOCAL_CI_COMMAND"] = ci_command

        return PlaybookRun("local-ci_deploy", ci_vars)
Exemple #24
0
    def set_scale(instance_type, scale, base_machineset=None, force=False):
        """
        Ensure that the cluster has exactly `scale` nodes with instance_type `instance_type`.

        If the machinesets of the given instance type already have the required total number of replicas,
        their replica parameters will not be modified.
        Otherwise,
        - If there's only one machineset with the given instance type, its replicas will be set to the value of this parameter.

        - If there are other machinesets with non-zero replicas, the playbook will fail, unless the 'force_scale' parameter is
        set to true. In that case, the number of replicas of the other machinesets will be zeroed before setting the replicas
        of the first machineset to the value of this parameter.

        - If `--base-machineset=machineset` flag is passed, `machineset` machineset will be used to derive the new
        machineset (otherwise, the first machineset of the listing will be used). This is useful if the desired `instance_type`
        is only available in some specific regions and, controlled by different machinesets.

        Example: ./run_toolbox.py cluster set_scale g4dn.xlarge 1 # ensure that the cluster has 1 GPU node

        Args:
            instance_type: The instance type to use, for example, g4dn.xlarge
            scale: The number of required nodes with given instance type
            base_machineset: Name of a machineset to use to derive the new one. Default: pickup the first machineset found in `oc get machinesets -n openshift-machine-api`.
            force: When truthy, sets the 'force_scale' playbook parameter to "true".

        Returns:
            The result of PlaybookRun for the "cluster_set_scale" playbook.
        """
        scale_vars = {"machineset_instance_type": instance_type, "scale": scale}

        # Optional settings are forwarded only when requested by the caller.
        if base_machineset is not None:
            scale_vars.update({"base_machineset": base_machineset})

        if force:
            scale_vars.update({"force_scale": "true"})

        return PlaybookRun("cluster_set_scale", scale_vars)
Exemple #25
0
 def wait():
     """
     Wait for the entitlement to be deployed.

     Returns:
         The result of PlaybookRun for the "entitlement_wait" playbook.
     """
     playbook_name = "entitlement_wait"
     return PlaybookRun(playbook_name)
Exemple #26
0
    def undeploy_from_commit():
        """
        Undeploy a GPU-operator that was deployed from a commit.

        Returns:
            The result of PlaybookRun for the "gpu_operator_undeploy_custom_commit" playbook.
        """
        playbook_name = "gpu_operator_undeploy_custom_commit"
        return PlaybookRun(playbook_name)
Exemple #27
0
    def undeploy_from_operatorhub():
        """
        Undeploy a GPU-operator that was deployed from OperatorHub.

        Returns:
            The result of PlaybookRun for the "gpu_operator_undeploy_from_operatorhub" playbook.
        """
        playbook_name = "gpu_operator_undeploy_from_operatorhub"
        return PlaybookRun(playbook_name)
Exemple #28
0
 def cleanup_bundle_from_commit():
     """
     Clean up the resources left over from building a bundle from a commit.

     Returns:
         The result of PlaybookRun for the "gpu_operator_cleanup_bundle_from_commit" playbook.
     """
     playbook_name = "gpu_operator_cleanup_bundle_from_commit"
     return PlaybookRun(playbook_name)
Exemple #29
0
 def capture_deployment_state():
     """
     Capture the GPU operator deployment state.

     Returns:
         The result of PlaybookRun for the "gpu_operator_capture-deployment-state" playbook.
     """
     playbook_name = "gpu_operator_capture-deployment-state"
     return PlaybookRun(playbook_name)
Exemple #30
0
 def wait_deployment():
     """
     Wait for the GPU operator to deploy.

     Returns:
         The result of PlaybookRun for the "gpu_operator_wait_deployment" playbook.
     """
     playbook_name = "gpu_operator_wait_deployment"
     return PlaybookRun(playbook_name)