Exemple #1
    def download_coco_dataset(node_hostname, namespace="default", pvc_name=None,
                              mirror_base_url=None, client_cert=None):
        """
        Downloads the COCO dataset into a PVC of the cluster

        Args:
            node_hostname: Hostname of the node where the download pod will be executed.
            namespace: Name of the namespace in which the resources will be created.
            pvc_name: Name of the PVC that will be created to store the dataset files.
            mirror_base_url: Optional base URL where to fetch the dataset
            client_cert: Optional path to the client cert to use for accessing the base URL.

        Returns:
            The `PlaybookRun` result for the `benchmarking_deploy_coco_dataset` playbook.
        """
        # NOTE(review): the parameter defaults were reconstructed from the
        # `is not None` checks below and from `run_nvidiadl_ssd`; confirm
        # against the callers.
        opts = {
            "benchmarking_node_hostname": node_hostname,
            "benchmarking_namespace": namespace,
        }

        if pvc_name is not None:
            # No trailing comma here: it would store a 1-tuple instead of the name.
            opts["benchmarking_coco_dataset_pvc_name"] = pvc_name
            print(f"Using '{pvc_name}' as PVC name.")

        if mirror_base_url is not None:
            opts["benchmarking_coco_dataset_mirror_base_url"] = mirror_base_url
            print(f"Using '{mirror_base_url}' as mirror base URL.")

        if client_cert is not None:
            opts["benchmarking_coco_dataset_client_cert"] = client_cert
            print(f"Using '{client_cert}' as client certificate.")

        return PlaybookRun("benchmarking_deploy_coco_dataset", opts)
Exemple #2
    def get_csv_version():
        """
        Get the version of the GPU Operator currently installed from OLM
        Stores the version in the 'ARTIFACT_EXTRA_LOGS_DIR' artifacts directory.

        Returns:
            The `PlaybookRun` result for the `gpu_operator_get_csv_version` playbook.
        """
        return PlaybookRun("gpu_operator_get_csv_version")
Exemple #3
    def capture_environment():
        """
        Captures the cluster environment

        Returns:
            The `PlaybookRun` result for the `cluster_capture_environment` playbook.
        """
        return PlaybookRun("cluster_capture_environment")
Exemple #4
    def upgrade_to_image(image):
        """
        Upgrades the cluster to the given image

        Args:
            image: The image to upgrade the cluster to

        Returns:
            The `PlaybookRun` result for the `cluster_upgrade_to_image` playbook.
        """
        opts = {"cluster_upgrade_image": image}
        return PlaybookRun("cluster_upgrade_to_image", opts)
Exemple #5
    def bundle_from_commit(
        git_repo,
        git_ref,
        quay_push_secret,
        quay_image_name,
        tag_uid=None,
        namespace=None,
        with_validator=False,
        with_driver=False,
        publish_to_quay=False,
    ):
        """
        Build an image of the GPU Operator from sources (<git repository> <git reference>)
        and push it to quay.io <quay_image_name>:operator_bundle_gpu-operator-<gpu_operator_image_tag_uid>
        using the <quay_push_secret> credentials.

        Example parameters - https://github.com/NVIDIA/gpu-operator.git master /path/to/quay_secret.yaml quay.io/org/image_name

        See 'oc get imagestreamtags -n gpu-operator-ci -oname' for the tag-uid to reuse.

        Args:
            git_repo: Git repository URL to generate bundle of
            git_ref: Git ref to bundle
            quay_push_secret: A Kube Secret YAML file with `.dockerconfigjson` data and type kubernetes.io/dockerconfigjson
            quay_image_name: The quay repo to push to
            tag_uid: Optional image tag suffix to use. A random 8-hex-digit value is generated when omitted.
            namespace: Optional namespace to use to deploy the GPU Operator. Default: nvidia-gpu-operator
            with_validator: Optional flag to enable building the validator image (default: false)
            with_driver: Optional flag to enable building the driver image (default: false)
            publish_to_quay: Optional flag to publish the full bundle (including images) to Quay.io (default: false)

        Returns:
            The `PlaybookRun` result for the `gpu_operator_bundle_from_commit` playbook.
        """
        # NOTE(review): the parameter list was reconstructed from the docstring
        # order/defaults and from the names used in the body; confirm against
        # the callers.
        if tag_uid is None:
            tag_uid = secrets.token_hex(4)

        def to_y(_s):
            # Flags are forwarded as "y" (enabled) or "" (disabled).
            if not _s:
                return ""
            if isinstance(_s, bool):
                return "y"  # only True can reach this point
            if str(_s).lower() in ("false", "n", "no"):
                return ""
            return "y"

        opts = {
            "gpu_operator_git_repo": git_repo,
            "gpu_operator_git_ref": git_ref,
            "gpu_operator_image_tag_uid": tag_uid,
            "gpu_operator_commit_quay_push_secret": quay_push_secret,
            "gpu_operator_commit_quay_image_name": quay_image_name,
            "gpu_operator_with_driver": to_y(with_driver),
            "gpu_operator_with_validator": to_y(with_validator),
            "gpu_operator_publish_to_quay": to_y(publish_to_quay),
        }

        if namespace is not None:
            opts["gpu_operator_target_namespace"] = namespace

        return PlaybookRun("gpu_operator_bundle_from_commit", opts)
Exemple #6
    def test_in_cluster(pem_key):
        """
        Tests a given PEM entitlement key on a cluster

        Args:
            pem_key: The PEM entitlement key to test

        Returns:
            The `PlaybookRun` result for the `entitlement_test_in_cluster` playbook.
        """
        opts = {"entitlement_pem": pem_key}
        return PlaybookRun("entitlement_test_in_cluster", opts)
Exemple #7
    def test_in_podman(pem_key):
        """
        Tests a given PEM entitlement key using a podman container

        Args:
            pem_key: The PEM entitlement key to test

        Returns:
            The `PlaybookRun` result for the `entitlement_test_in_podman` playbook.
        """
        opts = {"entitlement_pem": pem_key}
        return PlaybookRun("entitlement_test_in_podman", opts)
Exemple #8
 def deploy_cluster_policy():
     """
     Creates the ClusterPolicy from the OLM ClusterServiceVersion

     Returns:
         The `PlaybookRun` result for the deployment playbook, run with
         `gpu_operator_deploy_from` set to "pre-deployed".
     """
     print("Creating the ClusterPolicy from the CSV")
     # NOTE(review): the playbook name was missing from this call; it is
     # inferred from `deploy_from_bundle`, which passes the same
     # "gpu_operator_deploy_from" option to this playbook -- confirm.
     return PlaybookRun(
         "gpu_operator_deploy_from_operatorhub",
         {"gpu_operator_deploy_from": "pre-deployed"},
     )
Exemple #9
    def deploy_from_bundle(bundle, namespace):
        """
        Deploys the GPU Operator from a bundle

        Args:
            bundle: Either a bundle OCI image or "master" to deploy the latest bundle
            namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen (except 'openshift-operators'). Suggested namespace is: nvidia-gpu-operator.

        Returns:
            The `PlaybookRun` result for the `gpu_operator_deploy_from_operatorhub` playbook.
        """
        opts = {
            "gpu_operator_deploy_from": "bundle",
            "gpu_operator_target_namespace": namespace,
        }

        if bundle == 'master':
            # "master" needs no explicit bundle image option.
            print("Deploying the GPU Operator from OperatorHub using the master bundle")
        else:
            opts["deploy_bundle_image"] = bundle

        return PlaybookRun("gpu_operator_deploy_from_operatorhub", opts)
Exemple #10
    def deploy_from_operatorhub(channel=None):
        """
        Deploys the NFD Operator from OperatorHub

        Args:
            channel: The operator hub channel to deploy. e.g. 4.7

        Returns:
            The `PlaybookRun` result for the `nfd_operator_deploy_from_operatorhub` playbook.
        """
        opts = {}

        if channel is not None:
            opts["nfd_channel"] = channel

        return PlaybookRun("nfd_operator_deploy_from_operatorhub", opts)
Exemple #11
    def run_gpu_burn(runtime=None):
        """
        Runs the GPU burn on the cluster

        Args:
            runtime: How long to run the GPU for, in seconds

        Returns:
            The `PlaybookRun` result for the `gpu_operator_run_gpu-burn` playbook.
        """
        opts = {}

        if runtime is not None:
            opts["gpu_burn_time"] = runtime
            print(f"Running GPU Burn for {runtime} seconds.")

        return PlaybookRun("gpu_operator_run_gpu-burn", opts)
Exemple #12
    def set_repo_config(repo_file, dest_dir=None):
        """
        Sets the GPU-operator driver yum repo configuration file

        Args:
            repo_file: Absolute path to the repo file
            dest_dir: The destination dir in the pod to place the repo in

        Returns:
            The `PlaybookRun` result for the `gpu_operator_set_repo-config` playbook.
        """
        opts = {"gpu_operator_set_repo_filename": repo_file}

        if dest_dir is not None:
            opts["gpu_operator_set_repo_destdir"] = dest_dir

        return PlaybookRun("gpu_operator_set_repo-config", opts)
Exemple #13
    def run_e2e_test(git_repo, git_ref):
        """
        Runs e2e test on the given SRO repo and ref

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to deploy from, e.g. master

        Returns:
            The `PlaybookRun` result for the `sro_run_e2e_test` playbook.
        """
        opts = {
            "sro_git_repo": git_repo,
            "sro_git_ref": git_ref,
        }

        return PlaybookRun("sro_run_e2e_test", opts)
Exemple #14
    def undeploy_from_commit(git_repo, git_ref):
        """
        Undeploys an SRO-operator that was deployed from commit

        Args:
            git_repo: The git repository to undeploy, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to undeploy, e.g. master

        Returns:
            The `PlaybookRun` result for the `sro_undeploy_custom_commit` playbook.
        """
        opts = {
            "sro_git_repo": git_repo,
            "sro_git_ref": git_ref,
        }

        return PlaybookRun("sro_undeploy_custom_commit", opts)
Exemple #15
    def test_cluster(no_inspect=False):
        """
        Tests the cluster entitlement

        Args:
            no_inspect: Do not inspect on failure

        Returns:
            The `PlaybookRun` result for the `entitlement_test` playbook.
        """
        opts = {}

        if no_inspect:
            print("INFO: Inspect on failure disabled.")
            opts["entitlement_inspect_on_failure"] = "no"

        return PlaybookRun("entitlement_test", opts)
Exemple #16
    def deploy(pem, pem_ca=None):
        """
        Deploys a cluster-wide entitlement key & RHSM config file
        (and optionally a YUM repo certificate) with the help of
        MachineConfig resources.

        Args:
            pem: Entitlement PEM file
            pem_ca: YUM repo certificate

        Returns:
            The `PlaybookRun` result for the `entitlement_deploy` playbook.
        """
        opts = {"entitlement_pem": pem}

        if pem_ca is not None:
            opts["entitlement_repo_ca"] = pem_ca

        return PlaybookRun("entitlement_deploy", opts)
Exemple #17
    def prepare_test_alerts(alert_delay=1, alert_prefix="CI"):
        """
        Prepare test alerts based on the existing GPU Operator alerts.
        Test alerts have a shorter delay than default alerts.

        Args:
            alert_delay: Delay (in minutes) before the alerts fire.
            alert_prefix: Prefix to prepend to the alert names, to distinguish them from the normal alerts.

        Returns:
            The `PlaybookRun` result for the `gpu_operator_prepare_test_alerts` playbook.
        """
        opts = {
            "gpu_operator_test_alerts_delay": alert_delay,
            "gpu_operator_test_alerts_prefix": alert_prefix,
        }

        return PlaybookRun("gpu_operator_prepare_test_alerts", opts)
Exemple #18
    def deploy_from_commit(git_repo, git_ref, image_tag=None):
        """
        Deploys the NFD operator from the given git commit

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift/cluster-nfd-operator.git
            git_ref: The git ref to deploy from, e.g. master
            image_tag: The NFD operator image tag UID.

        Returns:
            The `PlaybookRun` result for the `nfd_operator_deploy_custom_commit` playbook.
        """
        opts = {
            "nfd_operator_git_repo": git_repo,
            "nfd_operator_git_ref": git_ref,
        }

        if image_tag is not None:
            opts["nfd_operator_image_tag"] = image_tag

        return PlaybookRun("nfd_operator_deploy_custom_commit", opts)
Exemple #19
    def deploy_from_commit(git_repo, git_ref, image_tag=None):
        """
        Deploys the SRO operator from the given git commit

        Args:
            git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
            git_ref: The git ref to deploy from, e.g. master
            image_tag: The SRO operator image tag UID.

        Returns:
            The `PlaybookRun` result for the `sro_deploy_custom_commit` playbook.
        """
        opts = {
            "sro_git_repo": git_repo,
            "sro_git_ref": git_ref,
        }

        if image_tag is not None:
            opts["sro_image_tag"] = image_tag

        return PlaybookRun("sro_deploy_custom_commit", opts)
Exemple #20
    def wait_for_alert(alert_name, alert_active: bool):
        """
        Wait for an alert to be active or inactive.

        Args:
            alert_name: The name of the alert to wait for
            alert_active: A boolean telling if the alert should be active or not (true|false).
                Accepts the strings "true"/"false" or a Python bool.

        Returns:
            The `PlaybookRun` result for the `cluster_wait_for_alert` playbook.

        Raises:
            SystemExit: If `alert_active` is not a valid boolean value.
        """
        # The signature advertises a bool, but only the lowercase strings are
        # forwarded; normalize real booleans to them.
        if isinstance(alert_active, bool):
            alert_active = "true" if alert_active else "false"

        if alert_active not in ("true", "false"):
            print(f"Unexpected value for alert_active: '{alert_active}'. Expected a boolean (true|false).")
            # Abort instead of forwarding an invalid value to the playbook.
            raise SystemExit(1)

        opts = {
            "cluster_wait_for_alert_name": alert_name,
            "cluster_wait_for_alert_active": alert_active,
        }

        return PlaybookRun("cluster_wait_for_alert", opts)
Exemple #21
    def run_nvidiadl_ssd(node_hostname, namespace="default", pvc_name=None):
        """
        Run NVIDIA Deep Learning SSD Detection training benchmark.

        Args:
            node_hostname: Hostname of the node where the ssd benchmark will be executed.
            namespace: Name of the namespace in which the resources will be created.
            pvc_name: Name of the PVC where the coco dataset is stored.

        Returns:
            The `PlaybookRun` result for the `benchmarking_run_nvidiadl_ssd` playbook.
        """
        opts = {
            "benchmarking_node_hostname": node_hostname,
            "benchmarking_namespace": namespace,
        }

        if pvc_name is not None:
            opts["benchmarking_coco_dataset_pvc_name"] = pvc_name
            print(f"Using '{pvc_name}' as PVC where the coco dataset is stored.")

        return PlaybookRun("benchmarking_run_nvidiadl_ssd", opts)
Exemple #22
    def deploy_from_operatorhub(namespace, version=None, channel=None, installPlan="Manual"):
        """
        Deploys the GPU operator from OperatorHub

        Args:
            namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen. Suggested namespace is: nvidia-gpu-operator.
            version: The version to deploy. If unspecified, deploys the latest version available in OperatorHub. Run the toolbox gpu_operator list_version_from_operator_hub subcommand to see the available versions.
            channel: Optional channel to deploy from.
            installPlan: Optional InstallPlan approval mode (Automatic or Manual [default])

        Returns:
            The `PlaybookRun` result for the `gpu_operator_deploy_from_operatorhub` playbook.

        Raises:
            SystemExit: If `channel` is given without `version`, or if
                `installPlan` is neither "Manual" nor "Automatic".
        """
        opts = {"gpu_operator_target_namespace": namespace}

        if version is not None:
            opts["gpu_operator_operatorhub_version"] = version
            print(f"Deploying the GPU Operator from OperatorHub using version '{version}'.")

        if channel is not None:
            if version is None:
                print("Channel may only be specified if --version is specified")
                # Abort: the combination was just reported as invalid.
                raise SystemExit(1)

            opts["gpu_operator_operatorhub_channel"] = channel
            print(f"Deploying the GPU Operator from OperatorHub using channel '{channel}'.")

        if installPlan not in ("Manual", "Automatic"):
            print(f"InstallPlan can only be Manual or Automatic. Received '{installPlan}'.")
            raise SystemExit(1)

        opts["gpu_operator_installplan_approval"] = installPlan
        print(f"Deploying the GPU Operator from OperatorHub using InstallPlan approval '{installPlan}'.")

        print("Deploying the GPU Operator from OperatorHub using its master bundle.")
        return PlaybookRun("gpu_operator_deploy_from_operatorhub", opts)
Exemple #23
    def deploy(ci_command, git_repository, git_reference, tag_uid=None):
        """
        Runs a given CI command

        Args:
            ci_command: The CI command to run, for example "run gpu-ci"
            git_repository: The git repository to run the command from, e.g. https://github.com/openshift-psap/ci-artifacts.git
            git_reference: The git ref to run the command from, e.g. master
            tag_uid: The local CI image tag UID. A random 8-hex-digit value is generated when omitted.

        Returns:
            The `PlaybookRun` result for the `local-ci_deploy` playbook.
        """
        if tag_uid is None:
            tag_uid = secrets.token_hex(4)

        opts = {
            "local_ci_git_repo": git_repository,
            "local_ci_git_ref": git_reference,
            "local_ci_image_tag_uid": tag_uid,
        }

        # The command is handed over through the process environment rather
        # than through the playbook options.
        os.environ["LOCAL_CI_COMMAND"] = ci_command

        return PlaybookRun("local-ci_deploy", opts)
Exemple #24
    def set_scale(instance_type, scale, base_machineset=None, force=False):
        """
        Ensures that the cluster has exactly `scale` nodes with instance_type `instance_type`

        If the machinesets of the given instance type already have the required total number of replicas,
        their replica parameters will not be modified.
        Otherwise:
        - If there's only one machineset with the given instance type, its replicas will be set to the value of this parameter.

        - If there are other machinesets with non-zero replicas, the playbook will fail, unless the 'force_scale' parameter is
        set to true. In that case, the number of replicas of the other machinesets will be zeroed before setting the replicas
        of the first machineset to the value of this parameter.

        - If `--base-machineset=machineset` flag is passed, `machineset` machineset will be used to derive the new
        machineset (otherwise, the first machineset of the listing will be used). This is useful if the desired `instance_type`
        is only available in some specific regions and, controlled by different machinesets.

        Example: ./run_toolbox.py cluster set_scale g4dn.xlarge 1 # ensure that the cluster has 1 GPU node

        Args:
            instance_type: The instance type to use, for example, g4dn.xlarge
            scale: The number of required nodes with given instance type
            base_machineset: Name of a machineset to use to derive the new one. Default: pickup the first machineset found in `oc get machinesets -n openshift-machine-api`.
            force: When true, sets the 'force_scale' playbook parameter to "true".

        Returns:
            The `PlaybookRun` result for the `cluster_set_scale` playbook.
        """
        opts = {
            "machineset_instance_type": instance_type,
            "scale": scale,
        }

        if base_machineset is not None:
            opts["base_machineset"] = base_machineset

        if force:
            opts["force_scale"] = "true"

        return PlaybookRun("cluster_set_scale", opts)
Exemple #25
 def wait():
     """
     Waits for entitlement to be deployed

     Returns:
         The `PlaybookRun` result for the `entitlement_wait` playbook.
     """
     return PlaybookRun("entitlement_wait")
Exemple #26
    def undeploy_from_commit():
        """
        Undeploys a GPU-operator that was deployed from a commit

        Returns:
            The `PlaybookRun` result for the `gpu_operator_undeploy_custom_commit` playbook.
        """
        return PlaybookRun("gpu_operator_undeploy_custom_commit")
Exemple #27
    def undeploy_from_operatorhub():
        """
        Undeploys a GPU-operator that was deployed from OperatorHub

        Returns:
            The `PlaybookRun` result for the `gpu_operator_undeploy_from_operatorhub` playbook.
        """
        return PlaybookRun("gpu_operator_undeploy_from_operatorhub")
Exemple #28
 def cleanup_bundle_from_commit():
     """
     Cleanup resources leftover from building a bundle from a commit

     Returns:
         The `PlaybookRun` result for the `gpu_operator_cleanup_bundle_from_commit` playbook.
     """
     return PlaybookRun("gpu_operator_cleanup_bundle_from_commit")
Exemple #29
 def capture_deployment_state():
     """
     Captures the GPU operator deployment state

     Returns:
         The `PlaybookRun` result for the `gpu_operator_capture-deployment-state` playbook.
     """
     return PlaybookRun("gpu_operator_capture-deployment-state")
Exemple #30
 def wait_deployment():
     """
     Waits for the GPU operator to deploy

     Returns:
         The `PlaybookRun` result for the `gpu_operator_wait_deployment` playbook.
     """
     return PlaybookRun("gpu_operator_wait_deployment")