def download_coco_dataset(
    node_hostname,
    namespace="default",
    pvc_name=None,
    mirror_base_url=None,
    client_cert=None,
):
    """
    Downloads the COCO dataset into a PVC of the cluster

    Args:
        node_hostname: Hostname of the node where the download pod will be executed.
        namespace: Name of the namespace in which the resources will be created.
        pvc_name: Name of the PVC that will be created to store the dataset files.
        mirror_base_url: Optional base URL where to fetch the dataset
        client_cert: Optional path to the client cert to use for accessing the base URL.
    """
    opts = {
        "benchmarking_node_hostname": node_hostname,
        "benchmarking_namespace": namespace,
    }

    if pvc_name is not None:
        # Store the plain value: a trailing comma on this assignment would
        # turn it into a 1-tuple instead of the PVC name itself.
        opts["benchmarking_coco_dataset_pvc_name"] = pvc_name
        print(f"Using '{pvc_name}' as PVC name.")

    if mirror_base_url is not None:
        opts["benchmarking_coco_dataset_mirror_base_url"] = mirror_base_url
        print(f"Using '{mirror_base_url}' as mirror base URL.")

    if client_cert is not None:
        opts["benchmarking_coco_dataset_client_cert"] = client_cert
        print(f"Using '{client_cert}' as client certificate.")

    return PlaybookRun("benchmarking_deploy_coco_dataset", opts)
def get_csv_version():
    """
    Retrieve from OLM the version of the GPU Operator currently installed

    The version is stored in the 'ARTIFACT_EXTRA_LOGS_DIR' artifacts directory.
    """
    playbook_name = "gpu_operator_get_csv_version"
    return PlaybookRun(playbook_name)
def capture_environment():
    """
    Captures the cluster environment

    This command takes no argument.
    """
    return PlaybookRun("cluster_capture_environment")
def upgrade_to_image(image):
    """
    Upgrade the cluster to the specified image

    Args:
        image: The image the cluster should be upgraded to
    """
    upgrade_opts = {"cluster_upgrade_image": image}
    return PlaybookRun("cluster_upgrade_to_image", upgrade_opts)
def bundle_from_commit(
    git_repo,
    git_ref,
    quay_push_secret,
    quay_image_name,
    tag_uid=None,
    namespace=None,
    with_validator=False,
    with_driver=False,
    publish_to_quay=False
):
    """
    Build an image of the GPU Operator from sources (<git repository> <git reference>)
    and push it to quay.io <quay_image_name>:operator_bundle_gpu-operator-<gpu_operator_image_tag_uid>
    using the <quay_push_secret> credentials.

    Example parameters - https://github.com/NVIDIA/gpu-operator.git master /path/to/quay_secret.yaml quay.io/org/image_name

    See 'oc get imagestreamtags -n gpu-operator-ci -oname' for the tag-uid to reuse.

    Args:
        git_repo: Git repository URL to generate bundle of
        git_ref: Git ref to bundle
        quay_push_secret: A Kube Secret YAML file with `.dockerconfigjson` data and type kubernetes.io/dockerconfigjson
        quay_image_name: The quay repo to push to
        tag_uid: Optional image tag suffix to use. A random one is generated if unspecified.
        namespace: Optional namespace to use to deploy the GPU Operator. Default: nvidia-gpu-operator
        with_validator: Optional flag to enable building the validator image (default: false)
        with_driver: Optional flag to enable building the driver image (default: false)
        publish_to_quay: Optional flag to publish the full bundle (including images) to Quay.io (default: false)
    """
    if tag_uid is None:
        tag_uid = secrets.token_hex(4)

    def to_y(flag):
        # Flags may arrive as booleans or as text: map anything falsy, or
        # spelled false/n/no (case-insensitive), to "" and the rest to "y".
        if not flag:
            return ""
        return "" if str(flag).lower() in ("false", "n", "no") else "y"

    opts = {
        "gpu_operator_git_repo": git_repo,
        "gpu_operator_git_ref": git_ref,
        "gpu_operator_image_tag_uid": tag_uid,
        "gpu_operator_commit_quay_push_secret": quay_push_secret,
        "gpu_operator_commit_quay_image_name": quay_image_name,
        "gpu_operator_with_driver": to_y(with_driver),
        "gpu_operator_with_validator": to_y(with_validator),
        "gpu_operator_publish_to_quay": to_y(publish_to_quay),
    }

    # The target namespace is only passed along when explicitly requested.
    if namespace is not None:
        opts["gpu_operator_target_namespace"] = namespace

    return PlaybookRun("gpu_operator_bundle_from_commit", opts)
def test_in_cluster(pem_key):
    """
    Test the provided PEM entitlement key on a cluster

    Args:
        pem_key: The PEM entitlement key that should be tested
    """
    entitlement_opts = {"entitlement_pem": pem_key}
    return PlaybookRun("entitlement_test_in_cluster", entitlement_opts)
def test_in_podman(pem_key):
    """
    Test the provided PEM entitlement key with the help of a podman container

    Args:
        pem_key: The PEM entitlement key that should be tested
    """
    entitlement_opts = {"entitlement_pem": pem_key}
    return PlaybookRun("entitlement_test_in_podman", entitlement_opts)
def deploy_cluster_policy():
    """
    Create the ClusterPolicy out of the OLM ClusterServiceVersion
    """
    print("Creating the ClusterPolicy from the CSV")

    policy_opts = {"gpu_operator_deploy_from": "pre-deployed"}
    return PlaybookRun("gpu_operator_deploy_from_operatorhub", policy_opts)
def deploy_from_bundle(bundle, namespace):
    """
    Deploy the GPU Operator out of a bundle

    Args:
        bundle: Either a bundle OCI image or "master" to deploy the latest bundle
        namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen (except 'openshift-operators'). Suggested namespace is: nvidia-gpu-operator.
    """
    playbook_opts = dict(
        gpu_operator_deploy_from="bundle",
        gpu_operator_target_namespace=namespace,
    )

    if bundle != 'master':
        # An explicit bundle image was requested, pass it to the playbook
        playbook_opts["deploy_bundle_image"] = bundle
    else:
        print("Deploying the GPU Operator from OperatorHub using the master bundle")

    return PlaybookRun("gpu_operator_deploy_from_operatorhub", playbook_opts)
def deploy_from_operatorhub(channel=None):
    """
    Deploys the NFD Operator from OperatorHub

    Args:
        channel: Optional OperatorHub channel to deploy from, e.g. 4.7
    """
    opts = {}

    # The channel is only passed to the playbook when explicitly requested.
    if channel is not None:
        opts["nfd_channel"] = channel

    return PlaybookRun("nfd_operator_deploy_from_operatorhub", opts)
def run_gpu_burn(runtime=None):
    """
    Run the GPU burn workload on the cluster

    Args:
        runtime: Duration of the GPU burn run, in seconds
    """
    burn_opts = {} if runtime is None else {"gpu_burn_time": runtime}

    if burn_opts:
        print(f"Running GPU Burn for {runtime} seconds.")

    return PlaybookRun("gpu_operator_run_gpu-burn", burn_opts)
def set_repo_config(repo_file, dest_dir=None):
    """
    Set the yum repo configuration file of the GPU-operator driver

    Args:
        repo_file: Absolute path to the repo file
        dest_dir: The destination dir in the pod where the repo is placed
    """
    repo_opts = {"gpu_operator_set_repo_filename": repo_file}

    if dest_dir is not None:
        repo_opts.update({"gpu_operator_set_repo_destdir": dest_dir})

    return PlaybookRun("gpu_operator_set_repo-config", repo_opts)
def run_e2e_test(git_repo, git_ref):
    """
    Run the e2e test against the given SRO repo and ref

    Args:
        git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
        git_ref: The git ref to deploy from, e.g. master
    """
    return PlaybookRun(
        "sro_run_e2e_test",
        {"sro_git_repo": git_repo, "sro_git_ref": git_ref},
    )
def undeploy_from_commit(git_repo, git_ref):
    """
    Undeploy an SRO-operator previously deployed from a commit

    Args:
        git_repo: The git repository to undeploy, e.g. https://github.com/openshift-psap/special-resource-operator.git
        git_ref: The git ref to undeploy, e.g. master
    """
    return PlaybookRun(
        "sro_undeploy_custom_commit",
        {"sro_git_repo": git_repo, "sro_git_ref": git_ref},
    )
def test_cluster(no_inspect=False): """ Tests the cluster entitlement Args: no_inspect: Do not inspect on failure pem_ca: Deploy <pem_ca> CA PEM key on the cluster """ opts = {} if no_inspect: print("INFO: Inspect on failure disabled.") opts["entitlement_inspect_on_failure"] = "no" return PlaybookRun("entitlement_test", opts)
def deploy(pem, pem_ca=None):
    """
    Deploy a cluster-wide entitlement key & RHSM config file
    (and optionally a YUM repo certificate) by means of
    MachineConfig resources.

    Args:
        pem: Entitlement PEM file
        pem_ca: YUM repo certificate
    """
    entitlement_opts = {"entitlement_pem": pem}

    if pem_ca is not None:
        entitlement_opts.update({"entitlement_repo_ca": pem_ca})

    return PlaybookRun("entitlement_deploy", entitlement_opts)
def prepare_test_alerts(alert_delay=1, alert_prefix="CI"):
    """
    Prepare test alerts derived from the existing GPU Operator alerts.
    The delay of the test alerts is shorter than the one of the default alerts.

    Args:
        alert_delay: Delay (in minutes) before the alerts fire.
        alert_prefix: Prefix prepended to the alert names, so that they can be told apart from the normal alerts.
    """
    return PlaybookRun(
        "gpu_operator_prepare_test_alerts",
        dict(
            gpu_operator_test_alerts_delay=alert_delay,
            gpu_operator_test_alerts_prefix=alert_prefix,
        ),
    )
def deploy_from_commit(git_repo, git_ref, image_tag=None):
    """
    Deploys the NFD operator from the given git commit

    Args:
        git_repo: The git repository to deploy from, e.g. https://github.com/openshift/cluster-nfd-operator.git
        git_ref: The git ref to deploy from, e.g. master
        image_tag: Optional NFD operator image tag UID.
    """
    opts = {
        "nfd_operator_git_repo": git_repo,
        "nfd_operator_git_ref": git_ref,
    }

    # The image tag is only passed to the playbook when explicitly requested.
    if image_tag is not None:
        opts["nfd_operator_image_tag"] = image_tag

    return PlaybookRun("nfd_operator_deploy_custom_commit", opts)
def deploy_from_commit(git_repo, git_ref, image_tag=None):
    """
    Deploy the SRO operator out of the given git commit

    Args:
        git_repo: The git repository to deploy from, e.g. https://github.com/openshift-psap/special-resource-operator.git
        git_ref: The git ref to deploy from, e.g. master
        image_tag: The SRO operator image tag UID.
    """
    sro_opts = dict(sro_git_repo=git_repo, sro_git_ref=git_ref)

    if image_tag is not None:
        sro_opts.update(sro_image_tag=image_tag)

    return PlaybookRun("sro_deploy_custom_commit", sro_opts)
def wait_for_alert(alert_name, alert_active: bool):
    """
    Wait for an alert to be active or inactive.

    Exits with status 1 if alert_active is not a boolean value.

    Args:
        alert_name: The name of the alert to wait for
        alert_active: A boolean telling if the alert should be active or not (true|false)
    """
    # The parameter is advertised as a boolean, so accept real bool values
    # in addition to their textual form (compared case-insensitively).
    if isinstance(alert_active, bool):
        active_flag = "true" if alert_active else "false"
    else:
        active_flag = str(alert_active).strip().lower()

    if active_flag not in ("true", "false"):
        print(f"Unexpected value for alert_active: '{alert_active}'. Expected a boolean (true|false).")
        sys.exit(1)

    opts = {
        "cluster_wait_for_alert_name": alert_name,
        # Always handed over as the lowercase string "true" or "false".
        "cluster_wait_for_alert_active": active_flag,
    }

    return PlaybookRun("cluster_wait_for_alert", opts)
def run_nvidiadl_ssd(node_hostname, namespace="default", pvc_name=None):
    """
    Run the NVIDIA Deep Learning SSD Detection training benchmark.

    Args:
        node_hostname: Hostname of the node on which the ssd benchmark is executed.
        namespace: Name of the namespace where the resources are created.
        pvc_name: Name of the PVC holding the dataset files.
    """
    benchmark_opts = dict(
        benchmarking_node_hostname=node_hostname,
        benchmarking_namespace=namespace,
    )

    if pvc_name is not None:
        benchmark_opts.update(benchmarking_coco_dataset_pvc_name=pvc_name)
        print(f"Using '{pvc_name}' as PVC where the coco dataset is stored.")

    return PlaybookRun("benchmarking_run_nvidiadl_ssd", benchmark_opts)
def deploy_from_operatorhub(namespace, version=None, channel=None, installPlan="Manual"):
    """
    Deploys the GPU operator from OperatorHub

    Exits with status 1 if a channel is given without a version, or if
    the InstallPlan approval mode is neither Manual nor Automatic.

    Args:
        namespace: Namespace in which the GPU Operator will be deployed. Before v1.9, the value must be "openshift-operators". With >=v1.9, the namespace can be freely chosen. Suggested namespace is: nvidia-gpu-operator.
        version: The version to deploy. If unspecified, deploys the latest version available in OperatorHub. Run the toolbox gpu_operator list_version_from_operator_hub subcommand to see the available versions.
        channel: Optional channel to deploy from. May only be specified together with a version.
        installPlan: Optional InstallPlan approval mode (Automatic or Manual [default])
    """
    # Validate all the arguments first, so that no "Deploying ..." message
    # is printed for a request that is going to be refused.
    if channel is not None and version is None:
        print("Channel may only be specified if --version is specified")
        sys.exit(1)

    if installPlan not in ("Manual", "Automatic"):
        print(
            f"InstallPlan can only be Manual or Automatic. Received '{installPlan}'."
        )
        sys.exit(1)

    opts = {"gpu_operator_target_namespace": namespace}

    if version is not None:
        opts["gpu_operator_operatorhub_version"] = version
        print(
            f"Deploying the GPU Operator from OperatorHub using version '{version}'."
        )

    if channel is not None:
        opts["gpu_operator_operatorhub_channel"] = channel
        print(
            f"Deploying the GPU Operator from OperatorHub using channel '{channel}'."
        )

    opts["gpu_operator_installplan_approval"] = installPlan
    print(
        f"Deploying the GPU Operator from OperatorHub using InstallPlan approval '{installPlan}'."
    )

    # NOTE: no "master bundle" message here; this command deploys from
    # OperatorHub and the options above already describe what is deployed.
    return PlaybookRun("gpu_operator_deploy_from_operatorhub", opts)
def deploy(ci_command, git_repository, git_reference, tag_uid=None):
    """
    Run the given CI command

    Args:
        ci_command: The CI command to run, for example "run gpu-ci"
        git_repository: The git repository to run the command from, e.g. https://github.com/openshift-psap/ci-artifacts.git
        git_reference: The git ref to run the command from, e.g. master
        tag_uid: The local CI image tag UID
    """
    # Fall back to a random tag UID when the caller did not provide one
    image_tag_uid = secrets.token_hex(4) if tag_uid is None else tag_uid

    # The command itself travels through the environment, not the options
    os.environ["LOCAL_CI_COMMAND"] = ci_command

    return PlaybookRun(
        "local-ci_deploy",
        {
            "local_ci_git_repo": git_repository,
            "local_ci_git_ref": git_reference,
            "local_ci_image_tag_uid": image_tag_uid,
        },
    )
def set_scale(instance_type, scale, base_machineset=None, force=False):
    """
    Ensures that the cluster has exactly `scale` nodes with instance_type `instance_type`

    If the machinesets of the given instance type already have the required total number of replicas,
    their replica parameters will not be modified.
    Otherwise,
    - If there's only one machineset with the given instance type, its replicas will be set to the value of this parameter.
    - If there are other machinesets with non-zero replicas, the playbook will fail, unless the 'force_scale' parameter is
    set to true. In that case, the number of replicas of the other machinesets will be zeroed before setting the replicas
    of the first machineset to the value of this parameter.
    - If `--base-machineset=machineset` flag is passed, `machineset` machineset will be used to derive the new
    machineset (otherwise, the first machineset of the listing will be used). This is useful if the desired `instance_type`
    is only available in some specific regions and, controlled by different machinesets.

    Example: ./run_toolbox.py cluster set_scale g4dn.xlarge 1 # ensure that the cluster has 1 GPU node

    Args:
        instance_type: The instance type to use, for example, g4dn.xlarge
        scale: The number of required nodes with given instance type
        base_machineset: Name of a machineset to use to derive the new one. Default: pickup the first machineset found in `oc get machinesets -n openshift-machine-api`.
        force: Optional flag setting the 'force_scale' parameter described above to true (default: false)
    """
    opts = {
        "machineset_instance_type": instance_type,
        "scale": scale,
    }

    if base_machineset is not None:
        opts["base_machineset"] = base_machineset

    # 'force_scale' is only set when forcing was requested, and is passed
    # as the string "true".
    if force:
        opts["force_scale"] = "true"

    return PlaybookRun("cluster_set_scale", opts)
def wait():
    """
    Wait until the entitlement is deployed
    """
    playbook_name = "entitlement_wait"
    return PlaybookRun(playbook_name)
def undeploy_from_commit():
    """
    Undeploy a GPU-operator previously deployed from a commit
    """
    playbook_name = "gpu_operator_undeploy_custom_commit"
    return PlaybookRun(playbook_name)
def undeploy_from_operatorhub():
    """
    Undeploy a GPU-operator previously deployed from OperatorHub
    """
    playbook_name = "gpu_operator_undeploy_from_operatorhub"
    return PlaybookRun(playbook_name)
def cleanup_bundle_from_commit():
    """
    Clean up the resources left over after building a bundle from a commit
    """
    playbook_name = "gpu_operator_cleanup_bundle_from_commit"
    return PlaybookRun(playbook_name)
def capture_deployment_state():
    """
    Capture the deployment state of the GPU operator
    """
    playbook_name = "gpu_operator_capture-deployment-state"
    return PlaybookRun(playbook_name)
def wait_deployment():
    """
    Wait until the GPU operator is deployed
    """
    playbook_name = "gpu_operator_wait_deployment"
    return PlaybookRun(playbook_name)