Example #1
def _gpu_test_scaleup(remote_command_executor, region, asg_name, stack_name,
                      scaledown_idletime, num_gpus):
    """Test cluster is scaling up correctly and GPU jobs are not aborted on slurmctld restart."""
    logging.info("Testing cluster scales correctly with GPU jobs")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)
    # g3.8xlarge has 32 vcpus and 2 GPUs, hardcoding tests for g3.8xlarge
    job_ids = []

    # sbatch --wrap 'sleep 10' -G 3
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           other_options="-G 3")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}]

    # sbatch --wrap 'sleep 10' --cpus-per-gpu=10 --gpus-per-task=1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=-1,
        other_options="--cpus-per-gpu=10 --gpus-per-task=1")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}]

    # sbatch --wrap 'sleep 10' -N 1 --gpus-per-node=1 -c 23 -n 1
    result = slurm_commands.submit_command(
        command="sleep 10",
        nodes=1,
        slots=1,
        other_options="--gpus-per-node=1 -c 23")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # sbatch --wrap 'sleep 10' -c 31 -n 1
    result = slurm_commands.submit_command(command="sleep 10",
                                           nodes=-1,
                                           slots=1,
                                           other_options="-c 31")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:0, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:19, gpu:1}]

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=4,
                          expected_final=0)
    # Assert jobs were completed
    for job_id in job_ids:
        slurm_commands.assert_job_succeeded(job_id)
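
SlurmCommands and the assertion helpers come from the ParallelCluster integration-test framework and are not shown on this page. As a rough sketch of how submit_command could translate the arguments used above into an sbatch line (the helper name and body below are assumptions, not the framework's code):

def _build_sbatch_command(command, nodes=-1, slots=None, other_options=None):
    # Hypothetical stand-in for SlurmCommands.submit_command: wrap the shell
    # command with --wrap and only emit -N when a node count is requested
    # (nodes=-1 is used above to let Slurm pick the nodes).
    sbatch_command = "sbatch --wrap='{0}'".format(command)
    if nodes > 0:
        sbatch_command += " -N {0}".format(nodes)
    if slots:
        sbatch_command += " -n {0}".format(slots)
    if other_options:
        sbatch_command += " {0}".format(other_options)
    return sbatch_command

# _build_sbatch_command("sleep 10", other_options="-G 3")
# -> "sbatch --wrap='sleep 10' -G 3"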
Example #2
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime, max_queue_size):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(slurm_commands.get_job_info(job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains(
        "JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(
        _retrieve_slurm_compute_nodes_from_config(
            remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
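
_assert_job_completed is defined elsewhere in the test module. A minimal sketch, assuming it inspects the job's final state through Slurm accounting (sacct):

from assertpy import assert_that


def _assert_job_completed(remote_command_executor, job_id):
    # Assumed implementation: query sacct for the job's state and expect the
    # accounting record to report COMPLETED.
    result = remote_command_executor.run_remote_command(
        "sacct --format=state --noheader -j {0}".format(job_id))
    assert_that(result.stdout).contains("COMPLETED")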
Example #3
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
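
_get_job_info is another helper from the test module; it most likely shells out to scontrol and returns the one-line job record that the assertions above grep (a sketch, not the verified implementation):

def _get_job_info(remote_command_executor, job_id):
    # Print the job record as a single "Key=Value ..." line so callers can
    # check fields such as JobState and Reason with a substring match.
    return remote_command_executor.run_remote_command(
        "scontrol show jobs -o {0}".format(job_id)).stdout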
Example #4
def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size):
    logging.info("Testing dummy nodes are automatically reconfigured based on actual compute nodes")
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1)
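
_assert_dummy_nodes is not shown here. In the older ParallelCluster scaling model these tests exercise, placeholder ("dummy") node entries are kept in the Slurm configuration so jobs up to the maximum queue size can be queued. A hedged sketch of the assertion, assuming the placeholder nodes are recognizable by name:

from assertpy import assert_that


def _assert_dummy_nodes(remote_command_executor, expected_count):
    # Assumption: placeholder nodes carry "dummy" in their node name; count
    # them in node-oriented sinfo output and compare with the expected size.
    result = remote_command_executor.run_remote_command("sinfo -N --noheader")
    dummy_nodes = [line for line in result.stdout.splitlines() if "dummy" in line]
    assert_that(len(dummy_nodes)).is_equal_to(expected_count)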
Example #6
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #7
def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_factory, s3_bucket_factory, test_datadir):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    """
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")
    cluster_config = pcluster_config_reader(bucket_name=bucket_name)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # submit a job to spin up a compute node that will fail due to post_install script
    slurm_commands = SlurmCommands(remote_command_executor)
    slurm_commands.submit_command("sleep 1")
    instance_id = wait_compute_log(remote_command_executor)

    # extract logs and check one of them
    _assert_compute_logs(remote_command_executor, instance_id)

    # check that instance got already replaced or is marked as Unhealthy
    assert_instance_replaced_or_terminating(instance_id, region)
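
assert_instance_replaced_or_terminating comes from the shared test utilities. One plausible check, sketched with boto3 (the real helper may also look at the Auto Scaling health status):

import boto3
from assertpy import assert_that


def assert_instance_replaced_or_terminating(instance_id, region):
    # The broken compute instance should already be on its way out once the
    # node is flagged unhealthy; terminated instances stay visible to
    # describe_instances for a while, so the id can still be queried.
    ec2_client = boto3.client("ec2", region_name=region)
    state = ec2_client.describe_instances(InstanceIds=[instance_id])[
        "Reservations"][0]["Instances"][0]["State"]["Name"]
    assert_that(state).is_in("shutting-down", "terminated")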
Example #8
def _gpu_test_cluster_limits(remote_command_executor, max_queue_size,
                             num_gpus):
    """Test edge cases regarding the number of GPUs."""
    logging.info(
        "Testing scheduler does not accept jobs when requesting more GPUs than available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Expect commands below to fail with exit 1
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gpus-per-task {0}".format(num_gpus +
                                                                  1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -N 1 --wrap='sleep 1' --gres=gpu:{0}".format(num_gpus + 1))
    _submit_and_assert_job_rejected_node_config(
        remote_command_executor,
        "sbatch -G {0} --wrap='sleep 1'".format(num_gpus * max_queue_size + 1))

    # Commands below should be correctly submitted
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=1,
        slots=num_gpus,
        other_options="-G {0} --gpus-per-task=1".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    result = slurm_commands.submit_command(
        "sleep 1", nodes=1, other_options="--gres=gpu:{0}".format(num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
    # Submit job without the '-N' option (nodes=-1)
    result = slurm_commands.submit_command(
        "sleep 1",
        nodes=-1,
        other_options="-G {0} --gpus-per-node={1}".format(
            num_gpus * max_queue_size, num_gpus))
    slurm_commands.assert_job_submitted(result.stdout)
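
_submit_and_assert_job_rejected_node_config is defined elsewhere. A sketch of what it presumably does, assuming the remote executor accepts a raise_on_error flag and exposes stderr on its result:

from assertpy import assert_that


def _submit_and_assert_job_rejected_node_config(remote_command_executor, command):
    # The sbatch submission must fail; Slurm reports an unsatisfiable request
    # with "Requested node configuration is not available". Depending on the
    # executor the message may land on stdout or stderr, so check both.
    result = remote_command_executor.run_remote_command(command, raise_on_error=False)
    assert_that(result.stdout + result.stderr).contains(
        "Requested node configuration is not available")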
Example #9
def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name):
    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1)
    max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5")
    max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Check we are not scaling
    time.sleep(60)
    assert_asg_desired_capacity(region, asg_name, expected=0)
    assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains(
        "JobState=PENDING Reason=PartitionNodeLimit"
    )
    assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
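
assert_asg_desired_capacity mirrors the inline boto3 check shown in Example #6; as a wrapper it might look like this (the real helper's body may differ):

import boto3
from assertpy import assert_that


def assert_asg_desired_capacity(region, asg_name, expected):
    # Same check as the inline version above: the ASG must not have scaled.
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(expected)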
Example #10
def _test_dynamic_dummy_nodes(remote_command_executor,
                              region,
                              asg_name,
                              max_queue_size,
                              slots=4,
                              gpus=0):
    logging.info(
        "Testing dummy nodes are automatically reconfigured based on actual compute nodes"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    _assert_dummy_nodes(remote_command_executor, max_queue_size, slots, gpus)
    result = slurm_commands.submit_command("sleep 1", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    _assert_dummy_nodes(remote_command_executor, max_queue_size - 1, slots,
                        gpus)
Example #11
def _test_cluster_limits(remote_command_executor, max_queue_size, region,
                         asg_name):
    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(
        remote_command_executor,
        job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")

    # Check we are not scaling
    time.sleep(60)
    asg_client = boto3.client("autoscaling", region_name=region)
    asg = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
    assert_that(asg.get("DesiredCapacity")).is_equal_to(0)
Example #12
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
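
get_compute_nodes_allocation and minutes() belong to the shared monitoring utilities. A simplified sketch of the sampling loop, assuming minutes() converts to seconds and that the ASG name is already known (the real helper resolves it from the stack name and samples more carefully):

import time

import boto3


def _sample_allocation(slurm_commands, region, asg_name, max_monitoring_time, interval=60):
    # Poll the ASG desired capacity and the number of compute nodes known to
    # Slurm until the monitoring window elapses; callers then assert on the
    # maximum and final values of each series.
    asg_client = boto3.client("autoscaling", region_name=region)
    asg_capacity_time_series, compute_nodes_time_series, timestamps = [], [], []
    deadline = time.time() + max_monitoring_time
    while time.time() < deadline:
        asg = asg_client.describe_auto_scaling_groups(
            AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
        asg_capacity_time_series.append(asg.get("DesiredCapacity"))
        compute_nodes_time_series.append(len(slurm_commands.get_compute_nodes()))
        timestamps.append(time.time())
        time.sleep(interval)
    return asg_capacity_time_series, compute_nodes_time_series, timestamps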
Example #13
def test_update_hit(region, scheduler, pcluster_config_reader,
                    clusters_factory, test_datadir, s3_bucket_factory):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "preinstall.sh"),
                       "scripts/preinstall.sh")
    bucket.upload_file(str(test_datadir / "postinstall.sh"),
                       "scripts/postinstall.sh")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1_i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini",
        bucket=bucket_name,
        resource_bucket=bucket_name)
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Here is the expected list of nodes. Note that queue1-dy-t2micro-1 comes from the initial_count set when creating
    # the cluster:
    # queue1-dy-t2micro-1
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 1, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances":
                    1,  # This comes from initial_count before update
                    "expected_power_saved_instances": 9,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                },
            },
            "compute_type": "ondemand",
        },
        "queue3": {
            "compute_resources": {
                "queue3_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3_i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_resource"))
    check_s3_read_write_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_write_resource"))

    # Check new Additional IAM policies
    _check_role_attached_policy(
        region, cluster,
        updated_config.get("cluster default", "additional_iam_policies"))

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")
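
_check_role_attached_policy is part of the test module. How it resolves the cluster's instance role is framework-specific; a hypothetical building block it could rely on, using the IAM API:

import boto3
from assertpy import assert_that


def _assert_policy_attached_to_role(role_name, policy_arn, region):
    # Hypothetical helper: verify a managed policy is attached to the given
    # IAM role. Deriving role_name from the cluster object is omitted here.
    iam_client = boto3.client("iam", region_name=region)
    attached = iam_client.list_attached_role_policies(
        RoleName=role_name)["AttachedPolicies"]
    assert_that([policy["PolicyArn"] for policy in attached]).contains(policy_arn)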
Example #14
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory,
                      clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in [
            "preinstall.sh", "postinstall.sh", "updated_preinstall.sh",
            "updated_postinstall.sh"
    ]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name,
                                              bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update cluster with the same configuration; the command should not result in any error even if not using force update
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # submit job in queue1 to verify original pre/post-install script execution
    initial_compute_nodes = slurm_commands.get_compute_nodes(
        filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post install script execution and extra json update
    # Add a new dynamic node t2.micro to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_postinstall", "DEF")

    # check new extra json
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0],
                      "test_value")