Example 1
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime, max_queue_size):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(slurm_commands.get_job_info(job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains(
        "JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(
        _retrieve_slurm_compute_nodes_from_config(
            remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
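
# Note: _assert_job_completed, _assert_dummy_nodes and
# _retrieve_slurm_compute_nodes_from_config are helpers not shown in this
# excerpt. A minimal sketch of _assert_job_completed is included below for
# reference; it assumes the same names used above (SlurmCommands, assert_that)
# and that scontrol still reports the finished job, so it is an illustration
# rather than the suite's actual implementation.
def _assert_job_completed(remote_command_executor, job_id):
    """Hypothetical sketch: assert that the given Slurm job reached the COMPLETED state."""
    slurm_commands = SlurmCommands(remote_command_executor)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("JobState=COMPLETED")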
Example 2
def _gpu_resource_check(remote_command_executor):
    """Test GPU related resources are correctly allocated."""
    logging.info("Testing number of GPU/CPU resources allocated to job")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command(
        "sbatch -G 1 --cpus-per-gpu 5 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerJob=gpu:1", "CpusPerTres=gpu:5")

    result = remote_command_executor.run_remote_command(
        "sbatch --gres=gpu:2 --cpus-per-gpu 6 --wrap='sleep 1'")
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    job_info = slurm_commands.get_job_info(job_id)
    assert_that(job_info).contains("TresPerNode=gpu:2", "CpusPerTres=gpu:6")
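
# The two checks above follow the same submit-then-inspect pattern. The helper
# below is not part of the original example; it is a hedged generalization
# showing how the sbatch GRES/GPU flags map to the TRES fields reported in the
# job info. Its name and signature are illustrative only.
def _submit_and_check_tres(remote_command_executor, sbatch_options, expected_fields):
    """Hypothetical sketch: submit a short job with the given options and assert on its TRES fields."""
    slurm_commands = SlurmCommands(remote_command_executor)
    result = remote_command_executor.run_remote_command(
        "sbatch {0} --wrap='sleep 1'".format(sbatch_options))
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    assert_that(slurm_commands.get_job_info(job_id)).contains(*expected_fields)

# Example usage, equivalent to the first check in _gpu_resource_check:
# _submit_and_check_tres(remote_command_executor, "-G 1 --cpus-per-gpu 5",
#                        ["TresPerJob=gpu:1", "CpusPerTres=gpu:5"])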
Example 3
def test_update_hit(region, scheduler, pcluster_config_reader,
                    clusters_factory, test_datadir, s3_bucket_factory):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "preinstall.sh"),
                       "scripts/preinstall.sh")
    bucket.upload_file(str(test_datadir / "postinstall.sh"),
                       "scripts/postinstall.sh")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update the cluster with the same configuration; the command should not result in any errors even if a forced update is not used
    cluster.config_file = str(init_config_file)
    cluster.update(force=True)

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1_i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.ini",
        bucket=bucket_name,
        resource_bucket=bucket_name)
    cluster.config_file = str(updated_config_file)
    cluster.update()

    # Here is the expected list of nodes. Note that queue1-dy-t2micro-1 comes from the initial_count set when creating
    # the cluster:
    # queue1-dy-t2micro-1
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 1, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1_i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1_i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances":
                    1,  # This comes from initial_count before update
                    "expected_power_saved_instances": 9,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                },
            },
            "compute_type": "ondemand",
        },
        "queue3": {
            "compute_resources": {
                "queue3_i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3_i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    updated_config = configparser.ConfigParser()
    updated_config.read(updated_config_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_resource"))
    check_s3_read_write_resource(
        region, cluster,
        updated_config.get("cluster default", "s3_read_write_resource"))

    # Check new Additional IAM policies
    _check_role_attached_policy(
        region, cluster,
        updated_config.get("cluster default", "additional_iam_policies"))

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")
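
# _check_role_attached_policy is referenced above but not shown. The sketch
# below illustrates one way such a check could be implemented with boto3; it
# assumes the cluster object exposes its CloudFormation stack name as
# cluster.cfn_name (an assumption) and is not the suite's actual helper.
def _check_role_attached_policy(region, cluster, policy_arn):
    """Hypothetical sketch: verify policy_arn is attached to the IAM roles created by the cluster stack."""
    cfn_client = boto3.client("cloudformation", region_name=region)
    iam_client = boto3.client("iam", region_name=region)
    # Collect the IAM roles created by the cluster's CloudFormation stack
    resources = cfn_client.describe_stack_resources(
        StackName=cluster.cfn_name)["StackResources"]
    role_names = [
        resource["PhysicalResourceId"] for resource in resources
        if resource["ResourceType"] == "AWS::IAM::Role"
    ]
    # Gather every policy ARN attached to those roles and assert the expected one is present
    attached_arns = []
    for role_name in role_names:
        policies = iam_client.list_attached_role_policies(
            RoleName=role_name)["AttachedPolicies"]
        attached_arns.extend(policy["PolicyArn"] for policy in policies)
    assert_that(attached_arns).contains(policy_arn)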
Example 4
def test_update_slurm(region, pcluster_config_reader, s3_bucket_factory,
                      clusters_factory, test_datadir):
    # Create S3 bucket for pre/post install scripts
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    for script in [
            "preinstall.sh", "postinstall.sh", "updated_preinstall.sh",
            "updated_postinstall.sh"
    ]:
        bucket.upload_file(str(test_datadir / script), f"scripts/{script}")

    # Create cluster with initial configuration
    init_config_file = pcluster_config_reader(resource_bucket=bucket_name,
                                              bucket=bucket_name)
    cluster = clusters_factory(init_config_file)

    # Update the cluster with the same configuration; the command should not result in any errors even if a forced update is not used
    cluster.update(str(init_config_file), force_update="true")

    # Command executors
    command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(command_executor)

    # Create shared dir for script results
    command_executor.run_remote_command("mkdir -p /shared/script_results")

    initial_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 1,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
                "queue1-i2": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 1,
                    "expected_power_saved_instances": 9,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                },
            },
            "compute_type": "ondemand",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "enable_efa": False,
                    "disable_hyperthreading": False,
                }
            },
            "compute_type": "ondemand",
        },
    }

    _assert_scheduler_nodes(queues_config=initial_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=initial_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Submit a job in queue1 to verify that the original pre/post-install scripts were executed
    initial_compute_nodes = slurm_commands.get_compute_nodes(
        filter_by_partition="queue1")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "preinstall", "QWE")
    _check_script(command_executor, slurm_commands, initial_compute_nodes[0],
                  "postinstall", "RTY")

    # Submit a job in order to verify that jobs are not affected by an update of the queue size
    result = slurm_commands.submit_command("sleep infinity",
                                           constraint="static&c5.xlarge")
    job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Update cluster with new configuration
    additional_policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonAppStreamServiceAccess"
    updated_config_file = pcluster_config_reader(
        config_file="pcluster.config.update.yaml",
        bucket=bucket_name,
        resource_bucket=bucket_name,
        additional_policy_arn=additional_policy_arn,
    )
    cluster.update(str(updated_config_file), force_update="true")

    # Here is the expected list of nodes:
    # queue1-st-c5xlarge-1
    # queue1-st-c5xlarge-2
    assert_initial_conditions(slurm_commands, 2, 0, partition="queue1")

    updated_queues_config = {
        "queue1": {
            "compute_resources": {
                "queue1-i1": {
                    "instance_type": "c5.xlarge",
                    "expected_running_instances": 2,
                    "expected_power_saved_instances": 2,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i2": {
                    "instance_type": "c5.2xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
                "queue1-i3": {
                    "instance_type": "t2.micro",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "spot",
        },
        "queue2": {
            "compute_resources": {
                "queue2-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 1,
                    "enable_efa": True,
                    "disable_hyperthreading": True,
                }
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
        "queue3": {
            "compute_resources": {
                "queue3-i1": {
                    "instance_type": "c5n.18xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": True,
                    "enable_efa": True,
                },
                "queue3-i2": {
                    "instance_type": "t2.xlarge",
                    "expected_running_instances": 0,
                    "expected_power_saved_instances": 10,
                    "disable_hyperthreading": False,
                    "enable_efa": False,
                },
            },
            "compute_type": "ondemand",
            "networking": {
                "placement_group": {
                    "enabled": False
                }
            },
        },
    }

    _assert_scheduler_nodes(queues_config=updated_queues_config,
                            slurm_commands=slurm_commands)
    _assert_launch_templates_config(queues_config=updated_queues_config,
                                    cluster_name=cluster.name,
                                    region=region)

    # Read updated configuration
    with open(updated_config_file, encoding="utf-8") as conf_file:
        updated_config = yaml.safe_load(conf_file)

    # Check new S3 resources
    check_s3_read_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=False))
    check_s3_read_write_resource(
        region, cluster,
        get_policy_resources(updated_config, enable_write_access=True))

    # Check new Additional IAM policies
    _check_role_attached_policy(region, cluster, additional_policy_arn)

    # Assert that the job submitted before the update is still running
    assert_that(
        slurm_commands.get_job_info(job_id)).contains("JobState=RUNNING")

    _check_volume(cluster, updated_config, region)

    # Launch a new instance for queue1 and test updated pre/post-install script execution and the extra JSON update
    # Add a new dynamic t2.micro node to queue1-i3
    new_compute_node = _add_compute_nodes(slurm_commands, "queue1", "t2.micro")

    assert_that(len(new_compute_node)).is_equal_to(1)
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_preinstall", "ABC")
    _check_script(command_executor, slurm_commands, new_compute_node[0],
                  "updated_postinstall", "DEF")

    # Check the new extra JSON
    _check_extra_json(command_executor, slurm_commands, new_compute_node[0],
                      "test_value")