Ejemplo n.º 1
0
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # These guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
Ejemplo n.º 2
0
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
Ejemplo n.º 3
0
def assert_scaling_worked(
    scheduler_commands,
    region,
    stack_name,
    scaledown_idletime,
    expected_max,
    expected_final,
    assert_asg=True,
    assert_scheduler=True,
):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(max_scaledown_time),
    )

    with soft_assertions():
        if assert_asg:
            asg_capacity_time_series_str = f"asg_capacity_time_series={asg_capacity_time_series}"
            assert_that(max(asg_capacity_time_series)).described_as(
                asg_capacity_time_series_str).is_equal_to(expected_max)
            assert_that(asg_capacity_time_series[-1]).described_as(
                asg_capacity_time_series_str).is_equal_to(expected_final)
        if assert_scheduler:
            compute_nodes_time_series_str = f"compute_nodes_time_series={compute_nodes_time_series}"
            assert_that(max(compute_nodes_time_series)).described_as(
                compute_nodes_time_series_str).is_equal_to(expected_max)
            assert_that(compute_nodes_time_series[-1]).described_as(
                compute_nodes_time_series_str).is_equal_to(expected_final)
Ejemplo n.º 4
0
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader,
                                  clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # These guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    result = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    retry(wait_fixed=seconds(30),
          stop_max_delay=seconds(500))(_assert_job_state)(
              scheduler_commands, job_id, job_state="COMPLETED")

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir /
                                              "cluster-check.sh",
                                              args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    ec2_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(5),
    )

    logging.info(
        "Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor,
                                max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Ejemplo n.º 5
0
def assert_scaling_worked(scheduler_commands, region, stack_name,
                          scaledown_idletime, expected_max, expected_final):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(max_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max)
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final)
Ejemplo n.º 6
0
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)