import logging
import time

import boto3
from assertpy import assert_that

# TorqueCommands, assert_scaling_worked, watch_compute_nodes and minutes come from
# the integration-test suite's common helper modules; the _assert_*/_get_* helpers
# are defined elsewhere in this module.


def _test_job_arrays_and_parallel_jobs(remote_command_executor, region,
                                       stack_name, scaledown_idletime,
                                       max_slots):
    logging.info(
        "Testing cluster scales correctly with array jobs and parallel jobs")
    torque_commands = TorqueCommands(remote_command_executor)

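    # Submit an array job (-t 1-max_slots) of single-slot tasks; together they fill exactly one node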
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 30' | qsub -t 1-{0}".format(max_slots),
        raise_on_error=False)
    array_job_id = torque_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command(
        "echo 'sleep 30' | qsub -l nodes=2:ppn=1", raise_on_error=False)
    parallel_job_id = torque_commands.assert_job_submitted(result.stdout)

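    # Expected peak: 1 node for the single-slot array tasks + 2 nodes for the parallel job = 3 nodes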
    # Assert scaling worked as expected
    assert_scaling_worked(torque_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=3,
                          expected_final=0)
    # Assert jobs were completed
    for i in range(1, max_slots + 1):
        _assert_job_completed(remote_command_executor,
                              array_job_id.replace("[]", "[{0}]".format(i)))
    _assert_job_completed(remote_command_executor, parallel_job_id)


def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    torque_commands = TorqueCommands(remote_command_executor)
    result = torque_commands.submit_command("sleep 60", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -W depend=afterok:{0}".format(job_id),
        raise_on_error=False)
    dependent_job_id = torque_commands.assert_job_submitted(result.stdout)

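    # The dependent job must stay on hold ("H") until the job it depends on completes successfully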
    assert_that(_get_job_state(remote_command_executor,
                               dependent_job_id)).is_equal_to("H")

    # Assert scaling worked as expected
    assert_scaling_worked(torque_commands,
                          region,
                          stack_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)


def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots,
                            region, cluster, scaledown_idletime):
    logging.info("Testing jobs that violate scheduling requirements")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly
    if torque_commands.compute_nodes_count() == 0:
        result = torque_commands.submit_command("sleep 1")
        job_id = torque_commands.assert_job_submitted(result.stdout)
        torque_commands.wait_job_completed(job_id)
    assert_that(torque_commands.compute_nodes_count()).is_greater_than(0)

    logging.info(
        "Testing cluster doesn't scale when job requires a capacity that is higher than the max available"
    )
    # nodes limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes={0}".format(max_queue_size + 1),
        raise_on_error=False)
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ppn limit enforced by daemons
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format(max_slots + 1),
        raise_on_error=False)
    ppn_job_id = torque_commands.assert_job_submitted(result.stdout)
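    # This ppn job is accepted (the limit is only enforced by the node daemons) but can never start, so it stays queued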
    # ppn total limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format((max_slots *
                                                              max_queue_size) +
                                                             1),
        raise_on_error=False)
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ncpus limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l ncpus={0}".format(max_slots + 1),
        raise_on_error=False)
    assert_that(result.stdout).contains("Job exceeds queue resource limits")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1 -h", raise_on_error=False)
    hold_job_id = torque_commands.assert_job_submitted(result.stdout)

    logging.info(
        "Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(torque_commands,
                          region,
                          cluster.cfn_name,
                          scaledown_idletime,
                          expected_max=1,
                          expected_final=0)
    # Assert jobs are still pending
    assert_that(_get_job_state(remote_command_executor,
                               ppn_job_id)).is_equal_to("Q")
    assert_that(_get_job_state(remote_command_executor,
                               hold_job_id)).is_equal_to("H")


def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, asg_name):
    logging.info("Testing cluster limits are dynamically updated")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure cluster is scaled to 0 when this test starts
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)

    # Submit a job to scale up to 1 node
    result = torque_commands.submit_command("sleep 1", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)
    # Change ASG max size
    asg_client = boto3.client("autoscaling", region_name=region)
    new_max_size = max_queue_size + 1
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size)
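    # The scheduler limits checked below are derived from the ASG MaxSize, so they should follow this update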
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # Wait for job completion to be sure cluster scaled
    torque_commands.wait_job_completed(job_id)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, new_max_size)

    # Restore initial cluster size
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=max_queue_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)


def _test_jobs_executed_concurrently(remote_command_executor, max_slots):
    logging.info("Testing jobs are executed concurrently and nodes are fully allocated")
    torque_commands = TorqueCommands(remote_command_executor)

    # GIVEN: a cluster with 3 free nodes
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(3)

    # WHEN: 3 * max_slots single-slot jobs are submitted, enough to fill every slot on the 3 nodes
    jobs_start_time = int(remote_command_executor.run_remote_command("date +%s").stdout)
    job_exec_time = 30
    job_ids = []
    for i in range(0, 3 * max_slots):
        result = torque_commands.submit_command(
            f"sleep {job_exec_time} && hostname > /shared/job{i} && date +%s >> /shared/end_time", nodes=1, slots=1
        )
        job_id = torque_commands.assert_job_submitted(result.stdout)
        job_ids.append(job_id)

    # THEN: cluster scales down correctly after completion
    watch_compute_nodes(torque_commands, minutes(10), 0)
    for job_id in job_ids:
        _assert_job_completed(remote_command_executor, job_id)

    # THEN: each host executes max_slots jobs in the expected time
    jobs_to_hosts_count = (
        remote_command_executor.run_remote_command("cat /shared/job* | sort | uniq -c | awk '{print $1}'")
        .stdout.strip()
        .splitlines()
    )
    assert_that(jobs_to_hosts_count).is_equal_to(["4", "4", "4"])
    # verify execution time: if the jobs ran concurrently, the last one finishes well within 2x a single job's runtime
    jobs_completion_time = int(
        remote_command_executor.run_remote_command("cat /shared/end_time | sort -n | tail -1").stdout.split()[-1]
    )
    assert_that(jobs_completion_time - jobs_start_time).is_greater_than(0).is_less_than(2 * job_exec_time)


def _test_torque_job_submit(remote_command_executor, test_datadir):
    """Test torque job submit command in slurm cluster."""
    logging.info("Testing cluster submits job by torque command")
    torque_commands = TorqueCommands(remote_command_executor)
    result = torque_commands.submit_script(str(test_datadir / "torque_job.sh"))
    torque_commands.assert_job_submitted(result.stdout)