def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime, max_slots):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    sge_commands = SgeCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -t 1-5", raise_on_error=False)
    array_job_id = sge_commands.assert_job_submitted(result.stdout, is_array=True)

    result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -pe mpi 4", raise_on_error=False)
    parallel_job_id = sge_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    # 5 array tasks + 4 parallel slots, packed into nodes exposing max_slots slots each
    expected_max = math.ceil(float(5 + 4) / max_slots)
    assert_scaling_worked(
        sge_commands, region, stack_name, scaledown_idletime, expected_max=expected_max, expected_final=0
    )
    # Assert jobs were completed
    sge_commands.assert_job_succeeded(array_job_id)
    sge_commands.assert_job_succeeded(parallel_job_id)

def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime, max_queue_size):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(slurm_commands.get_job_info(job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0)
    # Assert scheduler configuration is correct
    _assert_dummy_nodes(remote_command_executor, max_queue_size)
    assert_that(_retrieve_slurm_compute_nodes_from_config(remote_command_executor)).is_empty()
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)

def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    sge_commands = SgeCommands(remote_command_executor)

    result = sge_commands.submit_command("sleep 60", nodes=1)
    job_id = sge_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -hold_jid {0}".format(job_id), raise_on_error=False
    )
    dependent_job_id = sge_commands.assert_job_submitted(result.stdout)
    assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("hqw")

    # Assert scaling worked as expected
    assert_scaling_worked(sge_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0)
    # Assert jobs were completed
    sge_commands.assert_job_succeeded(job_id)
    sge_commands.assert_job_succeeded(dependent_job_id)

def assert_overscaling_when_job_submitted_during_scaledown(
    remote_command_executor, scheduler, region, stack_name, scaledown_idletime
):
    """Test that if a job gets submitted when a node is locked the cluster does not overscale."""
    logging.info("Testing cluster does not overscale when a job is submitted and a node is being terminated.")
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    if scheduler_commands.compute_nodes_count() == 0:
        result = scheduler_commands.submit_command("sleep 1")
        job_id = scheduler_commands.assert_job_submitted(result.stdout)
        scheduler_commands.wait_job_completed(job_id)
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(1)
    scheduler_commands.wait_for_locked_node()

    result = scheduler_commands.submit_command("sleep 1")
    scheduler_commands.assert_job_submitted(result.stdout)

    # do not check scheduler scaling but only ASG
    assert_scaling_worked(
        scheduler_commands,
        region,
        stack_name,
        scaledown_idletime,
        expected_max=1,
        expected_final=0,
        assert_scheduler=False,
    )

def _test_job_dependencies(slurm_commands, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    job_id = slurm_commands.submit_command_and_assert_job_accepted(
        submit_command_args={"command": "sleep 60", "nodes": 1}
    )
    dependent_job_id = slurm_commands.submit_command_and_assert_job_accepted(
        submit_command_args={"command": "sleep 1", "nodes": 1, "after_ok": job_id}
    )

    # Wait for reason to be computed
    time.sleep(3)
    # Job should be in CF and waiting for nodes to power_up
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState=CONFIGURING")
    assert_that(slurm_commands.get_job_info(dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(slurm_commands, job_id)
    _assert_job_completed(slurm_commands, dependent_job_id)

def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime, max_slots):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    torque_commands = TorqueCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command(
        "echo 'sleep 30' | qsub -t 1-{0}".format(max_slots), raise_on_error=False
    )
    array_job_id = torque_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command("echo 'sleep 30' | qsub -l nodes=2:ppn=1", raise_on_error=False)
    parallel_job_id = torque_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(torque_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    for i in range(1, max_slots + 1):
        _assert_job_completed(remote_command_executor, array_job_id.replace("[]", "[{0}]".format(i)))
    _assert_job_completed(remote_command_executor, parallel_job_id)

def _gpu_test_scaleup(remote_command_executor, region, asg_name, stack_name, scaledown_idletime, num_gpus):
    """Test cluster is scaling up correctly and GPU jobs are not aborted on slurmctld restart."""
    logging.info("Testing cluster scales correctly with GPU jobs")
    slurm_commands = SlurmCommands(remote_command_executor)
    # Assert initial conditions
    _assert_asg_has_no_node(region, asg_name)
    _assert_no_nodes_in_scheduler(slurm_commands)

    # g3.8xlarge has 32 vcpus and 2 GPUs, hardcoding tests for g3.8xlarge
    job_ids = []

    # sbatch --wrap 'sleep 10' -G 3
    result = slurm_commands.submit_command(command="sleep 10", nodes=-1, other_options="-G 3")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}]

    # sbatch --wrap 'sleep 10' --cpus-per-gpu=10 --gpus-per-task=1
    result = slurm_commands.submit_command(
        command="sleep 10", nodes=-1, other_options="--cpus-per-gpu=10 --gpus-per-task=1"
    )
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}]

    # sbatch --wrap 'sleep 10' -N 1 --gpus-per-node=1 -c 23 -n 1
    result = slurm_commands.submit_command(
        command="sleep 10", nodes=1, slots=1, other_options="--gpus-per-node=1 -c 23"
    )
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:31, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:9, gpu:1}]

    # sbatch --wrap 'sleep 10' -c 31 -n 1
    result = slurm_commands.submit_command(command="sleep 10", nodes=-1, slots=1, other_options="-c 31")
    job_ids.append(slurm_commands.assert_job_submitted(result.stdout))
    # Nodes/resources available after this job:
    # [{cpu:0, gpu:0}, {cpu:31, gpu:0}, {cpu:22, gpu:1}, {cpu:9, gpu:1}]

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=4, expected_final=0)
    # Assert jobs were completed
    for job_id in job_ids:
        slurm_commands.assert_job_succeeded(job_id)

def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime):
    logging.info("Testing jobs that violate scheduling requirements")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly
    if torque_commands.compute_nodes_count() == 0:
        result = torque_commands.submit_command("sleep 1")
        job_id = torque_commands.assert_job_submitted(result.stdout)
        torque_commands.wait_job_completed(job_id)
    assert_that(torque_commands.compute_nodes_count()).is_greater_than(0)

    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    # nodes limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes={0}".format(max_queue_size + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ppn limit enforced by daemons
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format(max_slots + 1), raise_on_error=False
    )
    ppn_job_id = torque_commands.assert_job_submitted(result.stdout)
    # ppn total limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format((max_slots * max_queue_size) + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ncpus limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l ncpus={0}".format(max_slots + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    result = remote_command_executor.run_remote_command("echo 'sleep 1000' | qsub -l nodes=1 -h", raise_on_error=False)
    hold_job_id = torque_commands.assert_job_submitted(result.stdout)

    logging.info("Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(
        torque_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0
    )
    # Assert jobs are still pending
    assert_that(_get_job_state(remote_command_executor, ppn_job_id)).is_equal_to("Q")
    assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("H")

def _test_mpi(
    remote_command_executor,
    slots_per_instance,
    scheduler,
    os,
    region=None,
    stack_name=None,
    scaledown_idletime=None,
    verify_scaling=False,
):
    logging.info("Testing mpi job")
    datadir = pathlib.Path(__file__).parent / "data/mpi/"
    mpi_module = OS_TO_OPENMPI_MODULE_MAP[os]

    # Compile mpi script
    command = "mpicc -o mpi_hello_world mpi_hello_world.c"
    if mpi_module != "no_module_available":
        command = "module load {0} && {1}".format(mpi_module, command)
    remote_command_executor.run_remote_command(command, additional_files=[str(datadir / "mpi_hello_world.c")])

    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    # submit script using additional files
    result = scheduler_commands.submit_script(
        str(datadir / "mpi_submit_{0}.sh".format(mpi_module)), slots=2 * slots_per_instance
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    if verify_scaling:
        assert_scaling_worked(
            scheduler_commands, region, stack_name, scaledown_idletime, expected_max=2, expected_final=0
        )
        # not checking assert_job_succeeded after cluster scale down because the scheduler history might be gone
    else:
        scheduler_commands.wait_job_completed(job_id)
        scheduler_commands.assert_job_succeeded(job_id)

    mpi_out = remote_command_executor.run_remote_command("cat /shared/mpi.out").stdout
    assert_that(mpi_out.splitlines()).is_length(2)
    assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 0 out of 2 processors")
    assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 1 out of 2 processors")

    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])

def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    slurm_commands = SlurmCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5")
    array_job_id = slurm_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2")
    parallel_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, array_job_id)
    _assert_job_completed(remote_command_executor, parallel_job_id)

def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime):
    logging.info("Testing jobs that violate scheduling requirements")
    sge_commands = SgeCommands(remote_command_executor)

    # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly
    if sge_commands.compute_nodes_count() == 0:
        result = sge_commands.submit_command("sleep 1")
        job_id = sge_commands.assert_job_submitted(result.stdout)
        sge_commands.wait_job_completed(job_id)
    assert_that(sge_commands.compute_nodes_count()).is_greater_than(0)

    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    result = sge_commands.submit_command("sleep 1000", slots=(max_slots * max_queue_size) + 1)
    max_slots_job_id = sge_commands.assert_job_submitted(result.stdout)
    assert_that(_get_job_state(remote_command_executor, max_slots_job_id)).is_equal_to("qw")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    result = sge_commands.submit_command("sleep 1000", hold=True)
    hold_job_id = sge_commands.assert_job_submitted(result.stdout)
    assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("hqw")

    logging.info("Testing cluster scales down when pending jobs cannot be submitted")
    assert_scaling_worked(
        sge_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0
    )
    # Assert jobs are still pending
    pending_jobs = remote_command_executor.run_remote_command("qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout
    pending_jobs = pending_jobs.splitlines()
    assert_that(pending_jobs).contains(max_slots_job_id, hold_job_id)

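# NOTE: the SGE tests above call a _get_job_state helper that is defined elsewhere in the test
# module. The sketch below is only an illustration of what such a helper could look like,
# assuming the job state is read from the fifth column of plain `qstat` output (e.g. "qw",
# "hqw"); the actual implementation in the repository may differ.
def _get_job_state(remote_command_executor, job_id):
    # List queued jobs and keep only the line for the given job id
    result = remote_command_executor.run_remote_command(
        "qstat | awk '$1 == \"{0}\"'".format(job_id), raise_on_error=False
    )
    # qstat columns: job-ID, prior, name, user, state, ...
    fields = result.stdout.split()
    return fields[4] if len(fields) > 4 else ""
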
def _test_job_arrays_and_parallel_jobs(
    slurm_commands, region, stack_name, scaledown_idletime, partition, instance_type, cpu_per_instance
):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")

    # The following 2 jobs require a total of 3 nodes
    array_job_id = slurm_commands.submit_command_and_assert_job_accepted(
        submit_command_args={
            "command": "sleep 1",
            "nodes": -1,
            "partition": partition,
            "constraint": instance_type,
            "other_options": "-a 1-{0}".format(cpu_per_instance + 1),
        }
    )
    parallel_job_id = slurm_commands.submit_command_and_assert_job_accepted(
        submit_command_args={
            "command": "sleep 1",
            "nodes": -1,
            "slots": 2,
            "partition": partition,
            "constraint": instance_type,
            "other_options": "-c {0}".format(cpu_per_instance - 1),
        }
    )

    # Assert scaling worked as expected
    assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(slurm_commands, array_job_id)
    _assert_job_completed(slurm_commands, parallel_job_id)

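# NOTE: _assert_job_completed is referenced by the Slurm tests above but defined elsewhere in
# the test module. A minimal sketch is shown below, assuming it only needs the output of
# slurm_commands.get_job_info (already used above); the repository implementation may differ.
def _assert_job_completed(slurm_commands, job_id):
    # A job that ran to completion reports JobState=COMPLETED in its scontrol job info
    assert_that(slurm_commands.get_job_info(job_id)).contains("JobState=COMPLETED")
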
def _test_mpi(
    remote_command_executor,
    slots_per_instance,
    scheduler,
    region=None,
    stack_name=None,
    scaledown_idletime=None,
    verify_scaling=False,
    partition=None,
):
    logging.info("Testing mpi job")
    mpi_module = "openmpi"
    # Compile mpi script
    compile_mpi_ring(mpi_module, remote_command_executor)

    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    # submit script using additional files
    if partition:
        result = scheduler_commands.submit_script(
            str(MPI_COMMON_DATADIR / "mpi_submit_{0}.sh".format(mpi_module)),
            slots=2 * slots_per_instance,
            partition=partition,
        )
    else:
        result = scheduler_commands.submit_script(
            str(MPI_COMMON_DATADIR / "mpi_submit_{0}.sh".format(mpi_module)), slots=2 * slots_per_instance
        )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    if verify_scaling:
        assert_scaling_worked(
            scheduler_commands, region, stack_name, scaledown_idletime, expected_max=2, expected_final=0
        )
        # not checking assert_job_succeeded after cluster scale down because the scheduler history might be gone
    else:
        scheduler_commands.wait_job_completed(job_id)
        scheduler_commands.assert_job_succeeded(job_id)

    mpi_out = remote_command_executor.run_remote_command("cat /shared/mpi.out").stdout
    # mpi_out expected output:
    # Hello world from processor ip-192-168-53-169, rank 0 out of 2 processors
    # Process 0 received token -1 from process 1
    # Hello world from processor ip-192-168-60-9, rank 1 out of 2 processors
    # Process 1 received token -1 from process 0
    assert_that(mpi_out.splitlines()).is_length(4)
    # Slurm HIT DNS name is the same as nodename and starts with the partition name
    # Example: efa-enabled-st-c5n18xlarge-2
    if partition:
        nodename_prefix = partition
    elif scheduler == "slurm":
        nodename_prefix = ""
    else:
        nodename_prefix = "ip-"
    assert_that(mpi_out).matches(
        r"Hello world from processor {0}.+, rank 0 out of 2 processors".format(nodename_prefix)
    )
    assert_that(mpi_out).matches(
        r"Hello world from processor {0}.+, rank 1 out of 2 processors".format(nodename_prefix)
    )
    assert_that(mpi_out).contains("Process 0 received token -1 from process 1")
    assert_that(mpi_out).contains("Process 1 received token -1 from process 0")

    assert_no_errors_in_logs(remote_command_executor, scheduler)