def _test_jobs_executed_concurrently(remote_command_executor, max_slots):
    logging.info("Testing jobs are executed concurrently and nodes are fully allocated")
    torque_commands = TorqueCommands(remote_command_executor)

    # GIVEN: a cluster with 3 free compute nodes
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(3)

    # WHEN: 3 * max_slots single-slot jobs are submitted, enough to fill every slot on every node
    jobs_start_time = int(remote_command_executor.run_remote_command("date +%s").stdout)
    job_exec_time = 30
    job_ids = []
    for i in range(3 * max_slots):
        result = torque_commands.submit_command(
            f"sleep {job_exec_time} && hostname > /shared/job{i} && date +%s >> /shared/end_time", nodes=1, slots=1
        )
        job_id = torque_commands.assert_job_submitted(result.stdout)
        job_ids.append(job_id)

    # THEN: the cluster scales down to 0 after the jobs complete
    watch_compute_nodes(torque_commands, minutes(10), 0)
    for job_id in job_ids:
        _assert_job_completed(remote_command_executor, job_id)

    # THEN: each of the 3 hosts executed max_slots jobs
    jobs_to_hosts_count = (
        remote_command_executor.run_remote_command("cat /shared/job* | sort | uniq -c | awk '{print $1}'")
        .stdout.strip()
        .splitlines()
    )
    assert_that(jobs_to_hosts_count).is_equal_to([str(max_slots)] * 3)

    # THEN: the jobs ran concurrently, i.e. the last job finished well before the serial execution time
    jobs_completion_time = int(
        remote_command_executor.run_remote_command("cat /shared/end_time | sort -n | tail -1").stdout.split()[-1]
    )
    assert_that(jobs_completion_time - jobs_start_time).is_greater_than(0).is_less_than(2 * job_exec_time)
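# The assertions above call an _assert_job_completed helper that is defined elsewhere in
# this module. For reference, a minimal sketch is shown below, assuming completion is
# detected via the "job_state = C" attribute reported by `qstat -f` while the job is still
# kept in the server history; the actual helper may be implemented differently.
def _assert_job_completed(remote_command_executor, job_id):
    # raise_on_error=False: qstat exits non-zero if the job has already been purged from history
    result = remote_command_executor.run_remote_command("qstat -f {0}".format(job_id), raise_on_error=False)
    assert_that(result.stdout).contains("job_state = C")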
def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime, max_slots):
    logging.info("Testing cluster scales correctly with array jobs and parallel jobs")
    torque_commands = TorqueCommands(remote_command_executor)

    result = remote_command_executor.run_remote_command(
        "echo 'sleep 30' | qsub -t 1-{0}".format(max_slots), raise_on_error=False
    )
    array_job_id = torque_commands.assert_job_submitted(result.stdout)

    result = remote_command_executor.run_remote_command(
        "echo 'sleep 30' | qsub -l nodes=2:ppn=1", raise_on_error=False
    )
    parallel_job_id = torque_commands.assert_job_submitted(result.stdout)

    # Assert scaling worked as expected
    assert_scaling_worked(torque_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0)
    # Assert jobs were completed
    for i in range(1, max_slots + 1):
        _assert_job_completed(remote_command_executor, array_job_id.replace("[]", "[{0}]".format(i)))
    _assert_job_completed(remote_command_executor, parallel_job_id)
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    torque_commands = TorqueCommands(remote_command_executor)

    result = torque_commands.submit_command("sleep 60", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1' | qsub -W depend=afterok:{0}".format(job_id), raise_on_error=False
    )
    dependent_job_id = torque_commands.assert_job_submitted(result.stdout)

    assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("H")

    # Assert scaling worked as expected
    assert_scaling_worked(torque_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0)
    # Assert jobs were completed
    _assert_job_completed(remote_command_executor, job_id)
    _assert_job_completed(remote_command_executor, dependent_job_id)
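# _get_job_state, used above and in the next test, is also defined elsewhere in this
# module. A minimal sketch, assuming the state letter ("Q", "H", ...) is read from the
# "job_state" attribute in `qstat -f` output; the actual helper may differ.
def _get_job_state(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
    for line in result.stdout.splitlines():
        if "job_state =" in line:
            # Line looks like "    job_state = H"
            return line.split("=", 1)[1].strip()
    return None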
def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime):
    logging.info("Testing jobs that violate scheduling requirements")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly
    if torque_commands.compute_nodes_count() == 0:
        result = torque_commands.submit_command("sleep 1")
        job_id = torque_commands.assert_job_submitted(result.stdout)
        torque_commands.wait_job_completed(job_id)
    assert_that(torque_commands.compute_nodes_count()).is_greater_than(0)

    logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
    # nodes limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes={0}".format(max_queue_size + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ppn limit enforced by daemons
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format(max_slots + 1), raise_on_error=False
    )
    ppn_job_id = torque_commands.assert_job_submitted(result.stdout)
    # ppn total limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l nodes=1:ppn={0}".format((max_slots * max_queue_size) + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")
    # ncpus limit enforced by scheduler
    result = remote_command_executor.run_remote_command(
        "echo 'sleep 1000' | qsub -l ncpus={0}".format(max_slots + 1), raise_on_error=False
    )
    assert_that(result.stdout).contains("Job exceeds queue resource limits")

    logging.info("Testing cluster doesn't scale when job is set on hold")
    result = remote_command_executor.run_remote_command("echo 'sleep 1000' | qsub -l nodes=1 -h", raise_on_error=False)
    hold_job_id = torque_commands.assert_job_submitted(result.stdout)

    logging.info("Testing cluster scales down when pending jobs cannot be scheduled")
    assert_scaling_worked(
        torque_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0
    )
    # Assert the non-runnable jobs are still pending
    assert_that(_get_job_state(remote_command_executor, ppn_job_id)).is_equal_to("Q")
    assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("H")
def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, asg_name):
    logging.info("Testing cluster limits are dynamically updated")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure cluster is scaled to 0 when this test starts
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)

    # Submit a job to scale up to 1 node
    result = torque_commands.submit_command("sleep 1", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)

    # Change ASG max size
    asg_client = boto3.client("autoscaling", region_name=region)
    new_max_size = max_queue_size + 1
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # Wait for job completion to be sure cluster scaled
    torque_commands.wait_job_completed(job_id)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, new_max_size)

    # Restore initial cluster size
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=max_queue_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)
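# _assert_scheduler_configuration is not shown in this section. The sketch below is an
# assumption about what it checks: that the Torque server limits track the cluster limits,
# with the total slot count exposed as resources_available.ncpus and the node count as
# resources_available.nodect in `qmgr -c "print server"` output. The real helper may
# verify different attributes; torque_commands is kept only to match the call sites above.
def _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, queue_size):
    result = remote_command_executor.run_remote_command('qmgr -c "print server"')
    assert_that(result.stdout).contains("resources_available.ncpus = {0}".format(max_slots * queue_size))
    assert_that(result.stdout).contains("resources_available.nodect = {0}".format(queue_size))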
def _test_torque_job_submit(remote_command_executor, test_datadir):
    """Test torque job submit command in slurm cluster."""
    logging.info("Testing cluster submits job by torque command")
    torque_commands = TorqueCommands(remote_command_executor)
    result = torque_commands.submit_script(str(test_datadir / "torque_job.sh"))
    torque_commands.assert_job_submitted(result.stdout)