def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
def assert_scaling_worked(
    scheduler_commands,
    region,
    stack_name,
    scaledown_idletime,
    expected_max,
    expected_final,
    assert_asg=True,
    assert_scheduler=True,
):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(max_scaledown_time),
    )

    with soft_assertions():
        if assert_asg:
            asg_capacity_time_series_str = f"asg_capacity_time_series={asg_capacity_time_series}"
            assert_that(max(asg_capacity_time_series)).described_as(asg_capacity_time_series_str).is_equal_to(
                expected_max
            )
            assert_that(asg_capacity_time_series[-1]).described_as(asg_capacity_time_series_str).is_equal_to(
                expected_final
            )
        if assert_scheduler:
            compute_nodes_time_series_str = f"compute_nodes_time_series={compute_nodes_time_series}"
            assert_that(max(compute_nodes_time_series)).described_as(compute_nodes_time_series_str).is_equal_to(
                expected_max
            )
            assert_that(compute_nodes_time_series[-1]).described_as(compute_nodes_time_series_str).is_equal_to(
                expected_final
            )
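# A minimal usage sketch of the helper above, assuming the usual test fixtures
# (scheduler_commands, region, cluster) are already wired up by the calling test;
# the expected values below are illustrative, not taken from a specific test.
assert_scaling_worked(
    scheduler_commands=scheduler_commands,  # assumed fixture from the surrounding test
    region=region,
    stack_name=cluster.cfn_name,
    scaledown_idletime=4,
    expected_max=2,   # cluster should scale up to 2 nodes...
    expected_final=0,  # ...and back down to 0 after the idle time
    assert_asg=False,  # skip the ASG check when no Auto Scaling group is involved
)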
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    result = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    retry(wait_fixed=seconds(30), stop_max_delay=seconds(500))(_assert_job_state)(
        scheduler_commands, job_id, job_state="COMPLETED"
    )

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    ec2_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)
def _test_jobs_executed_concurrently(remote_command_executor, max_slots):
    logging.info("Testing jobs are executed concurrently and nodes are fully allocated")
    torque_commands = TorqueCommands(remote_command_executor)

    # GIVEN: a cluster with 3 free nodes
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(3)

    # WHEN: an array job that requires 3 nodes and all slots is submitted
    jobs_start_time = int(remote_command_executor.run_remote_command("date +%s").stdout)
    job_exec_time = 30
    job_ids = []
    for i in range(0, 3 * max_slots):
        result = torque_commands.submit_command(
            f"sleep {job_exec_time} && hostname > /shared/job{i} && date +%s >> /shared/end_time", nodes=1, slots=1
        )
        job_id = torque_commands.assert_job_submitted(result.stdout)
        job_ids.append(job_id)

    # THEN: cluster scales down correctly after completion
    watch_compute_nodes(torque_commands, minutes(10), 0)
    for id in job_ids:
        _assert_job_completed(remote_command_executor, id)

    # THEN: each host executes 4 jobs in the expected time
    jobs_to_hosts_count = (
        remote_command_executor.run_remote_command("cat /shared/job* | sort | uniq -c | awk '{print $1}'")
        .stdout.strip()
        .splitlines()
    )
    assert_that(jobs_to_hosts_count).is_equal_to(["4", "4", "4"])
    # verify execution time
    jobs_completion_time = int(
        remote_command_executor.run_remote_command("cat /shared/end_time | sort -n | tail -1").stdout.split()[-1]
    )
    assert_that(jobs_completion_time - jobs_start_time).is_greater_than(0).is_less_than(2 * job_exec_time)
def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, asg_name):
    logging.info("Testing cluster limits are dynamically updated")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure cluster is scaled to 0 when this test starts
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)

    # Submit a job to scale up to 1 node
    result = torque_commands.submit_command("sleep 1", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)

    # Change ASG max size
    asg_client = boto3.client("autoscaling", region_name=region)
    new_max_size = max_queue_size + 1
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # Wait for job completion to be sure cluster scaled
    torque_commands.wait_job_completed(job_id)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, new_max_size)

    # Restore initial cluster size
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=max_queue_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # make sure cluster scaled to 0
    watch_compute_nodes(torque_commands, minutes(10), 0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)
def _add_compute_nodes(slurm_commands, number_of_nodes=1):
    """
    Add new compute nodes to the cluster.

    It is required because some changes will be available only on new compute nodes.
    :param slurm_commands: the scheduler commands used to interact with the cluster
    :param number_of_nodes: number of nodes to add
    :return: an array containing the new compute nodes only
    """
    initial_compute_nodes = slurm_commands.get_compute_nodes()

    number_of_nodes = len(initial_compute_nodes) + number_of_nodes
    # submit a job to perform a scaling up action and have new instances
    result = slurm_commands.submit_command("sleep 1", nodes=number_of_nodes)
    slurm_commands.assert_job_submitted(result.stdout)

    estimated_scaleup_time = 5
    watch_compute_nodes(
        scheduler_commands=slurm_commands,
        max_monitoring_time=minutes(estimated_scaleup_time),
        number_of_nodes=number_of_nodes,
    )

    return [node for node in slurm_commands.get_compute_nodes() if node not in initial_compute_nodes]
def assert_initial_conditions(scheduler_commands, num_static_nodes, num_dynamic_nodes, partition, cancel_job_id=None):
    """Assert cluster is in expected state before test starts; return list of compute nodes."""
    logging.info(
        "Assert initial condition, expect cluster to have {num_nodes} idle nodes".format(
            num_nodes=num_static_nodes + num_dynamic_nodes
        )
    )
    wait_for_num_nodes_in_scheduler(
        scheduler_commands, num_static_nodes + num_dynamic_nodes, filter_by_partition=partition
    )
    nodes_in_scheduler = scheduler_commands.get_compute_nodes(partition)
    static_nodes = []
    dynamic_nodes = []
    for node in nodes_in_scheduler:
        if "-st-" in node:
            static_nodes.append(node)
        if "-dy-" in node:
            dynamic_nodes.append(node)
    assert_that(len(static_nodes)).is_equal_to(num_static_nodes)
    assert_that(len(dynamic_nodes)).is_equal_to(num_dynamic_nodes)
    assert_compute_node_states(scheduler_commands, nodes_in_scheduler, expected_states=["idle", "mixed", "allocated"])
    if cancel_job_id:
        # Cancel warm up job so no extra scaling behavior should be happening
        scheduler_commands.cancel_job(cancel_job_id)
        retry(wait_fixed=seconds(20), stop_max_delay=minutes(2))(assert_compute_node_states)(
            scheduler_commands, nodes_in_scheduler, expected_states=["idle"]
        )

    return static_nodes, dynamic_nodes
def assert_scaling_worked(scheduler_commands, region, stack_name, scaledown_idletime, expected_max, expected_final):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(max_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max)
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final)
def test_scaling_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """The test runs benchmarks for the scaling logic."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"],
        scaling_target=benchmark_params["scaling_target"],
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    kwargs = {"nodes": benchmark_params["scaling_target"]}
    result = scheduler_commands.submit_command("sleep {0}".format(benchmark_params["job_duration"]), **kwargs)
    scheduler_commands.assert_job_submitted(result.stdout)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
class AWSBatchCommands(SchedulerCommands):
    """Implement commands for awsbatch scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "FAILED" not in result and any(status != "SUCCEEDED" for status in result),
        wait_fixed=seconds(7),
        stop_max_delay=minutes(15),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("awsbstat -d {0}".format(job_id))
        return re.findall(r"status\s+: (.+)", result.stdout)

    def get_job_exit_status(self, job_id):  # noqa: D102
        return self.wait_job_completed(job_id)

    def assert_job_submitted(self, awsbsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.match(r"Job ([a-z0-9\-]{36}) \(.+\) has been submitted.", awsbsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None):  # noqa: D102
        return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes))

    def submit_script(self, script, script_args=None, nodes=1, additional_files=None, slots=None):  # noqa: D102
        raise NotImplementedError

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_length(1 + children_number)
        assert_that(status).contains_only("SUCCEEDED")

    def compute_nodes_count(self):  # noqa: D102
        raise NotImplementedError

    def get_compute_nodes(self):  # noqa: D102
        raise NotImplementedError

    def wait_for_locked_node(self):  # noqa: D102
        raise NotImplementedError

    def get_node_cores(self):  # noqa: D102
        raise NotImplementedError
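# A minimal usage sketch of the class above inside a test, assuming a RemoteCommandExecutor
# built from a cluster fixture as elsewhere in this suite; the submitted command and the
# children_number value are illustrative, not taken from a specific test.
awsbatch_commands = AWSBatchCommands(remote_command_executor)
result = awsbatch_commands.submit_command("sleep 30 && echo done", nodes=2)
job_id = awsbatch_commands.assert_job_submitted(result.stdout)
# get_job_exit_status delegates to wait_job_completed, which retries awsbstat until no
# status is still pending/running, then the assertion checks all statuses are SUCCEEDED.
awsbatch_commands.assert_job_succeeded(job_id, children_number=1)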
def test_scheduler_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """The test runs a stress test to verify scheduler behaviour with many submitted jobs."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")
    instance_slots = get_instance_vcpus(region, instance)

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
        "jobs_to_submit": 2 * instance_slots * request.config.getoption("benchmarks_target_capacity"),
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"],
        scaling_target=benchmark_params["scaling_target"],
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    enable_asg_metrics(region, cluster)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    _submit_jobs(benchmark_params, scheduler_commands, instance_slots, cluster)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        cluster.asg,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    _assert_jobs_completed(remote_command_executor, benchmark_params["jobs_to_submit"])
    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
    if not timeout:
        timeout = 12

    @retry(
        retry_on_result=lambda result: "job_state = C" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(timeout),
    )
    def _job_status_retryer():
        result = self._remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
        return result.stdout

    return _job_status_retryer()
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
    if not timeout:
        timeout = 15

    @retry(
        retry_on_result=lambda result: "FAILED" not in result and any(status != "SUCCEEDED" for status in result),
        wait_fixed=seconds(7),
        stop_max_delay=minutes(timeout),
    )
    def _job_status_retryer():
        result = self._remote_command_executor.run_remote_command("awsbstat -d {0}".format(job_id), log_output=True)
        return re.findall(r"status\s+: (.+)", result.stdout)

    return _job_status_retryer()
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(7), stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1):  # noqa: D102
        # TODO add support for multiple nodes
        return self._remote_command_executor.run_remote_command("echo '{0}' | qsub".format(command))

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])
def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
    if not timeout:
        timeout = 12

    @retry(
        retry_on_result=lambda result: "JobState" not in result
        or any(
            value in result
            for value in ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING", "JobState=CONFIGURING"]
        ),
        wait_fixed=seconds(10),
        stop_max_delay=minutes(timeout),
    )
    def _job_status_retryer():
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id), raise_on_error=False
        )
        return result.stdout

    return _job_status_retryer()
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result == "Unknown", wait_fixed=seconds(7), stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"EndTime=(.+?) ", result.stdout)
        return match.group(1)

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1):  # noqa: D102
        return self._remote_command_executor.run_remote_command("sbatch -N {0} --wrap='{1}'".format(nodes, command))

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        return "JobState=COMPLETED" in result.stdout

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])
def _test_ec2_status_check_replacement(
    remote_command_executor,
    scheduler_commands,
    cluster_name,
    region,
    partition,
    num_static_nodes,
):
    """Test nodes with failing ec2 status checks are correctly replaced."""
    logging.info("Testing that nodes with failing ec2 status checks are correctly replaced")
    static_nodes, _ = assert_initial_conditions(scheduler_commands, num_static_nodes, 0, partition)

    # Can take up to 15 mins for ec2_status_check to show
    # Need to increase SlurmdTimeout to avoid slurm health check and trigger ec2_status_check code path
    _set_slurmd_timeout(remote_command_executor, timeout=10000)
    kill_job_id = _submit_kill_networking_job(
        remote_command_executor, scheduler_commands, partition, node_type="static", num_nodes=num_static_nodes
    )
    # Assert ec2_status_check code path is triggered
    retry(wait_fixed=seconds(20), stop_max_delay=minutes(15))(assert_errors_in_logs)(
        remote_command_executor,
        ["/var/log/parallelcluster/clustermgtd"],
        ["Setting nodes failing health check type ec2_health_check to DRAIN"],
    )
    scheduler_commands.cancel_job(kill_job_id)
    # Assert static nodes are reset
    _wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
    assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
    # Reset SlurmdTimeout to 180s
    _set_slurmd_timeout(remote_command_executor, timeout=180)
def _create_image_roles(create_roles_stack):
    # Create build image roles
    image_roles_stack = create_roles_stack(
        stack_prefix="integ-tests-iam-image-roles", roles_file="image-roles.cfn.yaml"
    )
    lambda_cleanup_role = image_roles_stack.cfn_outputs["BuildImageLambdaCleanupRole"]
    instance_profile = image_roles_stack.cfn_outputs["BuildImageInstanceProfile"]
    # instance_role = image_roles_stack.cfn_outputs["BuildImageInstanceRole"]
    return instance_profile, lambda_cleanup_role


@retry(wait_fixed=minutes(1), stop_max_delay=minutes(60))
def _wait_build_image_complete(image):
    pcluster_describe_image_result = image.describe()
    logging.info(pcluster_describe_image_result)
    assert_that(image.image_status).is_equal_to("BUILD_COMPLETE")


def _check_roles(
    cfn_client,
    ec2_client,
    lambda_client,
    stack_name,
    instance_profile,
    lambda_cleanup_role,
):
    """Test roles are attached to EC2 build instance and Lambda cleanup function in the building stack."""
    if gres:
        retrieve_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_gres.conf"
    else:
        retrieve_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_nodes.conf"
    return remote_command_executor.run_remote_command(retrieve_nodes_command).stdout


def _retrieve_slurm_dummy_nodes(remote_command_executor, gres=False):
    retrieve_dummy_nodes_command = "scontrol -F show nodes | grep 'State=FUTURE'"
    return len(remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout.split("\n"))


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
def _assert_no_nodes_in_scheduler(scheduler_commands):
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(0)


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
def _assert_asg_has_no_node(region, asg_name):
    assert_asg_desired_capacity(region, asg_name, expected=0)


def _assert_dummy_nodes(remote_command_executor, count, slots=4, gpus=0):
    __tracebackhide__ = True
    if gpus > 0:
        # If GPU instance, need to check for extra GPU info in slurm_parallelcluster_nodes.conf
        gpu_entry = "Gres=gpu:tesla:{gpus} ".format(gpus=gpus)
        # Checking dummy nodes in slurm_parallelcluster_gres.conf
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result == "Unknown", wait_fixed=seconds(7), stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"EndTime=(.+?) ", result.stdout)
        return match.group(1)

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, host=None):  # noqa: D102
        submission_command = "sbatch -N {0} --wrap='{1}'".format(nodes, command)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        return self._remote_command_executor.run_remote_command(submission_command)

    def submit_script(self, script, nodes=1, slots=None, host=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        submission_command += " {0}".format(script_name)
        return self._remote_command_executor.run_remote_command(submission_command, additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'"
        )
        return result.stdout.splitlines()
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(7), stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, hold=False):  # noqa: D102
        flags = ""
        if nodes != 1:
            raise Exception("SGE does not support nodes option")
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False
        )

    def submit_script(self, script, nodes=1, slots=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name), additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()
def _test_cloud_node_health_check(
    remote_command_executor,
    scheduler_commands,
    cluster_name,
    region,
    partition,
    num_static_nodes,
    num_dynamic_nodes,
    dynamic_instance_type,
):
    """
    Test nodes with networking failure are correctly replaced.

    This will test if slurm is performing health check on CLOUD nodes correctly.
    """
    logging.info("Testing that nodes with networking failure fails slurm health check and replaced")
    job_id = submit_initial_job(
        scheduler_commands,
        "sleep 500",
        partition,
        dynamic_instance_type,
        num_dynamic_nodes,
        other_options="--no-requeue",
    )
    static_nodes, dynamic_nodes = assert_initial_conditions(
        scheduler_commands, num_static_nodes, num_dynamic_nodes, partition, job_id
    )
    # Assert that the default SlurmdTimeout=180 is in effect
    _assert_slurmd_timeout(remote_command_executor, timeout=180)
    # Nodes with networking failures should fail slurm health check before failing ec2_status_check
    # Test on freshly launched dynamic nodes
    kill_job_id = _submit_kill_networking_job(
        remote_command_executor, scheduler_commands, partition, node_type="dynamic", num_nodes=num_dynamic_nodes
    )
    # Sleep for a bit so the command to detach network interface can be run
    time.sleep(15)
    # Job will hang, cancel it manually to avoid waiting for job failing
    scheduler_commands.cancel_job(kill_job_id)
    # Assert nodes are put into DOWN for not responding
    # TO-DO: this test only works with num_dynamic = 1 because slurm will record this error in nodelist format
    # i.e. error: Nodes q2-st-t2large-[1-2] not responding, setting DOWN
    # To support multiple nodes, need to convert list of node into nodelist format string
    retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_errors_in_logs)(
        remote_command_executor,
        ["/var/log/slurmctld.log"],
        ["Nodes {} not responding, setting DOWN".format(",".join(dynamic_nodes))],
    )
    # Assert dynamic nodes are reset
    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
    assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
    # Assert ec2_status_check code path is not triggered
    assert_no_msg_in_logs(
        remote_command_executor,
        ["/var/log/parallelcluster/clustermgtd"],
        ["Setting nodes failing health check type ec2_health_check to DRAIN"],
    )
    remote_command_executor.run_remote_command(
        "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export".format(bucket_name=bucket_name)
    )
    result = remote_command_executor.run_remote_command("cat ./file_to_export")
    assert_that(result.stdout).is_equal_to("Exported by FSx Lustre")


def _assert_job_submitted(qsub_output):
    __tracebackhide__ = True
    match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output)
    assert_that(match).is_not_none()
    return match.group(1)


@retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(7), stop_max_delay=minutes(5))
def _wait_job_completed(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
    return result.return_code


def _get_job_exit_status(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
    match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
    assert_that(match).is_not_none()
    return match.group(1)
class EBSSnapshotsFactory:
    """Manage creation and destruction of volume snapshots."""

    def __init__(self):
        self.config = None
        self.instance = None
        self.volume = None
        self.snapshot = None
        self.security_group_id = None
        self.ec2 = None
        self.boto_client = None

    def create_snapshot(self, request, subnet_id, region):
        """
        Create a snapshot in a given region.

        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one snapshot creation per factory allowed
        if self.snapshot:
            raise Exception("Snapshot already created")
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        snapshot_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self.snapshot = self._create_snapshot(region, snapshot_config)
        return self.snapshot.id

    def create_existing_volume(self, request, subnet_id, region):
        """
        Create a volume in a given region.

        :param request: The current request
        :param subnet_id: The subnet id where to get the snapshot
        :param region: The region where to get the snapshot
        """
        # Only one volume creation per factory allowed
        if self.volume:
            raise Exception("Volume already created")
        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        volume_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self._create_volume_process(region, volume_config)
        return self.volume.id

    def _create_volume_process(self, region, snapshot_config):
        self.config = snapshot_config
        ami_id = self._get_amazonlinux2_ami()
        self.security_group_id = self._get_security_group_id()
        subnet = self.ec2.Subnet(self.config.head_node_subnet_id)
        # Create a new volume and attach to the instance
        self.volume = self._create_volume(subnet)
        self.instance = self._launch_instance(ami_id, subnet)
        self._attach_volume()
        # Open ssh connection
        self.ssh_conn = self._open_ssh_connection()
        # Partitions the disk with a gpt table and 1 single partition inside
        self._format_volume(self.ssh_conn)
        # Stops the instance before taking the snapshot
        self._release_instance()

    def _create_snapshot(self, region, snapshot_config):
        self._create_volume_process(region, snapshot_config)
        self.snapshot = self._create_volume_snapshot()
        return self.snapshot

    def _create_volume_snapshot(self):
        logging.info("creating snapshot...")
        snapshot = self.ec2.create_snapshot(Description="parallelcluster-test-snapshot", VolumeId=self.volume.id)
        while snapshot.state == "pending":
            time.sleep(10)
            snapshot = self.ec2.Snapshot(snapshot.id)
        logging.info("Snapshot ready: %s" % snapshot.id)
        return snapshot

    def _format_volume(self, ssh_conn):
        logging.info("Partitioning device...")
        ssh_conn.run("sudo sh -c 'echo -e \"g\nn\np\n1\n\n\nw\" | fdisk /dev/sdf'", warn=True, pty=False, hide=False)
        # Finds out the device name of the volume
        logging.info("Finding device name...")
        device_name = ssh_conn.run("readlink -f /dev/sdf").stdout.strip()
        # formats the 1st partition of disk
        logging.info("Formatting 1st partition...")
        ssh_conn.run("sudo sh -c 'mkfs.ext4 {}1'".format(device_name))
        logging.info("Mounting partition...")
        ssh_conn.run("sudo mkdir /mnt/tmp")
        ssh_conn.run("sudo mount {}1 /mnt/tmp".format(device_name))
        logging.info("Writing test data...")
        ssh_conn.run("echo 'hello world' | sudo tee -a /mnt/tmp/test.txt")
logging.info("Device ready") def _open_ssh_connection(self): tries = 5 logging.info("Connecting to instance %s " % self.instance.public_ip_address) logging.info("ssh_key: %s " % self.config.ssh_key) ssh_conn = None while tries > 0: try: ssh_conn = Connection( host=self.instance.public_ip_address, user="******", forward_agent=False, connect_kwargs={"key_filename": [self.config.ssh_key]}, ) ssh_conn.open() tries = 0 except BaseException: logging.info("SSH connection error - retrying...") tries -= 1 time.sleep(20) if (ssh_conn is None) or (not ssh_conn.is_connected): raise ConnectionError() return ssh_conn @retry(retry_on_result=lambda state: state != "attached", wait_fixed=seconds(2), stop_max_delay=minutes(5)) def _wait_volume_attached(self): vol = self.ec2.Volume(self.volume.id) attachment_state = next( (attachment["State"] for attachment in vol.attachments if attachment["InstanceId"] == self.instance.id), "") return attachment_state def _attach_volume(self): result = self.volume.attach_to_instance(InstanceId=self.instance.id, Device="/dev/sdf") logging.info("Attach Volume Result: %s", result) self._wait_volume_attached() logging.info("Volume attached") def _create_volume(self, subnet): vol = self.ec2.create_volume( Size=10, Encrypted=False, AvailabilityZone=subnet.availability_zone, TagSpecifications=[{ "ResourceType": "volume", "Tags": [{ "Key": "name", "Value": "parallel-cluster-test-volume" }] }], ) logging.info("Volume Id: %s" % vol.id) # We can check if the volume is now ready and available: logging.info("Waiting for the volume to be ready...") while vol.state == "creating": vol = self.ec2.Volume(vol.id) time.sleep(2) logging.info("Volume ready") return vol def _get_security_group_id(self): security_group_id = self.boto_client.create_security_group( Description="security group for snapshot instance node", GroupName="snapshot-" + random_alphanumeric(), VpcId=self.config.vpc_id, )["GroupId"] self.boto_client.authorize_security_group_ingress( GroupId=security_group_id, IpPermissions=[{ "IpProtocol": "tcp", "FromPort": 22, "ToPort": 22, "IpRanges": [{ "CidrIp": "0.0.0.0/0" }] }], ) return security_group_id def _launch_instance(self, ami_id, subnet): instance = self.ec2.create_instances( ImageId=ami_id, KeyName=self.config.key_name, MinCount=1, MaxCount=1, InstanceType="t2.micro", NetworkInterfaces=[{ "SubnetId": subnet.id, "DeviceIndex": 0, "AssociatePublicIpAddress": True, "Groups": [self.security_group_id], }], TagSpecifications=[{ "ResourceType": "instance", "Tags": [{ "Key": "Name", "Value": "pcluster-snapshot-instance" }] }], )[0] logging.info("Waiting for instance to be running...") while instance.state["Name"] == "pending": time.sleep(10) instance = self.ec2.Instance(instance.id) logging.info("Instance state: %s" % instance.state) logging.info("Public dns: %s" % instance.public_dns_name) return instance def _get_amazonlinux2_ami(self): # Finds most recent alinux2 ami in region response = self.boto_client.describe_images( Owners=["amazon"], Filters=[ { "Name": "name", "Values": ["amzn2-ami-hvm-*"] }, { "Name": "description", "Values": ["Amazon Linux 2 AMI*"] }, { "Name": "architecture", "Values": ["x86_64"] }, { "Name": "root-device-type", "Values": ["ebs"] }, { "Name": "state", "Values": ["available"] }, ], ) amis = sorted(response["Images"], key=lambda x: x["CreationDate"], reverse=True) return amis[0]["ImageId"] def release_all(self): """Release all resources""" self._release_instance() self._release_volume() self._release_snapshot() self._release_security_group() 
    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_snapshot(self):
        if self.snapshot:
            logging.info("Deleting snapshot %s" % self.snapshot.id)
            self.snapshot.delete()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_instance(self):
        if self.instance:
            self.instance.terminate()
            logging.info("Waiting for instance to be terminated...")
            while self.instance.state["Name"] != "terminated":
                time.sleep(10)
                self.instance = self.ec2.Instance(self.instance.id)
            logging.info("Instance terminated")
        self.instance = None

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_volume(self):
        if self.volume:
            logging.info("Deleting volume %s" % self.volume.id)
            self.volume.delete()
        self.volume = None

    def _release_security_group(self):
        if self.security_group_id:
            logging.info("Deleting security group %s" % self.security_group_id)
            self.boto_client.delete_security_group(GroupId=self.security_group_id)
        self.security_group_id = None
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
from retrying import retry

from remote_command_executor import RemoteCommandExecutionError
from time_utils import minutes, seconds


@retry(
    retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError),
    wait_fixed=seconds(30),
    stop_max_delay=minutes(10),
)
def wait_compute_log(remote_command_executor):
    remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False)
    # return instance-id
    return remote_command_executor.run_remote_command(
        "find /home/logs/compute/ -type f -printf '%f\\n' -quit | head -1 | cut -d. -f1", log_error=False
    ).stdout
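# A minimal usage sketch, assuming a test that expects a compute node to push its logs
# to /home/logs/compute on the head node; the follow-up logging call is illustrative.
instance_id = wait_compute_log(remote_command_executor)
# Blocks (retrying on RemoteCommandExecutionError) until the directory exists, then
# returns the instance id parsed from the first log file name found there.
logging.info("Compute node %s uploaded its logs", instance_id)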
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0, wait_fixed=seconds(3), stop_max_delay=minutes(7))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, hold=False, after_ok=None):  # noqa: D102
        flags = ""
        if nodes > 1:
            slots = nodes * slots
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        if after_ok:
            flags += "-hold_jid {0} ".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False
        )

    def submit_script(self, script, script_args=None, nodes=1, slots=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1} {2}".format(flags, script_name, " ".join(script_args)), additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(
        retry_on_result=lambda result: "<state>d</state>" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(5),
    )
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command("qstat -f -xml").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("qhost -F | grep hl:m_core")
        return re.search(r"hl:m_core=(\d+).000000", result.stdout).group(1)
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "JobState" not in result
        or any(value in result for value in ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING"]),
        wait_fixed=seconds(3),
        stop_max_delay=minutes(7),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id), raise_on_error=False
        )
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None, host=None, after_ok=None, other_options=None):  # noqa: D102
        submission_command = "sbatch --wrap='{0}'".format(command)
        if nodes > 0:
            submission_command += " -N {0}".format(nodes)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if other_options:
            submission_command += " {0}".format(other_options)
        return self._remote_command_executor.run_remote_command(submission_command)

    def submit_script(self, script, script_args=None, nodes=1, slots=None, host=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        submission_command += " {1} {2}".format(nodes, script_name, " ".join(script_args))
        return self._remote_command_executor.run_remote_command(submission_command, additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'"
        )
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result, wait_fixed=seconds(3), stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command("/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("/opt/slurm/bin/sinfo -o '%c' -h")
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm."""
        return self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout
class TorqueCommands(SchedulerCommands):
    """Implement commands for torque scheduler."""

    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "job_state = C" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(12),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command("qstat -f {0}".format(job_id))
        match = re.search(r"exit_status = (\d+)", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        # qsub_output is the id of the job in case of successful submissions
        id = qsub_output
        # check that the job exists
        self._remote_command_executor.run_remote_command("qstat -f {0}".format(id))
        return id

    def submit_command(self, command, nodes=1, slots=None, after_ok=None):  # noqa: D102
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if after_ok:
            flags += " -W depend=afterok:{0}".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False
        )

    def submit_script(self, script, script_args=None, nodes=1, slots=None, additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        script_name = os.path.basename(script)
        additional_files.append(script)
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if script_args:
            flags += ' -F "{0}"'.format(" ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name), additional_files=additional_files
        )

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "echo $(( $(/opt/torque/bin/pbsnodes -l all | wc -l) - 1))"
        )
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes -l all | grep -v $(hostname) | awk '{print $1}'"
        )
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "offline" not in result, wait_fixed=seconds(5), stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        # discard the first node since that is the master server
        return self._remote_command_executor.run_remote_command(r'pbsnodes | grep -e "\sstate = " | tail -n +2').stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command("pbsnodes | tail -n +10")
        return re.search(r"np = (\d+)", result.stdout).group(1)
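# A minimal usage sketch of the scheduler wrapper above, assuming the usual
# remote_command_executor fixture; the submitted commands are illustrative only.
torque_commands = TorqueCommands(remote_command_executor)
# Submit a first job, then a second job held until the first completes successfully
# (submit_command maps after_ok to "-W depend=afterok:<job_id>").
first_id = torque_commands.assert_job_submitted(torque_commands.submit_command("sleep 30", nodes=1, slots=1).stdout)
second_id = torque_commands.assert_job_submitted(
    torque_commands.submit_command("hostname", nodes=1, slots=1, after_ok=first_id).stdout
)
torque_commands.wait_job_completed(second_id)
torque_commands.assert_job_succeeded(second_id)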
    if auto_import_policy in ("NEW", "NEW_CHANGED"):
        result = remote_command_executor.run_remote_command(f"cat {mount_dir}/{filename}")
        assert_that(result.stdout).is_equal_to(
            modified_file_body if auto_import_policy == "NEW_CHANGED" else new_file_body
        )
    else:
        result = remote_command_executor.run_remote_command(f"ls {mount_dir}/")
        assert_that(result.stdout).does_not_contain(filename)


@retry(
    retry_on_result=lambda result: result.get("Lifecycle") in ["PENDING", "EXECUTING", "CANCELLING"],
    wait_fixed=seconds(5),
    stop_max_delay=minutes(7),
)
def poll_on_data_export(task, fsx):
    logging.info(
        "Data Export Task {task_id}: {status}".format(task_id=task.get("TaskId"), status=task.get("Lifecycle"))
    )
    return fsx.describe_data_repository_tasks(TaskIds=[task.get("TaskId")]).get("DataRepositoryTasks")[0]


def _test_data_repository_task(remote_command_executor, mount_dir, bucket_name, fsx_fs_id, region):
    logging.info("Testing fsx lustre data repository task")
    file_contents = "Exported by FSx Lustre"
    remote_command_executor.run_remote_command(
        "echo '{file_contents}' > {mount_dir}/file_to_export".format(file_contents=file_contents, mount_dir=mount_dir)
    )