def test_efa(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Run the complete set of EFA checks against one cluster.

    Every check lives in this single test function so that the cluster is created once and shared by all of them.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_efa_installed(scheduler_commands, remote_command_executor)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, os)

    compute_instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(compute_instance_ids))

    # Same OSU benchmark run for each MPI flavour, openmpi first.
    for mpi_flavour in ("openmpi", "intelmpi"):
        _test_osu_benchmarks(
            mpi_flavour, remote_command_executor, scheduler_commands, test_datadir, slots_per_instance
        )
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    watched_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(remote_command_executor, watched_logs)
def test_mpi(scheduler, region, instance, pcluster_config_reader, clusters_factory):
    """Run the MPI check twice on one cluster: first with verify_scaling off, then with it on."""
    scaledown_idletime = 3
    max_queue_size = 3
    slots_per_instance = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # Pass 1 (verify_scaling=False) verifies that the job completes correctly.
    # Pass 2 (verify_scaling=True) verifies that scaling worked.
    for verify_scaling in (False, True):
        _test_mpi(
            remote_command_executor,
            slots_per_instance,
            scheduler,
            region,
            cluster.cfn_name,
            scaledown_idletime,
            verify_scaling=verify_scaling,
        )
def test_sit_efa(
    region,
    scheduler,
    instance,
    pcluster_config_reader,
    clusters_factory,
    test_datadir,
    architecture,
    network_interfaces_count,
):
    """
    Run the complete set of EFA checks against one cluster.

    Every check lives in this single test function so that the cluster is created once and shared by all of them.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_efa_installation(scheduler_commands, remote_command_executor, efa_installed=True)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler)

    compute_instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(compute_instance_ids))

    # The latency benchmark always runs with openmpi; intelmpi is added only on x86_64.
    mpi_flavours = ["openmpi"]
    if architecture == "x86_64":
        mpi_flavours.append("intelmpi")
    for mpi_flavour in mpi_flavours:
        _test_osu_benchmarks_latency(
            mpi_flavour, remote_command_executor, scheduler_commands, test_datadir, slots_per_instance
        )

    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    # The multiple-bandwidth benchmark runs only when more than one network interface is reported.
    if network_interfaces_count > 1:
        _test_osu_benchmarks_multiple_bandwidth(
            remote_command_executor, scheduler_commands, test_datadir, slots_per_instance
        )

    assert_no_errors_in_logs(remote_command_executor, scheduler)
def test_existing_hosted_zone(
    hosted_zone_factory,
    pcluster_config_reader,
    clusters_factory,
    vpc_stack,
    cfn_stacks_factory,
    key_name,
    scheduler,
    region,
    instance,
):
    """Check a cluster whose config file is given the id of an existing hosted zone."""
    # NOTE(review): vpc_stack, cfn_stacks_factory and key_name are not referenced in the body;
    # presumably they are requested only as fixtures -- confirm before removing.
    num_computes = 2
    hosted_zone_id, domain_name = hosted_zone_factory()
    config = pcluster_config_reader(existing_hosted_zone=hosted_zone_id, queue_size=num_computes)
    cluster = clusters_factory(config, upper_case_cluster_name=True)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    # Run an MPI job without verifying scaling.
    slots_per_instance = fetch_instance_slots(region, instance)
    _test_mpi(
        remote_command_executor,
        slots_per_instance=slots_per_instance,
        scheduler=scheduler,
        region=region,
        stack_name=cluster.cfn_name,
        scaledown_idletime=3,
        verify_scaling=False,
    )

    # Each compute node's hostname must equal its nodename.
    compute_nodes = scheduler_commands.get_compute_nodes()
    _test_hostname_same_as_nodename(scheduler_commands, remote_command_executor, compute_nodes)

    # resolv.conf must contain "<lower-cased stack name>.<hosted zone domain>".
    resolv_conf_result = remote_command_executor.run_remote_command("cat /etc/resolv.conf")
    resolv_conf = resolv_conf_result.stdout
    expected_domain = cluster.cfn_name.lower() + "." + domain_name
    assert_that(resolv_conf).contains(expected_domain)
def test_hit_no_cluster_dns_mpi(scheduler, region, instance, pcluster_config_reader, clusters_factory, test_datadir):
    """Check that a compute node cannot be pinged by name, keeps hostname == nodename, and still runs MPI."""
    logging.info("Testing HIT cluster with cluster DNS disabled.")
    scaledown_idletime = 3
    max_queue_size = 3
    min_queue_size = 1
    slots_per_instance = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime,
        max_queue_size=max_queue_size,
        min_queue_size=min_queue_size,
    )
    cluster = clusters_factory(config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    # Pinging the first compute node by its scheduler name is expected to fail.
    compute_nodes = scheduler_commands.get_compute_nodes()
    ping_command = "ping -c 3 {}".format(compute_nodes[0])
    ping_result = remote_command_executor.run_remote_command(ping_command, raise_on_error=False)
    assert_that(ping_result.failed).is_true()

    # Each compute node's hostname must equal its nodename.
    _test_hostname_same_as_nodename(scheduler_commands, remote_command_executor, compute_nodes)

    # The MPI job must still complete correctly; scaling is not verified here.
    mpi_args = (
        remote_command_executor,
        slots_per_instance,
        scheduler,
        region,
        cluster.cfn_name,
        scaledown_idletime,
    )
    _test_mpi(*mpi_args, verify_scaling=False)
def test_efa(
    os,
    region,
    scheduler,
    instance,
    pcluster_config_reader,
    clusters_factory,
    test_datadir,
    architecture,
    network_interfaces_count,
):
    """
    Run the complete set of EFA checks against one cluster.

    Every check lives in this single test function so that the cluster is created once and shared by all of them.
    """
    efa_partition = "efa-enabled"

    # OSU benchmark results were collected for c5n.18xlarge only.
    osu_benchmarks_instances = ["c5n.18xlarge"]
    run_osu_benchmarks = instance in osu_benchmarks_instances

    # Collective OSU benchmarks need 4 instances to show performance differences;
    # 2 instances are enough for the other EFA tests.
    if run_osu_benchmarks:
        max_queue_size = 4
    else:
        max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    if architecture == "x86_64":
        head_node_instance = "c5n.18xlarge"
    else:
        head_node_instance = "c6gn.16xlarge"

    config = pcluster_config_reader(max_queue_size=max_queue_size, head_node_instance=head_node_instance)
    cluster = clusters_factory(config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _test_efa_installation(
        scheduler_commands, remote_command_executor, efa_installed=True, partition=efa_partition
    )
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, partition=efa_partition)

    compute_instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(compute_instance_ids))

    if run_osu_benchmarks:
        benchmark_failures = []
        # intelmpi is benchmarked only on x86_64.
        mpi_versions = ["openmpi", "intelmpi"] if architecture == "x86_64" else ["openmpi"]

        # Run OSU benchmarks in the efa-enabled queue: point-to-point first, then collective.
        for mpi_version in mpi_versions:
            pt2pt_failures = _test_osu_benchmarks_pt2pt(
                mpi_version,
                remote_command_executor,
                scheduler_commands,
                test_datadir,
                instance,
                slots_per_instance,
                partition=efa_partition,
            )
            benchmark_failures.extend(pt2pt_failures)

            collective_failures = _test_osu_benchmarks_collective(
                mpi_version,
                remote_command_executor,
                scheduler_commands,
                test_datadir,
                instance,
                num_of_instances=max_queue_size,
                slots_per_instance=slots_per_instance,
                partition=efa_partition,
            )
            benchmark_failures.extend(collective_failures)

        # Failures are collected across all runs and asserted once at the end.
        assert_that(benchmark_failures, description="Some OSU benchmarks are failing").is_empty()

    # The multiple-bandwidth benchmark runs only when more than one network interface is reported.
    if network_interfaces_count > 1:
        _test_osu_benchmarks_multiple_bandwidth(
            remote_command_executor,
            scheduler_commands,
            test_datadir,
            slots_per_instance,
            partition=efa_partition,
        )
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, partition=efa_partition)

    # NCCL benchmarks run only on p4d.24xlarge and are skipped when the OS name contains "centos".
    if instance == "p4d.24xlarge" and "centos" not in os:
        _test_nccl_benchmarks(remote_command_executor, test_datadir, "openmpi", scheduler_commands)

    assert_no_errors_in_logs(remote_command_executor, scheduler)