Ejemplo n.º 1
0
def test_efa(region, scheduler, instance, os, pcluster_config_reader,
             clusters_factory, test_datadir):
    """
    Exercise the EFA checks against one shared cluster.

    All checks live in this single test so that the cluster is created
    once and reused by each of them.
    """
    queue_limit = 2
    slots = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(max_queue_size=queue_limit)
    cluster = clusters_factory(config)
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_efa_installed(commands, executor)
    _test_mpi(executor, slots, scheduler, os)

    # Record which compute instances the checks are running on.
    instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(instance_ids))

    # Same OSU benchmark run for each MPI flavour, in this order.
    for mpi_flavour in ("openmpi", "intelmpi"):
        _test_osu_benchmarks(mpi_flavour, executor, commands, test_datadir,
                             slots)
    _test_shm_transfer_is_enabled(commands, executor)

    watcher_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(executor, watcher_logs)
Ejemplo n.º 2
0
def test_mpi(scheduler, region, instance, pcluster_config_reader,
             clusters_factory):
    """
    Run the MPI check twice on one cluster.

    The first pass is made with ``verify_scaling=False`` and the second
    with ``verify_scaling=True``; both reuse the same cluster.
    """
    idletime = 3
    queue_limit = 3
    slots = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(scaledown_idletime=idletime,
                                    max_queue_size=queue_limit)
    cluster = clusters_factory(config)
    executor = RemoteCommandExecutor(cluster)

    # False: verifies that the job completes correctly.
    # True: verifies that scaling worked.
    for check_scaling in (False, True):
        _test_mpi(
            executor,
            slots,
            scheduler,
            region,
            cluster.cfn_name,
            idletime,
            verify_scaling=check_scaling,
        )
Ejemplo n.º 3
0
def test_sit_efa(
    region,
    scheduler,
    instance,
    pcluster_config_reader,
    clusters_factory,
    test_datadir,
    architecture,
    network_interfaces_count,
):
    """
    Exercise the EFA checks against one shared cluster.

    All checks live in this single test so that the cluster is created
    once and reused by each of them.
    """
    queue_limit = 2
    slots = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(max_queue_size=queue_limit)
    cluster = clusters_factory(config)
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_efa_installation(commands, executor, efa_installed=True)
    _test_mpi(executor, slots, scheduler)

    # Record which compute instances the checks are running on.
    instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(instance_ids))

    # The openmpi latency benchmark always runs; intelmpi only on x86_64.
    mpi_flavours = ["openmpi"]
    if architecture == "x86_64":
        mpi_flavours.append("intelmpi")
    for mpi_flavour in mpi_flavours:
        _test_osu_benchmarks_latency(mpi_flavour, executor, commands,
                                     test_datadir, slots)

    _test_shm_transfer_is_enabled(commands, executor)

    # The multiple-bandwidth benchmark needs more than one interface.
    if network_interfaces_count > 1:
        _test_osu_benchmarks_multiple_bandwidth(executor, commands,
                                                test_datadir, slots)

    assert_no_errors_in_logs(executor, scheduler)
Ejemplo n.º 4
0
def test_existing_hosted_zone(
    hosted_zone_factory,
    pcluster_config_reader,
    clusters_factory,
    vpc_stack,
    cfn_stacks_factory,
    key_name,
    scheduler,
    region,
    instance,
):
    """
    Test a cluster whose config file provides hosted_zone_id.

    Runs an MPI job, compares compute hostnames with node names, and checks
    that /etc/resolv.conf contains "<lower-cased stack name>.<domain name>".
    """
    compute_count = 2
    hosted_zone_id, domain_name = hosted_zone_factory()
    config = pcluster_config_reader(existing_hosted_zone=hosted_zone_id,
                                    queue_size=compute_count)
    cluster = clusters_factory(config, upper_case_cluster_name=True)
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    # Run an MPI job (scaling is not verified here).
    slots = fetch_instance_slots(region, instance)
    _test_mpi(
        executor,
        slots_per_instance=slots,
        scheduler=scheduler,
        region=region,
        stack_name=cluster.cfn_name,
        scaledown_idletime=3,
        verify_scaling=False,
    )

    # Compute hostname must be the same as the node name.
    nodes = commands.get_compute_nodes()
    _test_hostname_same_as_nodename(commands, executor, nodes)

    # The resolver configuration must mention the expected domain name.
    resolv_conf = executor.run_remote_command("cat /etc/resolv.conf").stdout
    expected_domain = cluster.cfn_name.lower() + "." + domain_name
    assert_that(resolv_conf).contains(expected_domain)
0
def test_hit_no_cluster_dns_mpi(scheduler, region, instance,
                                pcluster_config_reader, clusters_factory,
                                test_datadir):
    """
    Test MPI on a HIT cluster with cluster DNS disabled.

    Asserts that pinging the first compute node by name fails, that compute
    hostnames match node names, and then runs the MPI check without
    scaling verification.
    """
    logging.info("Testing HIT cluster with cluster DNS disabled.")
    idletime = 3
    queue_max = 3
    queue_min = 1
    slots = fetch_instance_slots(region, instance)
    config = pcluster_config_reader(
        scaledown_idletime=idletime,
        max_queue_size=queue_max,
        min_queue_size=queue_min,
    )
    cluster = clusters_factory(config)
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    # The compute hostname must not be reachable by a direct ping.
    nodes = commands.get_compute_nodes()
    ping_command = "ping -c 3 {}".format(nodes[0])
    ping_result = executor.run_remote_command(ping_command,
                                              raise_on_error=False)
    assert_that(ping_result.failed).is_true()

    # Compute hostname must be the same as the node name.
    _test_hostname_same_as_nodename(commands, executor, nodes)

    # Verify that the job completes correctly.
    _test_mpi(
        executor,
        slots,
        scheduler,
        region,
        cluster.cfn_name,
        idletime,
        verify_scaling=False,
    )
Ejemplo n.º 6
0
def test_efa(
    os,
    region,
    scheduler,
    instance,
    pcluster_config_reader,
    clusters_factory,
    test_datadir,
    architecture,
    network_interfaces_count,
):
    """
    Exercise the EFA checks against one shared cluster.

    All checks live in this single test so that the cluster is created once and reused by each of them.
    Compute-side checks are submitted to the "efa-enabled" partition.
    """
    # OSU benchmark results were collected for c5n.18xlarge only.
    osu_benchmarks_instances = ["c5n.18xlarge"]
    runs_osu_benchmarks = instance in osu_benchmarks_instances

    # Collective OSU benchmarks need 4 instances to show performance differences;
    # the other EFA tests only need 2.
    queue_limit = 4 if runs_osu_benchmarks else 2
    slots = fetch_instance_slots(region, instance)

    if architecture == "x86_64":
        head_node_type = "c5n.18xlarge"
    else:
        head_node_type = "c6gn.16xlarge"

    config = pcluster_config_reader(max_queue_size=queue_limit, head_node_instance=head_node_type)
    cluster = clusters_factory(config)
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_efa_installation(commands, executor, efa_installed=True, partition="efa-enabled")
    _test_mpi(executor, slots, scheduler, partition="efa-enabled")

    # Record which compute instances the checks are running on.
    instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    logging.info("Running on Instances: {0}".format(instance_ids))

    if runs_osu_benchmarks:
        # intelmpi is benchmarked on x86_64 only.
        mpi_versions = ["openmpi", "intelmpi"] if architecture == "x86_64" else ["openmpi"]

        # Run OSU benchmarks in the efa-enabled queue, collecting every reported failure
        # so that all benchmarks run before the assertion below.
        benchmark_failures = []
        for mpi_version in mpi_versions:
            pt2pt_failures = _test_osu_benchmarks_pt2pt(
                mpi_version,
                executor,
                commands,
                test_datadir,
                instance,
                slots,
                partition="efa-enabled",
            )
            benchmark_failures.extend(pt2pt_failures)

            collective_failures = _test_osu_benchmarks_collective(
                mpi_version,
                executor,
                commands,
                test_datadir,
                instance,
                num_of_instances=queue_limit,
                slots_per_instance=slots,
                partition="efa-enabled",
            )
            benchmark_failures.extend(collective_failures)

        assert_that(benchmark_failures, description="Some OSU benchmarks are failing").is_empty()

    # The multiple-bandwidth benchmark needs more than one network interface.
    if network_interfaces_count > 1:
        _test_osu_benchmarks_multiple_bandwidth(executor, commands, test_datadir, slots, partition="efa-enabled")
    _test_shm_transfer_is_enabled(commands, executor, partition="efa-enabled")

    # NCCL benchmarks run only on p4d.24xlarge, and not on centos.
    if instance == "p4d.24xlarge" and "centos" not in os:
        _test_nccl_benchmarks(executor, test_datadir, "openmpi", commands)

    assert_no_errors_in_logs(executor, scheduler)