Esempio n. 1
0
def _test_mpi_ssh(remote_command_executor, scheduler, os, test_datadir):
    """
    Run the mpi_ssh.sh script against the single compute node, by IP and by hostname.

    The OpenMPI module name is looked up in OS_TO_OPENMPI_MODULE_MAP using ``os``.
    The script is executed twice (first with the node's IP address, then with its
    hostname) and each run must print exactly one line equal to the node's hostname.
    """
    logging.info("Testing mpi SSH")
    mpi_module = OS_TO_OPENMPI_MODULE_MAP[os]

    nodes = get_scheduler_commands(scheduler, remote_command_executor).get_compute_nodes()
    assert_that(len(nodes)).is_equal_to(1)
    remote_host = nodes[0]

    # Resolve the compute node hostname to its IP address on the remote side.
    resolve_command = "getent hosts {0} | cut -d' ' -f1".format(remote_host)
    remote_host_ip = remote_command_executor.run_remote_command(resolve_command).stdout

    script_path = str(test_datadir / "mpi_ssh.sh")
    # IP address first, then hostname.
    # The job will time out if the target is not in known_hosts.
    for target in (remote_host_ip, remote_host):
        run_result = remote_command_executor.run_remote_script(
            script_path, args=[mpi_module, target])
        output_lines = run_result.stdout.splitlines()

        # Expected output is a single line, e.g. ["ip-10-0-127-71"]
        assert_that(len(output_lines)).is_equal_to(1)
        assert_that(output_lines[-1]).is_equal_to(remote_host)
Esempio n. 2
0
def test_efa(region, scheduler, instance, os, pcluster_config_reader,
             clusters_factory, test_datadir):
    """
    Test all EFA Features.

    Grouped all tests in a single function so that cluster can be reused for all of them.
    The helpers are run in this order: _test_efa_installed, _test_mpi, _test_osu_benchmarks
    (once for "openmpi" and once for "intelmpi") and _test_shm_transfer_is_enabled. Finally
    the sqswatcher and jobwatcher logs are checked for errors.
    """
    max_queue_size = 2
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    _test_efa_installed(scheduler_commands, remote_command_executor)
    _test_mpi(remote_command_executor, slots_per_instance, scheduler, os)
    # Hand the value to logging as an argument so that string formatting is
    # deferred to the logging framework instead of being done eagerly here.
    logging.info("Running on Instances: %s",
                 get_compute_nodes_instance_ids(cluster.cfn_name, region))
    for mpi_version in ("openmpi", "intelmpi"):
        _test_osu_benchmarks(mpi_version, remote_command_executor,
                             scheduler_commands, test_datadir,
                             slots_per_instance)
    _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor)

    assert_no_errors_in_logs(remote_command_executor,
                             ["/var/log/sqswatcher", "/var/log/jobwatcher"])
Esempio n. 3
0
def test_replace_compute_on_failure(
    region, scheduler, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir
):
    """
    Test that compute nodes get replaced on userdata failures and logs get saved in shared directory.

    The failure is caused by a post_install script that exits with errors on compute nodes.
    The script is uploaded to a freshly created S3 bucket whose name is passed to the cluster
    configuration.
    """
    bucket_name = s3_bucket_factory()
    s3_resource = boto3.resource("s3", region_name=region)
    s3_resource.Bucket(bucket_name).upload_file(str(test_datadir / "post_install.sh"), "post_install.sh")

    cluster = clusters_factory(pcluster_config_reader(bucket_name=bucket_name))
    remote_command_executor = RemoteCommandExecutor(cluster)

    # A submitted job spins up a compute node, which will then fail due to the post_install script.
    get_scheduler_commands(scheduler, remote_command_executor).submit_command("sleep 1")
    failed_instance_id = wait_compute_log(remote_command_executor)[0]

    # Extract the logs and check one of them.
    _assert_compute_logs(remote_command_executor, failed_instance_id)

    # The instance waits for 10 seconds before terminating to allow logs to propagate to CloudWatch,
    # so pause before checking that it got replaced or is marked as Unhealthy.
    time.sleep(25)
    assert_instance_replaced_or_terminating(failed_instance_id, region)
Esempio n. 4
0
def _test_mpi_ssh(remote_command_executor, scheduler, test_datadir):
    """
    Run the mpi_ssh.sh script against the single compute node, by IP and by hostname.

    Every remote call uses a 10 second timeout. Each script run is expected to print
    three lines, the last of which must equal the compute node's hostname.
    """
    logging.info("Testing mpi SSH")
    mpi_module = "openmpi"

    nodes = get_scheduler_commands(scheduler, remote_command_executor).get_compute_nodes()
    assert_that(len(nodes)).is_equal_to(1)
    remote_host = nodes[0]

    # Resolve the compute node hostname to its IP address on the remote side.
    resolve_command = "getent hosts {0} | cut -d' ' -f1".format(remote_host)
    remote_host_ip = remote_command_executor.run_remote_command(
        resolve_command, timeout=10).stdout

    script_path = str(test_datadir / "mpi_ssh.sh")
    # IP address first, then hostname.
    # The job will time out if the target is not in known_hosts.
    for target in (remote_host_ip, remote_host):
        run_result = remote_command_executor.run_remote_script(
            script_path, args=[mpi_module, target], timeout=10)
        output_lines = run_result.stdout.splitlines()

        # Example output:
        # ["Warning: Permanently added '192.168.60.89' (ECDSA) to the list of known hosts.",
        #  '', 'ip-192-168-60-89']
        assert_that(len(output_lines)).is_equal_to(3)
        assert_that(output_lines[-1]).is_equal_to(remote_host)
Esempio n. 5
0
def test_ebs_snapshot(request, vpc_stacks, region, scheduler,
                      pcluster_config_reader, clusters_factory,
                      snapshots_factory):
    """
    Create a cluster with an EBS volume built from a snapshot and verify it.

    The snapshot is created through ``snapshots_factory`` using the public subnet of the
    region's VPC stack. The volume is then checked with the mount and share helpers, and
    the file ``test.txt`` on it must contain "hello world".
    """
    logging.info("Testing ebs snapshot")
    mount_dir_name = "ebs_mount_dir"
    volume_size = 10

    logging.info("Creating snapshot")
    snapshot_id = snapshots_factory.create_snapshot(
        request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)

    # Pass the id as a logging argument so formatting is deferred to the logging framework.
    logging.info("Snapshot id: %s", snapshot_id)
    cluster_config = pcluster_config_reader(mount_dir=mount_dir_name,
                                            volume_size=volume_size,
                                            snapshot_id=snapshot_id)

    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # The configuration receives the bare directory name; the checks use the absolute path.
    mount_dir = "/" + mount_dir_name
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)
    _test_ebs_correctly_mounted(remote_command_executor,
                                mount_dir,
                                volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, mount_dir,
                               scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command(
        "cat {}/test.txt".format(mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")
Esempio n. 6
0
def test_ebs_existing(request, vpc_stacks, region, scheduler,
                      pcluster_config_reader, snapshots_factory,
                      clusters_factory):
    """
    Create a cluster that mounts a pre-existing EBS volume and verify it.

    The volume is created through ``snapshots_factory`` using the public subnet of the
    region's VPC stack. After the mount, share and content checks, the cluster is deleted
    and the volume is asserted to still exist.
    """
    logging.info("Testing ebs existing")
    mount_dir_name = "existing_mount_dir"

    logging.info("Creating volume")
    volume_id = snapshots_factory.create_existing_volume(
        request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region)

    # Pass the id as a logging argument so formatting is deferred to the logging framework.
    logging.info("Existing Volume id: %s", volume_id)
    cluster_config = pcluster_config_reader(
        volume_id=volume_id, existing_mount_dir=mount_dir_name)

    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    # The configuration receives the bare directory name; the checks use the absolute path.
    existing_mount_dir = "/" + mount_dir_name
    _test_ebs_correctly_mounted(remote_command_executor,
                                existing_mount_dir,
                                volume_size="9.8")
    _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir,
                               scheduler_commands)

    # Checks for test data
    result = remote_command_executor.run_remote_command(
        "cat {}/test.txt".format(existing_mount_dir))
    assert_that(result.stdout.strip()).is_equal_to("hello world")

    # delete the cluster before detaching the EBS volume
    cluster.delete()
    # check the volume still exists after deleting the cluster
    _assert_volume_exist(volume_id, region)
Esempio n. 7
0
def assert_overscaling_when_job_submitted_during_scaledown(
    remote_command_executor, scheduler, region, stack_name, scaledown_idletime
):
    """
    Test that if a job gets submitted when a node is locked the cluster does not overscale.

    If no compute node is present, a short job is run first so that exactly one node exists.
    Once a node is locked, another job is submitted and the scaling is asserted with an
    expected maximum of 1 and an expected final value of 0.
    """
    logging.info("Testing cluster does not overscale when a job is submitted and a node is being terminated.")
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    no_compute_nodes = scheduler_commands.compute_nodes_count() == 0
    if no_compute_nodes:
        warmup_submission = scheduler_commands.submit_command("sleep 1")
        warmup_job_id = scheduler_commands.assert_job_submitted(warmup_submission.stdout)
        scheduler_commands.wait_job_completed(warmup_job_id)
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(1)

    scheduler_commands.wait_for_locked_node()

    submission = scheduler_commands.submit_command("sleep 1")
    scheduler_commands.assert_job_submitted(submission.stdout)

    # do not check scheduler scaling but only ASG.
    scaling_expectations = {
        "expected_max": 1,
        "expected_final": 0,
        "assert_scheduler": False,
    }
    assert_scaling_worked(scheduler_commands, region, stack_name, scaledown_idletime, **scaling_expectations)
Esempio n. 8
0
def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory,
                    kms_key_factory, region, os):
    """
    Create a cluster with a single EBS volume configured with a KMS key and verify it.

    The volume is checked with the mount and share helpers and the KMS encryption helper;
    the root volume encryption helper is then called with ``encrypted=True``.
    """
    mount_dir_name = "ebs_mount_dir"
    kms_key_id = kms_key_factory.create_kms_key(region)
    cluster = clusters_factory(
        pcluster_config_reader(
            mount_dir=mount_dir_name,
            ec2_iam_role=kms_key_factory.iam_role_arn,
            ebs_kms_key_id=kms_key_id,
        )
    )
    remote_command_executor = RemoteCommandExecutor(cluster)

    # The configuration receives the bare directory name; the checks use the absolute path.
    mount_dir = "/" + mount_dir_name
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    volume_ids = get_ebs_volume_ids(cluster, region)
    volume_id = volume_ids[0]

    _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size=35)
    _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)
    _test_ebs_encrypted_with_kms(volume_id, region, encrypted=True, kms_key_id=kms_key_id)

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=True)
Esempio n. 9
0
def test_hit_disable_hyperthreading(
    region, scheduler, instance, os, pcluster_config_reader, clusters_factory, default_threads_per_core
):
    """
    Test Disable Hyperthreading for HIT clusters.

    The settings helper is run twice on the same cluster: first for the "ht-enabled"
    partition with hyperthreading_disabled=False, then for the "ht-disabled" partition
    with hyperthreading_disabled=True.
    """
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster = clusters_factory(pcluster_config_reader())
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    # (hyperthreading_disabled, partition) pairs, checked in this order.
    scenarios = ((False, "ht-enabled"), (True, "ht-disabled"))
    for disabled, partition_name in scenarios:
        _test_disable_hyperthreading_settings(
            remote_command_executor,
            scheduler_commands,
            slots_per_instance,
            scheduler,
            hyperthreading_disabled=disabled,
            partition=partition_name,
            default_threads_per_core=default_threads_per_core,
        )

    assert_no_errors_in_logs(remote_command_executor, scheduler)
Esempio n. 10
0
def test_nodewatcher_terminates_failing_node(scheduler, region,
                                             pcluster_config_reader,
                                             clusters_factory, test_datadir):
    """
    Check that a compute node put into a failing state is replaced or terminated.

    After the failure, the ASG desired capacity must still be 1, the original compute
    nodes must be removed from the scheduler, and the sqswatcher and jobwatcher logs
    must contain no errors.
    """
    cluster = clusters_factory(pcluster_config_reader())
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    initial_nodes = scheduler_commands.get_compute_nodes()

    # Submit the scheduler-specific kill script as a job
    # (presumably it stops the scheduler daemon so that the node enters a failing state).
    kill_script = str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))
    scheduler_commands.submit_script(kill_script)
    instance_id = wait_compute_log(remote_command_executor)

    _assert_compute_logs(remote_command_executor, instance_id)
    assert_instance_replaced_or_terminating(instance_id, region)

    # verify that desired capacity is still 1
    desired_capacity = get_desired_asg_capacity(region, cluster.cfn_name)
    assert_that(desired_capacity).is_equal_to(1)
    _assert_nodes_removed_from_scheduler(scheduler_commands, initial_nodes)

    watcher_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(remote_command_executor, watcher_logs)
Esempio n. 11
0
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Submit the cluster-check.sh jobs and verify job completion and auto-scaling.

    ASG capacity and compute node counts are monitored while the jobs run; both series
    are then checked against the expected (0, 3) values.
    """
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster = clusters_factory(pcluster_config_reader(scaledown_idletime=scaledown_idletime))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    submit_script = test_datadir / "cluster-check.sh"
    remote_command_executor.run_remote_script(submit_script, args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    # Job execution time, plus the scale-down idle time, plus 5 extra minutes.
    monitoring_window = minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5)
    asg_capacity_series, compute_nodes_series, _timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=monitoring_window,
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    max_execution_seconds = max_jobs_execution_time * 60
    _assert_test_jobs_completed(remote_command_executor, max_execution_seconds)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_series,
        compute_nodes_time_series=compute_nodes_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
def test_multiple_nics(scheduler, region, pcluster_config_reader, clusters_factory):
    """Create a cluster and run the head node and compute node NIC helpers on it."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    _test_head_node_nics(executor, region)
    _test_compute_node_nics(cluster, region, executor, commands)
def test_spot_default(scheduler, pcluster_config_reader, clusters_factory):
    """
    Test that a cluster with spot instances can be created with default spot_price_value.

    The created cluster must report exactly one compute node.
    """
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    node_count = get_scheduler_commands(scheduler, executor).compute_nodes_count()
    assert_that(node_count).is_equal_to(1)
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
    """
    Test Intel Cluster Checker.

    Runs the _test_intel_clck helper on a new cluster, then checks the logs for errors.
    """
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)
    _test_intel_clck(executor, commands, test_datadir, os)

    assert_no_errors_in_logs(executor, scheduler)
Esempio n. 15
0
def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory):
    """Check that /shared is mounted with volume_size=20 and shared, using the EBS helpers."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)

    shared_dir = "/shared"
    commands = get_scheduler_commands(scheduler, executor)
    _test_ebs_correctly_mounted(executor, shared_dir, volume_size=20)
    _test_ebs_correctly_shared(executor, shared_dir, commands)
def test_scaling_performance(region, scheduler, os, instance,
                             pcluster_config_reader, clusters_factory,
                             request):
    """
    The test runs benchmarks for the scaling logic.

    A single sleep job requesting ``benchmarks_target_capacity`` nodes is submitted, compute
    node counts are published while monitoring for at most ``benchmarks_max_time`` minutes,
    and a metrics report is produced. The node count must peak at the scaling target and
    end at 0.
    """
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster = clusters_factory(
        pcluster_config_reader(
            scaledown_idletime=benchmark_params["scaledown_idletime"],
            scaling_target=benchmark_params["scaling_target"],
        )
    )
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    sleep_command = "sleep {0}".format(benchmark_params["job_duration"])
    submission = scheduler_commands.submit_command(sleep_command, nodes=benchmark_params["scaling_target"])
    scheduler_commands.assert_job_submitted(submission.stdout)
    node_counts, _timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    elapsed_seconds = int((end_time - start_time).total_seconds())
    benchmark_params["total_time"] = "{0}seconds".format(elapsed_seconds)

    # The report receives both timestamps as ISO strings tagged as UTC.
    utc = datetime.timezone.utc
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_time.replace(tzinfo=utc).isoformat(),
        end_time.replace(tzinfo=utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )

    assert_that(max(node_counts)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(node_counts[-1]).is_equal_to(0)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Esempio n. 17
0
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory,
                      region, os):
    """
    Create a cluster with five EBS volumes and verify mounts, types, encryption and IOPS.

    Each volume is compared against the "ebs<N>" settings read from the cluster
    configuration. The root volume helpers are run at the end.
    """
    mount_dirs = ["/ebs_mount_dir_{0}".format(index) for index in range(5)]
    # The first three sizes are 15, 20 and 25. The last two are 500 because
    # for volume type sc1 and st1, the minimum volume sizes are 500G.
    volume_sizes = [15 + 5 * index for index in range(3)] + [500, 500]

    cluster = clusters_factory(
        pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        # For a volume size of 500G, the filesystem size is only about 492G. File systems use
        # some of the space available on a device for storing internal structures and data
        # (the file system's metadata); the overhead of the XFS filesystem is around 0.5%.
        # With a small volume size (eg: 40G) the number is not large enough to show the gap
        # between the partition size and the filesystem size, so only the 500G volumes are
        # matched with a pattern.
        expected_size = "49[0-9]" if volume_size == 500 else volume_size
        _test_ebs_correctly_mounted(remote_command_executor, mount_dir, expected_size)
        _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands)

    volume_ids = get_ebs_volume_ids(cluster, region)
    # Settings sections are named ebs1, ebs2, ... in volume order.
    for section_number, volume_id in enumerate(volume_ids, start=1):
        ebs_settings = _get_ebs_settings_by_name(cluster.config, f"ebs{section_number}")

        # test different volume types
        volume_type = ebs_settings["VolumeType"]
        volume = describe_volume(volume_id, region)
        assert_that(volume[0]).is_equal_to(volume_type)

        # Default encryption if not specified
        configured_encryption = ebs_settings.get("Encrypted")
        encrypted = True if configured_encryption is None else configured_encryption
        _test_ebs_encrypted_with_kms(
            volume_id, region, encrypted=encrypted, kms_key_id=ebs_settings.get("KmsKeyId"))

        # test different iops
        # only io1, io2, gp3 can configure iops
        if volume_type in ("io1", "io2", "gp3"):
            expected_iops = int(ebs_settings["Iops"])
            assert_that(volume[1]).is_equal_to(expected_iops)

    _test_root_volume_encryption(cluster, os, region, scheduler, encrypted=False)
    _assert_root_volume_configuration(cluster, os, region, scheduler)
Esempio n. 18
0
def test_raid_fault_tolerance_mode(scheduler, pcluster_config_reader, clusters_factory):
    """Check the RAID helpers for raid_type "1" with two devices mounted on /raid_dir."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    raid_dir = "/raid_dir"
    _test_raid_correctly_configured(executor, raid_type="1", volume_size=20, raid_devices=2)
    _test_raid_correctly_mounted(executor, raid_dir, volume_size=20)
    _test_raid_correctly_shared(executor, raid_dir, commands)
Esempio n. 19
0
def test_raid_fault_tolerance_mode(scheduler, pcluster_config_reader, clusters_factory):
    """Check the RAID helpers for raid_type "1" with two devices mounted on /raid_dir."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    raid_dir = "/raid_dir"
    _test_raid_correctly_configured(executor, raid_type="1", volume_size=20, raid_devices=2)
    _test_raid_correctly_mounted(executor, raid_dir, volume_size=20)
    _test_raid_correctly_shared(executor, raid_dir, commands)
 def _submit_one_slot_job():
     """
     Submit a one-slot job that sleeps and then creates a marker file under /shared/job-results.

     The scheduler commands object is created on first use and stored on ``local_data``
     (presumably thread-local storage; confirm in the enclosing scope). The job is
     submitted with ``after_ok=job_id``, where ``job_id`` comes from the enclosing scope.
     """
     if not hasattr(local_data, "scheduler_commands"):
         scheduler_name = benchmark_params["scheduler"]
         executor = RemoteCommandExecutor(cluster)
         local_data.scheduler_commands = get_scheduler_commands(scheduler_name, executor)

     commands = local_data.scheduler_commands
     command_template = "sleep {0}; mkdir -p /shared/job-results; mktemp /shared/job-results/job.XXXXXXXX"
     job_command = command_template.format(benchmark_params["job_duration"])
     commands.submit_command(job_command, slots=1, after_ok=job_id)
Esempio n. 21
0
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory):
    """Create a cluster with five EBS volumes (15 to 35 in steps of 5) and check each mount."""
    mount_dirs = ["/ebs_mount_dir_{0}".format(index) for index in range(5)]
    volume_sizes = [15 + 5 * index for index in range(5)]

    cluster = clusters_factory(pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes))
    executor = RemoteCommandExecutor(cluster)
    commands = get_scheduler_commands(scheduler, executor)

    for directory, size in zip(mount_dirs, volume_sizes):
        _test_ebs_correctly_mounted(executor, directory, size)
        _test_ebs_correctly_shared(executor, directory, commands)
Esempio n. 22
0
 def __init__(self, scheduler, os, cluster, feature_key=None, shared_dir=DEFAULT_SHARED_DIR):
     """
     Get the state of the cluster as it pertains to the CloudWatch logging feature.

     :param scheduler: scheduler identifier, stored and passed to get_scheduler_commands
     :param os: base OS identifier, converted to a platform via _base_os_to_platform
     :param cluster: cluster object used to build the RemoteCommandExecutor
     :param feature_key: optional value stored as-is on the instance
     :param shared_dir: shared directory, resolved through _get_shared_dir
     """
     self.scheduler = scheduler
     self.platform = self._base_os_to_platform(os)
     self.cluster = cluster
     self.feature_key = feature_key
     self.shared_dir = self._get_shared_dir(shared_dir)
     # Remote access objects are built once here and reused by the instance.
     self.remote_command_executor = RemoteCommandExecutor(self.cluster)
     self.scheduler_commands = get_scheduler_commands(self.scheduler, self.remote_command_executor)
     # Both containers start empty, keyed by node role.
     self._relevant_logs = {HEAD_NODE_ROLE_NAME: [], COMPUTE_NODE_ROLE_NAME: []}
     self._cluster_log_state = {HEAD_NODE_ROLE_NAME: {}, COMPUTE_NODE_ROLE_NAME: {}}
     # Called last, after all the attributes above have been set.
     self._set_cluster_log_state()
Esempio n. 23
0
 def reset_stateful_connection_objects(self, default_user_remote_command_executor):
     """
     Reset objects that might maintain an open SSH connection.

     The three existing attributes are deleted first and then recreated: the default-user
     executor is replaced by the one passed in, while the personalized executor and
     scheduler commands are rebuilt for this user's alias and SSH key.

     :param default_user_remote_command_executor: executor to store as the new default-user executor
     """
     # NOTE: `del` raises AttributeError if any of these attributes is not currently set.
     del self._default_user_remote_command_executor
     del self._personalized_remote_command_executor
     del self._personalized_scheduler_commands
     self._default_user_remote_command_executor = default_user_remote_command_executor
     self._personalized_remote_command_executor = RemoteCommandExecutor(
         self.cluster, username=self.alias, alternate_ssh_key=self.ssh_private_key_path
     )
     # The scheduler commands are bound to the freshly created personalized executor.
     self._personalized_scheduler_commands = get_scheduler_commands(
         self.scheduler, self._personalized_remote_command_executor
     )
Esempio n. 24
0
def test_nodewatcher_terminates_failing_node(scheduler, region,
                                             pcluster_config_reader,
                                             clusters_factory, test_datadir):
    """
    Check that nodes put into a failing state are terminated and replaced.

    A job is submitted on all nodes, the scheduler-specific kill script is run for the
    first node, and then the ASG state, the scheduler configuration and the logs are
    checked. AWS_DEFAULT_REGION is set in the process environment as a side effect.
    """
    # slurm tests use more nodes because of an internal request to test in multi-node settings
    initial_queue_size = 1
    maintain_initial_size = "true"
    environ["AWS_DEFAULT_REGION"] = region
    cluster = clusters_factory(
        pcluster_config_reader(
            initial_queue_size=initial_queue_size,
            maintain_initial_size=maintain_initial_size,
        )
    )
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    compute_nodes = scheduler_commands.get_compute_nodes()
    instance_ids = get_compute_nodes_instance_ids(cluster.cfn_name, region)
    hostname_to_instance_id = get_instance_ids_compute_hostnames_conversion_dict(
        instance_ids, id_to_hostname=False)

    logging.info("Testing that nodewatcher will terminate a node in failing state")
    # submit a job to run on all nodes
    scheduler_commands.submit_command("sleep infinity", nodes=initial_queue_size)

    # simulate unexpected hardware failure by killing first x nodes
    expected_num_nodes_killed = 1
    nodes_to_remove = compute_nodes[:expected_num_nodes_killed]
    kill_script = str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))
    for failing_node in nodes_to_remove:
        remote_command_executor.run_remote_script(kill_script, args=[failing_node])

    # assert failing nodes are terminated according to ASG
    _assert_failing_nodes_terminated(nodes_to_remove, hostname_to_instance_id, region)
    nodes_to_retain = [node for node in compute_nodes if node not in nodes_to_remove]

    # verify that desired capacity is still the initial_queue_size
    desired_capacity = get_desired_asg_capacity(region, cluster.cfn_name)
    assert_that(desired_capacity).is_equal_to(initial_queue_size)

    # assert failing nodes are removed from scheduler config
    _assert_nodes_removed_and_replaced_in_scheduler(
        scheduler_commands,
        nodes_to_remove,
        nodes_to_retain,
        desired_capacity=initial_queue_size,
    )

    assert_no_errors_in_logs(remote_command_executor, scheduler)
    test_maintain_initial_size(cluster.cfn_name, region, maintain_initial_size, initial_queue_size)
Esempio n. 25
0
def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory):
    """Check that /shared is mounted with volume_size=20 and shared, using the EBS helpers."""
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)

    shared_dir = "/shared"
    commands = get_scheduler_commands(scheduler, executor)
    _test_ebs_correctly_mounted(executor, shared_dir, volume_size=20)
    _test_ebs_correctly_shared(executor, shared_dir, commands)
def test_slurm_scaling(scheduler, region, instance, pcluster_config_reader,
                       clusters_factory, test_datadir):
    """
    Test that slurm-specific scaling logic is behaving as expected for normal actions and failures.

    After checking the initial conditions, four helpers are run in order: partition
    states, reset of terminated nodes, replacement of down nodes and handling of
    suspended nodes. All of them use 2 static nodes and 3 dynamic nodes of type
    ``instance``.
    """
    cluster = clusters_factory(pcluster_config_reader(scaledown_idletime=3))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    _assert_cluster_initial_conditions(scheduler_commands, instance, 20, 20, 4, 1)

    # Node layout shared by every helper below.
    node_layout = {
        "num_static_nodes": 2,
        "num_dynamic_nodes": 3,
        "dynamic_instance_type": instance,
    }
    # The last three helpers all target the same partition.
    single_partition = dict(node_layout, partition="ondemand1")

    _test_partition_states(
        scheduler_commands,
        cluster.cfn_name,
        region,
        active_partition="ondemand1",
        inactive_partition="ondemand2",
        **node_layout,
    )
    _test_reset_terminated_nodes(scheduler_commands, cluster.cfn_name, region, **single_partition)
    _test_replace_down_nodes(
        remote_command_executor,
        scheduler_commands,
        test_datadir,
        cluster.cfn_name,
        region,
        **single_partition,
    )
    _test_keep_or_replace_suspended_nodes(scheduler_commands, cluster.cfn_name, region, **single_partition)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Esempio n. 27
0
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader,
                                  clusters_factory, test_datadir):
    """
    Submit several jobs at once and verify execution time and auto-scaling.

    A short warm-up job is run first, then the ``cluster-check.sh`` script
    submits the test jobs while EC2 capacity and compute nodes are monitored.
    """
    # Upper bound (in minutes) for the test jobs to finish. Staying within it
    # guarantees that the jobs were executed in parallel.
    max_jobs_execution_time = 9
    scaledown_idletime = 4

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    submission = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(submission.stdout)
    # Poll every 30 seconds, for at most 500 seconds, until the job completes.
    wait_for_job_state = retry(wait_fixed=seconds(30),
                               stop_max_delay=seconds(500))(_assert_job_state)
    wait_for_job_state(scheduler_commands, job_id, job_state="COMPLETED")

    logging.info("Executing test jobs on cluster")
    check_script = test_datadir / "cluster-check.sh"
    remote_command_executor.run_remote_script(check_script,
                                              args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    # Watch for the job run time, plus the scale-down idle time, plus 5 more minutes.
    monitoring_window = (minutes(max_jobs_execution_time) +
                         minutes(scaledown_idletime) + minutes(5))
    ec2_capacity_time_series, compute_nodes_time_series, _timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=monitoring_window,
    )

    logging.info(
        "Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor,
                                max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Esempio n. 28
0
def test_disable_hyperthreading(region, scheduler, instance, os,
                                pcluster_config_reader, clusters_factory):
    """
    Test Disable Hyperthreading.

    Creates a cluster, runs the hyperthreading check against it, and then
    checks the sqswatcher and jobwatcher logs for errors.
    """
    slots_per_instance = fetch_instance_slots(region, instance)

    cluster = clusters_factory(pcluster_config_reader())
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    _test_disable_hyperthreading(remote_command_executor, scheduler_commands,
                                 slots_per_instance, scheduler)

    watcher_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(remote_command_executor, watcher_logs)
Esempio n. 29
0
def _test_mpi(
    remote_command_executor,
    slots_per_instance,
    scheduler,
    os,
    region=None,
    stack_name=None,
    scaledown_idletime=None,
    verify_scaling=False,
):
    """
    Compile and run the MPI hello-world job, then validate its output.

    The job is submitted asking for ``2 * slots_per_instance`` slots. When
    ``verify_scaling`` is true the scaling behaviour is asserted (using
    ``region``, ``stack_name`` and ``scaledown_idletime``); otherwise the
    function waits for the job and asserts that it succeeded. In both cases
    ``/shared/mpi.out`` must contain exactly two lines, one per rank.
    """
    logging.info("Testing mpi job")
    mpi_module = OS_TO_OPENMPI_MODULE_MAP[os]
    datadir = pathlib.Path(__file__).parent / "data/mpi/"

    # Compile the mpi program, loading the OpenMPI module first when one exists.
    mpicc_command = "mpicc -o mpi_hello_world mpi_hello_world.c"
    compile_command = (mpicc_command
                       if mpi_module == "no_module_available" else
                       "module load {0} && {1}".format(mpi_module,
                                                       mpicc_command))
    source_file = str(datadir / "mpi_hello_world.c")
    remote_command_executor.run_remote_command(compile_command,
                                               additional_files=[source_file])

    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    # Submit the module-specific job script.
    submit_script = datadir / "mpi_submit_{0}.sh".format(mpi_module)
    submission = scheduler_commands.submit_script(str(submit_script),
                                                  slots=2 * slots_per_instance)
    job_id = scheduler_commands.assert_job_submitted(submission.stdout)

    if not verify_scaling:
        scheduler_commands.wait_job_completed(job_id)
        scheduler_commands.assert_job_succeeded(job_id)
    else:
        # assert_job_succeeded is skipped here: after the cluster has scaled
        # down the scheduler history might be gone.
        assert_scaling_worked(scheduler_commands,
                              region,
                              stack_name,
                              scaledown_idletime,
                              expected_max=2,
                              expected_final=0)

    mpi_out = remote_command_executor.run_remote_command(
        "cat /shared/mpi.out").stdout
    assert_that(mpi_out.splitlines()).is_length(2)
    expected_patterns = (
        r"Hello world from processor ip-.+, rank 0 out of 2 processors",
        r"Hello world from processor ip-.+, rank 1 out of 2 processors",
    )
    for pattern in expected_patterns:
        assert_that(mpi_out).matches(pattern)

    watcher_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(remote_command_executor, watcher_logs)
Esempio n. 30
0
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader,
                   clusters_factory, test_datadir):
    """
    Test Intel Cluster Checker.

    Creates a cluster, runs the Intel clck check against it, and then checks
    the sqswatcher and jobwatcher logs for errors.
    """
    slots_per_instance = fetch_instance_slots(region, instance)

    cluster = clusters_factory(pcluster_config_reader())
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    _test_intel_clck(remote_command_executor, scheduler_commands,
                     slots_per_instance, test_datadir)

    watcher_logs = ["/var/log/sqswatcher", "/var/log/jobwatcher"]
    assert_no_errors_in_logs(remote_command_executor, watcher_logs)
Esempio n. 31
0
def test_scheduler_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """
    Run a stress test to verify scheduler behaviour with many submitted jobs.

    The number of jobs is ``2 * instance vcpus * target capacity``, where the
    target capacity and the maximum monitoring time are read from the
    ``benchmarks_target_capacity`` and ``benchmarks_max_time`` pytest options.
    After the run a metrics report is produced and the peak/final node counts,
    job completion and logs are asserted.
    """
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")
    instance_slots = get_instance_vcpus(region, instance)
    scaling_target = request.config.getoption("benchmarks_target_capacity")
    scaledown_idletime = 2

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": scaling_target,
        "scaledown_idletime": scaledown_idletime,
        "job_duration": 60,
        "jobs_to_submit": 2 * instance_slots * scaling_target,
    }

    cluster = clusters_factory(
        pcluster_config_reader(scaledown_idletime=scaledown_idletime, scaling_target=scaling_target)
    )
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    enable_asg_metrics(region, cluster)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    _submit_jobs(benchmark_params, scheduler_commands, instance_slots, cluster)
    compute_nodes_time_series, _timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    elapsed_seconds = int((end_time - start_time).total_seconds())
    benchmark_params["total_time"] = "{0}seconds".format(elapsed_seconds)
    # The report takes ISO-8601 strings; the timestamps are tagged as UTC before formatting.
    start_iso, end_iso = (
        moment.replace(tzinfo=datetime.timezone.utc).isoformat() for moment in (start_time, end_time)
    )
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        cluster.asg,
        start_iso,
        end_iso,
        benchmark_params["scaling_target"],
        request,
    )

    # The cluster must have peaked at the target capacity and scaled back down to zero.
    peak_nodes = max(compute_nodes_time_series)
    final_nodes = compute_nodes_time_series[-1]
    assert_that(peak_nodes).is_equal_to(benchmark_params["scaling_target"])
    assert_that(final_nodes).is_equal_to(0)
    _assert_jobs_completed(remote_command_executor, benchmark_params["jobs_to_submit"])
    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
def test_sit_cli_commands(scheduler, region, pcluster_config_reader,
                          clusters_factory):
    """
    Test pcluster cli commands are working.

    Runs the instances/status check and the stop/start check (expecting one
    node) against a cluster created with ``scaledown_idletime=60``.
    """
    cluster = clusters_factory(pcluster_config_reader(scaledown_idletime=60))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    _test_pcluster_instances_and_status(cluster, region)
    _test_pcluster_stop_and_start(scheduler_commands,
                                  cluster,
                                  region,
                                  expected_num_nodes=1)

    assert_no_errors_in_logs(remote_command_executor, scheduler)
Esempio n. 33
0
def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory):
    """Check that each of five EBS volumes is correctly mounted and shared."""
    volume_count = 5
    mount_dirs = [
        "/ebs_mount_dir_{0}".format(index) for index in range(volume_count)
    ]
    # Sizes start at 15 and grow by 5 per volume: 15, 20, 25, 30, 35.
    volume_sizes = [15 + 5 * index for index in range(volume_count)]

    cluster = clusters_factory(
        pcluster_config_reader(mount_dirs=mount_dirs,
                               volume_sizes=volume_sizes))
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    # Validate each volume against its own mount point and expected size.
    for mount_dir, volume_size in zip(mount_dirs, volume_sizes):
        _test_ebs_correctly_mounted(remote_command_executor, mount_dir,
                                    volume_size)
        _test_ebs_correctly_shared(remote_command_executor, mount_dir,
                                   scheduler_commands)