Example #1
def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime):
    logging.info("Testing cluster doesn't scale when job dependencies are not satisfied")
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions"
    )
    assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time)
        + minutes(scaledown_idletime)
        + minutes(estimated_scaleup_time)
        + minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir / "cluster-check.sh", args=["submit", scheduler])

    logging.info("Monitoring asg capacity and compute nodes")
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) + minutes(scaledown_idletime) + minutes(5),
    )

    logging.info("Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor, max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        asg_capacity_time_series=asg_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_asg_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )
Example #3
def assert_scaling_worked(
    scheduler_commands,
    region,
    stack_name,
    scaledown_idletime,
    expected_max,
    expected_final,
    assert_asg=True,
    assert_scheduler=True,
):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(max_scaledown_time),
    )

    with soft_assertions():
        if assert_asg:
            asg_capacity_time_series_str = f"asg_capacity_time_series={asg_capacity_time_series}"
            assert_that(max(asg_capacity_time_series)).described_as(
                asg_capacity_time_series_str).is_equal_to(expected_max)
            assert_that(asg_capacity_time_series[-1]).described_as(
                asg_capacity_time_series_str).is_equal_to(expected_final)
        if assert_scheduler:
            compute_nodes_time_series_str = f"compute_nodes_time_series={compute_nodes_time_series}"
            assert_that(max(compute_nodes_time_series)).described_as(
                compute_nodes_time_series_str).is_equal_to(expected_max)
            assert_that(compute_nodes_time_series[-1]).described_as(
                compute_nodes_time_series_str).is_equal_to(expected_final)
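
A minimal usage sketch (not taken from any of the examples above): a test can submit one short job and delegate the scale-up/scale-down checks to this helper. The names slurm_commands, region, stack_name and scaledown_idletime are assumed to come from the surrounding test fixtures.

# Hypothetical usage of assert_scaling_worked; fixture-provided names are assumed.
result = slurm_commands.submit_command("sleep 60", nodes=1)
slurm_commands.assert_job_submitted(result.stdout)
assert_scaling_worked(
    scheduler_commands=slurm_commands,
    region=region,
    stack_name=stack_name,
    scaledown_idletime=scaledown_idletime,
    expected_max=1,
    expected_final=0,
)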
Example #4
def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader,
                                  clusters_factory, test_datadir):
    scaledown_idletime = 4
    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_execution_time = 9

    cluster_config = pcluster_config_reader(
        scaledown_idletime=scaledown_idletime)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    logging.info("Executing sleep job to start a dynamic node")
    result = scheduler_commands.submit_command("sleep 1")
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    retry(wait_fixed=seconds(30),
          stop_max_delay=seconds(500))(_assert_job_state)(
              scheduler_commands, job_id, job_state="COMPLETED")

    logging.info("Executing test jobs on cluster")
    remote_command_executor.run_remote_script(test_datadir /
                                              "cluster-check.sh",
                                              args=["submit", scheduler])

    logging.info("Monitoring ec2 capacity and compute nodes")
    ec2_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=cluster.cfn_name,
        max_monitoring_time=minutes(max_jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(5),
    )

    logging.info(
        "Verifying test jobs completed successfully and in the expected time")
    _assert_test_jobs_completed(remote_command_executor,
                                max_jobs_execution_time * 60)

    logging.info("Verifying auto-scaling worked correctly")
    _assert_scaling_works(
        ec2_capacity_time_series=ec2_capacity_time_series,
        compute_nodes_time_series=compute_nodes_time_series,
        expected_ec2_capacity=(0, 3),
        expected_compute_nodes=(0, 3),
    )

    logging.info("Verifying no error in logs")
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Example #5
def _test_jobs_executed_concurrently(remote_command_executor, max_slots):
    logging.info("Testing jobs are executed concurrently and nodes are fully allocated")
    torque_commands = TorqueCommands(remote_command_executor)

    # GIVEN: a cluster with 3 free nodes
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(3)

    # WHEN: an array job that requires 3 nodes and all slots is submitted
    jobs_start_time = int(remote_command_executor.run_remote_command("date +%s").stdout)
    job_exec_time = 30
    job_ids = []
    for i in range(0, 3 * max_slots):
        result = torque_commands.submit_command(
            f"sleep {job_exec_time} && hostname > /shared/job{i} && date +%s >> /shared/end_time", nodes=1, slots=1
        )
        job_id = torque_commands.assert_job_submitted(result.stdout)
        job_ids.append(job_id)

    # THEN: cluster scales down correctly after completion
    watch_compute_nodes(torque_commands, minutes(10), 0)
    for job_id in job_ids:
        _assert_job_completed(remote_command_executor, job_id)

    # THEN: each host executes 4 jobs in the expected time
    jobs_to_hosts_count = (
        remote_command_executor.run_remote_command("cat /shared/job* | sort | uniq -c | awk '{print $1}'")
        .stdout.strip()
        .splitlines()
    )
    assert_that(jobs_to_hosts_count).is_equal_to(["4", "4", "4"])
    # verify execution time
    jobs_completion_time = int(
        remote_command_executor.run_remote_command("cat /shared/end_time | sort -n | tail -1").stdout.split()[-1]
    )
    assert_that(jobs_completion_time - jobs_start_time).is_greater_than(0).is_less_than(2 * job_exec_time)
Example #6
def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_slots, region, asg_name):
    logging.info("Testing cluster limits are dynamically updated")
    torque_commands = TorqueCommands(remote_command_executor)

    # Make sure cluster is scaled to 0 when this test starts
    assert_that(torque_commands.compute_nodes_count()).is_equal_to(0)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)

    # Submit a job to scale up to 1 node
    result = torque_commands.submit_command("sleep 1", nodes=1)
    job_id = torque_commands.assert_job_submitted(result.stdout)
    # Change ASG max size
    asg_client = boto3.client("autoscaling", region_name=region)
    new_max_size = max_queue_size + 1
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # Wait for job completion to be sure cluster scaled
    torque_commands.wait_job_completed(job_id)

    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, new_max_size)

    # Restore initial cluster size
    asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=max_queue_size)
    # sleeping for 200 seconds since daemons fetch this data every 3 minutes
    time.sleep(200)
    # make sure cluster scaled to 0
    watch_compute_nodes(torque_commands, minutes(10), 0)
    _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size)
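
The fixed 200-second sleeps above exist because the node daemons only refresh the ASG limits every 3 minutes. On the ASG side, the new limit can also be read back directly; the snippet below is an illustrative addition (not part of the original test) and reuses the asg_client, asg_name and new_max_size values defined above.

# Illustrative check: confirm the ASG itself reports the updated MaxSize
# before waiting for the scheduler daemons to pick it up.
described = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
assert_that(described["AutoScalingGroups"][0]["MaxSize"]).is_equal_to(new_max_size)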
def _add_compute_nodes(slurm_commands, number_of_nodes=1):
    """
    Add new compute nodes to the cluster.

    This is needed because some configuration changes only take effect on newly launched compute nodes.
    :param slurm_commands: the scheduler commands object used to interact with the cluster
    :param number_of_nodes: number of nodes to add
    :return: a list containing only the newly added compute nodes
    """
    initial_compute_nodes = slurm_commands.get_compute_nodes()

    number_of_nodes = len(initial_compute_nodes) + number_of_nodes
    # submit a job to perform a scaling up action and have new instances
    result = slurm_commands.submit_command("sleep 1", nodes=number_of_nodes)
    slurm_commands.assert_job_submitted(result.stdout)

    estimated_scaleup_time = 5
    watch_compute_nodes(
        scheduler_commands=slurm_commands,
        max_monitoring_time=minutes(estimated_scaleup_time),
        number_of_nodes=number_of_nodes,
    )

    return [
        node for node in slurm_commands.get_compute_nodes()
        if node not in initial_compute_nodes
    ]
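
A short, hypothetical usage sketch for this helper, assuming a slurm_commands object provided by the surrounding test:

# Grow the cluster by one node and operate only on the newly added host.
new_nodes = _add_compute_nodes(slurm_commands, number_of_nodes=1)
assert_that(new_nodes).is_length(1)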
Example #8
def assert_initial_conditions(scheduler_commands,
                              num_static_nodes,
                              num_dynamic_nodes,
                              partition,
                              cancel_job_id=None):
    """Assert cluster is in expected state before test starts; return list of compute nodes."""
    logging.info(
        "Assert initial condition, expect cluster to have {num_nodes} idle nodes"
        .format(num_nodes=num_static_nodes + num_dynamic_nodes))
    wait_for_num_nodes_in_scheduler(scheduler_commands,
                                    num_static_nodes + num_dynamic_nodes,
                                    filter_by_partition=partition)
    nodes_in_scheduler = scheduler_commands.get_compute_nodes(partition)
    static_nodes = []
    dynamic_nodes = []
    for node in nodes_in_scheduler:
        if "-st-" in node:
            static_nodes.append(node)
        if "-dy-" in node:
            dynamic_nodes.append(node)
    assert_that(len(static_nodes)).is_equal_to(num_static_nodes)
    assert_that(len(dynamic_nodes)).is_equal_to(num_dynamic_nodes)
    assert_compute_node_states(scheduler_commands,
                               nodes_in_scheduler,
                               expected_states=["idle", "mixed", "allocated"])
    if cancel_job_id:
        # Cancel warm up job so no extra scaling behavior should be happening
        scheduler_commands.cancel_job(cancel_job_id)
        retry(wait_fixed=seconds(20),
              stop_max_delay=minutes(2))(assert_compute_node_states)(
                  scheduler_commands,
                  nodes_in_scheduler,
                  expected_states=["idle"])

    return static_nodes, dynamic_nodes
Example #9
def assert_scaling_worked(scheduler_commands, region, stack_name,
                          scaledown_idletime, expected_max, expected_final):
    jobs_execution_time = 1
    estimated_scaleup_time = 5
    max_scaledown_time = 10
    asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation(
        scheduler_commands=scheduler_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(max_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max)
    assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final)
def test_scaling_performance(region, scheduler, os, instance,
                             pcluster_config_reader, clusters_factory,
                             request):
    """The test runs benchmarks for the scaling logic."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target":
        request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"],
        scaling_target=benchmark_params["scaling_target"])
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler,
                                                remote_command_executor)

    logging.info("Starting benchmark with following parameters: %s",
                 benchmark_params)
    start_time = datetime.datetime.utcnow()
    kwargs = {"nodes": benchmark_params["scaling_target"]}
    result = scheduler_commands.submit_command(
        "sleep {0}".format(benchmark_params["job_duration"]), **kwargs)
    scheduler_commands.assert_job_submitted(result.stdout)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info(
        "Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(
        int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(
        benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    assert_no_errors_in_logs(remote_command_executor, scheduler)
Example #11
class AWSBatchCommands(SchedulerCommands):
    """Implement commands for awsbatch scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "FAILED" not in result and any(
            status != "SUCCEEDED" for status in result),
        wait_fixed=seconds(7),
        stop_max_delay=minutes(15),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "awsbstat -d {0}".format(job_id))
        return re.findall(r"status\s+: (.+)", result.stdout)

    def get_job_exit_status(self, job_id):  # noqa: D102
        return self.wait_job_completed(job_id)

    def assert_job_submitted(self, awsbsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.match(r"Job ([a-z0-9\-]{36}) \(.+\) has been submitted.",
                         awsbsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1, slots=None):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            'echo "{0}" | awsbsub -n {1}'.format(command, nodes))

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      additional_files=None,
                      slots=None):  # noqa: D102
        raise NotImplementedError

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_length(1 + children_number)
        assert_that(status).contains_only("SUCCEEDED")

    def compute_nodes_count(self):  # noqa: D102
        raise NotImplementedError

    def get_compute_nodes(self):  # noqa: D102
        raise NotImplementedError

    def wait_for_locked_node(self):  # noqa: D102
        raise NotImplementedError

    def get_node_cores(self):  # noqa: D102
        raise NotImplementedError
Example #12
def test_scheduler_performance(region, scheduler, os, instance, pcluster_config_reader, clusters_factory, request):
    """The test runs a stress test to verify scheduler behaviour with many submitted jobs."""
    benchmarks_max_time = request.config.getoption("benchmarks_max_time")
    instance_slots = get_instance_vcpus(region, instance)

    benchmark_params = {
        "region": region,
        "scheduler": scheduler,
        "os": os,
        "instance": instance,
        "scaling_target": request.config.getoption("benchmarks_target_capacity"),
        "scaledown_idletime": 2,
        "job_duration": 60,
        "jobs_to_submit": 2 * instance_slots * request.config.getoption("benchmarks_target_capacity"),
    }

    cluster_config = pcluster_config_reader(
        scaledown_idletime=benchmark_params["scaledown_idletime"], scaling_target=benchmark_params["scaling_target"]
    )
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
    enable_asg_metrics(region, cluster)

    logging.info("Starting benchmark with following parameters: %s", benchmark_params)
    start_time = datetime.datetime.utcnow()
    _submit_jobs(benchmark_params, scheduler_commands, instance_slots, cluster)
    compute_nodes_time_series, timestamps, end_time = publish_compute_nodes_metric(
        scheduler_commands,
        max_monitoring_time=minutes(benchmarks_max_time),
        region=region,
        cluster_name=cluster.cfn_name,
    )

    logging.info("Benchmark completed. Producing outputs and performing assertions.")
    benchmark_params["total_time"] = "{0}seconds".format(int((end_time - start_time).total_seconds()))
    produce_benchmark_metrics_report(
        benchmark_params,
        region,
        cluster.cfn_name,
        cluster.asg,
        start_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        end_time.replace(tzinfo=datetime.timezone.utc).isoformat(),
        benchmark_params["scaling_target"],
        request,
    )
    assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    _assert_jobs_completed(remote_command_executor, benchmark_params["jobs_to_submit"])
    assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])
    def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
        if not timeout:
            timeout = 12

        @retry(
            retry_on_result=lambda result: "job_state = C" not in result,
            wait_fixed=seconds(3),
            stop_max_delay=minutes(timeout),
        )
        def _job_status_retryer():
            result = self._remote_command_executor.run_remote_command(
                "qstat -f {0}".format(job_id))
            return result.stdout

        return _job_status_retryer()
def _test_job_dependencies(remote_command_executor, region, stack_name,
                           scaledown_idletime):
    logging.info(
        "Testing cluster doesn't scale when job dependencies are not satisfied"
    )
    slurm_commands = SlurmCommands(remote_command_executor)
    result = slurm_commands.submit_command("sleep 60", nodes=1)
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    result = remote_command_executor.run_remote_command(
        "sbatch -N 1 --wrap='sleep 1' -d afterok:{0}".format(job_id))
    dependent_job_id = slurm_commands.assert_job_submitted(result.stdout)

    # Wait for reason to be computed
    time.sleep(3)
    assert_that(_get_job_info(remote_command_executor, job_id)).contains(
        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
        "_or_reserved_for_jobs_in_higher_priority_partitions")
    assert_that(_get_job_info(
        remote_command_executor,
        dependent_job_id)).contains("JobState=PENDING Reason=Dependency")

    jobs_execution_time = 1
    estimated_scaleup_time = 5
    estimated_scaledown_time = 20
    asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation(
        scheduler_commands=slurm_commands,
        region=region,
        stack_name=stack_name,
        max_monitoring_time=minutes(jobs_execution_time) +
        minutes(scaledown_idletime) + minutes(estimated_scaleup_time) +
        minutes(estimated_scaledown_time),
    )
    assert_that(max(asg_capacity_time_series)).is_equal_to(1)
    assert_that(max(compute_nodes_time_series)).is_equal_to(1)
    assert_that(asg_capacity_time_series[-1]).is_equal_to(0)
    assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
    def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
        if not timeout:
            timeout = 15

        @retry(
            retry_on_result=lambda result: "FAILED" not in result and any(
                status != "SUCCEEDED" for status in result),
            wait_fixed=seconds(7),
            stop_max_delay=minutes(timeout),
        )
        def _job_status_retryer():
            result = self._remote_command_executor.run_remote_command(
                "awsbstat -d {0}".format(job_id), log_output=True)
            return re.findall(r"status\s+: (.+)", result.stdout)

        return _job_status_retryer()
Example #16
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0,
           wait_fixed=seconds(7),
           stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted",
                          qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1):  # noqa: D102
        # TODO add support for multiple nodes
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub".format(command))

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])
    def wait_job_completed(self, job_id, timeout=None):  # noqa: D102
        if not timeout:
            timeout = 12

        @retry(
            retry_on_result=lambda result: "JobState" not in result or
            any(value in result for value in [
                "EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING",
                "JobState=CONFIGURING"
            ]),
            wait_fixed=seconds(10),
            stop_max_delay=minutes(timeout),
        )
        def _job_status_retryer():
            result = self._remote_command_executor.run_remote_command(
                "scontrol show jobs -o {0}".format(job_id),
                raise_on_error=False)
            return result.stdout

        return _job_status_retryer()
Example #18
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result == "Unknown",
           wait_fixed=seconds(7),
           stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"EndTime=(.+?) ", result.stdout)
        return match.group(1)

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self, command, nodes=1):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "sbatch -N {0} --wrap='{1}'".format(nodes, command))

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        return "JobState=COMPLETED" in result.stdout

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])
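
The job-id extraction used by assert_job_submitted above is a plain regular-expression match on the sbatch output. A self-contained illustration, with the sample output string invented for the example:

import re

sample_sbatch_output = "Submitted batch job 42"  # invented sample output
match = re.search(r"Submitted batch job ([0-9]+)", sample_sbatch_output)
assert match is not None
print(match.group(1))  # prints "42"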
def _test_ec2_status_check_replacement(
    remote_command_executor,
    scheduler_commands,
    cluster_name,
    region,
    partition,
    num_static_nodes,
):
    """Test nodes with failing ec2 status checks are correctly replaced."""
    logging.info(
        "Testing that nodes with failing ec2 status checks are correctly replaced"
    )
    static_nodes, _ = assert_initial_conditions(scheduler_commands,
                                                num_static_nodes, 0, partition)
    # Can take up to 15 mins for ec2_status_check to show
    # Need to increase SlurmdTimeout to avoid slurm health check and trigger ec2_status_check code path
    _set_slurmd_timeout(remote_command_executor, timeout=10000)
    kill_job_id = _submit_kill_networking_job(remote_command_executor,
                                              scheduler_commands,
                                              partition,
                                              node_type="static",
                                              num_nodes=num_static_nodes)
    # Assert ec2_status_check code path is triggered
    retry(
        wait_fixed=seconds(20), stop_max_delay=minutes(15)
    )(assert_errors_in_logs)(
        remote_command_executor,
        ["/var/log/parallelcluster/clustermgtd"],
        ["Setting nodes failing health check type ec2_health_check to DRAIN"],
    )
    scheduler_commands.cancel_job(kill_job_id)
    # Assert static nodes are reset
    _wait_for_node_reset(scheduler_commands,
                         static_nodes=static_nodes,
                         dynamic_nodes=[])
    assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
    # Reset SlurmdTimeout to 180s
    _set_slurmd_timeout(remote_command_executor, timeout=180)

def _create_image_roles(create_roles_stack):
    # Create build image roles
    image_roles_stack = create_roles_stack(
        stack_prefix="integ-tests-iam-image-roles",
        roles_file="image-roles.cfn.yaml")
    lambda_cleanup_role = image_roles_stack.cfn_outputs[
        "BuildImageLambdaCleanupRole"]
    instance_profile = image_roles_stack.cfn_outputs[
        "BuildImageInstanceProfile"]
    # instance_role = image_roles_stack.cfn_outputs["BuildImageInstanceRole"]
    return instance_profile, lambda_cleanup_role


@retry(wait_fixed=minutes(1), stop_max_delay=minutes(60))
def _wait_build_image_complete(image):
    pcluster_describe_image_result = image.describe()
    logging.info(pcluster_describe_image_result)
    assert_that(image.image_status).is_equal_to("BUILD_COMPLETE")


def _check_roles(
    cfn_client,
    ec2_client,
    lambda_client,
    stack_name,
    instance_profile,
    lambda_cleanup_role,
):
    """Test roles are attached to EC2 build instance and Lambda cleanup function in the building stack."""
Example #21
    if gres:
        retrieve_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_gres.conf"
    else:
        retrieve_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_nodes.conf"
    return remote_command_executor.run_remote_command(
        retrieve_nodes_command).stdout


def _retrieve_slurm_dummy_nodes(remote_command_executor, gres=False):
    retrieve_dummy_nodes_command = "scontrol -F show nodes | grep 'State=FUTURE'"
    return len(
        remote_command_executor.run_remote_command(
            retrieve_dummy_nodes_command).stdout.split("\n"))


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
def _assert_no_nodes_in_scheduler(scheduler_commands):
    assert_that(scheduler_commands.compute_nodes_count()).is_equal_to(0)


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
def _assert_asg_has_no_node(region, asg_name):
    assert_asg_desired_capacity(region, asg_name, expected=0)


def _assert_dummy_nodes(remote_command_executor, count, slots=4, gpus=0):
    __tracebackhide__ = True
    if gpus > 0:
        # If GPU instance, need to check for extra GPU info in slurm_parallelcluster_nodes.conf
        gpu_entry = "Gres=gpu:tesla:{gpus} ".format(gpus=gpus)
        # Checking dummy nodes in slurm_parallelcluster_gres.conf
Example #22
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result == "Unknown",
           wait_fixed=seconds(7),
           stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"EndTime=(.+?) ", result.stdout)
        return match.group(1)

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       host=None):  # noqa: D102
        submission_command = "sbatch -N {0} --wrap='{1}'".format(
            nodes, command)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        return self._remote_command_executor.run_remote_command(
            submission_command)

    def submit_script(self,
                      script,
                      nodes=1,
                      slots=None,
                      host=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        submission_command += " {0}".format(script_name)
        return self._remote_command_executor.run_remote_command(
            submission_command, additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'")
        return result.stdout.splitlines()
Example #23
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0,
           wait_fixed=seconds(7),
           stop_max_delay=minutes(5))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       hold=False):  # noqa: D102
        flags = ""
        if nodes != 1:
            raise Exception("SGE does not support nodes option")
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags),
            raise_on_error=False)

    def submit_script(self,
                      script,
                      nodes=1,
                      slots=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name),
            additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()
def _test_cloud_node_health_check(
    remote_command_executor,
    scheduler_commands,
    cluster_name,
    region,
    partition,
    num_static_nodes,
    num_dynamic_nodes,
    dynamic_instance_type,
):
    """
    Test nodes with networking failure are correctly replaced.

    This verifies that slurm correctly performs health checks on CLOUD nodes.
    """
    logging.info(
        "Testing that nodes with networking failure fail the slurm health check and are replaced"
    )
    job_id = submit_initial_job(
        scheduler_commands,
        "sleep 500",
        partition,
        dynamic_instance_type,
        num_dynamic_nodes,
        other_options="--no-requeue",
    )
    static_nodes, dynamic_nodes = assert_initial_conditions(
        scheduler_commands, num_static_nodes, num_dynamic_nodes, partition,
        job_id)
    # Assert that the default SlurmdTimeout=180 is in effect
    _assert_slurmd_timeout(remote_command_executor, timeout=180)
    # Nodes with networking failures should fail slurm health check before failing ec2_status_check
    # Test on freshly launched dynamic nodes
    kill_job_id = _submit_kill_networking_job(remote_command_executor,
                                              scheduler_commands,
                                              partition,
                                              node_type="dynamic",
                                              num_nodes=num_dynamic_nodes)
    # Sleep for a bit so the command to detach network interface can be run
    time.sleep(15)
    # Job will hang, cancel it manually to avoid waiting for job failing
    scheduler_commands.cancel_job(kill_job_id)
    # Assert nodes are put into DOWN for not responding
    # TO-DO: this test only works with num_dynamic = 1 because slurm will record this error in nodelist format
    # i.e. error: Nodes q2-st-t2large-[1-2] not responding, setting DOWN
    # To support multiple nodes, need to convert list of node into nodelist format string
    retry(wait_fixed=seconds(20),
          stop_max_delay=minutes(5))(assert_errors_in_logs)(
              remote_command_executor,
              ["/var/log/slurmctld.log"],
              [
                  "Nodes {} not responding, setting DOWN".format(
                      ",".join(dynamic_nodes))
              ],
          )
    # Assert dynamic nodes are reset
    _wait_for_node_reset(scheduler_commands,
                         static_nodes=[],
                         dynamic_nodes=dynamic_nodes)
    assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
    # Assert ec2_status_check code path is not triggered
    assert_no_msg_in_logs(
        remote_command_executor,
        ["/var/log/parallelcluster/clustermgtd"],
        ["Setting nodes failing health check type ec2_health_check to DRAIN"],
    )
    remote_command_executor.run_remote_command(
        "aws s3 cp s3://{bucket_name}/export_dir/file_to_export ./file_to_export"
        .format(bucket_name=bucket_name))
    result = remote_command_executor.run_remote_command("cat ./file_to_export")
    assert_that(result.stdout).is_equal_to("Exported by FSx Lustre")


def _assert_job_submitted(qsub_output):
    __tracebackhide__ = True
    match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted",
                      qsub_output)
    assert_that(match).is_not_none()
    return match.group(1)


@retry(retry_on_result=lambda result: result != 0,
       wait_fixed=seconds(7),
       stop_max_delay=minutes(5))
def _wait_job_completed(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command(
        "qacct -j {0}".format(job_id), raise_on_error=False)
    return result.return_code


def _get_job_exit_status(remote_command_executor, job_id):
    result = remote_command_executor.run_remote_command(
        "qacct -j {0}".format(job_id))
    match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
    assert_that(match).is_not_none()
    return match.group(1)
class EBSSnapshotsFactory:
    """Manage creation and destruction of volume snapshots."""
    def __init__(self):
        self.config = None
        self.instance = None
        self.volume = None
        self.snapshot = None
        self.security_group_id = None
        self.ec2 = None
        self.boto_client = None

    def create_snapshot(self, request, subnet_id, region):
        """
        Create a snapshot in a given region.

        :param request: The current request
        :param subnet_id: The subnet id in which to create the snapshot
        :param region: The region in which to create the snapshot
        """
        # Only one snapshot creation per factory allowed
        if self.snapshot:
            raise Exception("Snapshot already created")

        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)

        snapshot_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self.snapshot = self._create_snapshot(region, snapshot_config)
        return self.snapshot.id

    def create_existing_volume(self, request, subnet_id, region):
        """
        Create a volume in a given region.

        :param request: The current request
        :param subnet_id: The subnet id in which to create the volume
        :param region: The region in which to create the volume
        """
        # Only one volume creation per factory allowed
        if self.volume:
            raise Exception("Volume already created")

        self.ec2 = boto3.resource("ec2", region_name=region)
        self.boto_client = boto3.client("ec2", region_name=region)
        volume_config = SnapshotConfig(
            request.config.getoption("key_path"),
            request.config.getoption("key_name"),
            self.ec2.Subnet(subnet_id).vpc_id,
            subnet_id,
        )
        self._create_volume_process(region, volume_config)
        return self.volume.id

    def _create_volume_process(self, region, snapshot_config):
        self.config = snapshot_config
        ami_id = self._get_amazonlinux2_ami()

        self.security_group_id = self._get_security_group_id()

        subnet = self.ec2.Subnet(self.config.head_node_subnet_id)

        # Create a new volume and attach to the instance
        self.volume = self._create_volume(subnet)
        self.instance = self._launch_instance(ami_id, subnet)
        self._attach_volume()
        # Open ssh connection
        self.ssh_conn = self._open_ssh_connection()

        # Partitions the disk with a gpt table and 1 single partition inside
        self._format_volume(self.ssh_conn)

        # Stops the instance before taking the snapshot
        self._release_instance()

    def _create_snapshot(self, region, snapshot_config):
        self._create_volume_process(region, snapshot_config)
        self.snapshot = self._create_volume_snapshot()
        return self.snapshot

    def _create_volume_snapshot(self):
        logging.info("creating snapshot...")
        snapshot = self.ec2.create_snapshot(
            Description="parallelcluster-test-snapshot",
            VolumeId=self.volume.id)
        while snapshot.state == "pending":
            time.sleep(10)
            snapshot = self.ec2.Snapshot(snapshot.id)
        logging.info("Snapshot ready: %s" % snapshot.id)
        return snapshot

    def _format_volume(self, ssh_conn):
        logging.info("Partitioning device...")
        ssh_conn.run(
            "sudo sh -c 'echo -e \"g\nn\np\n1\n\n\nw\" | fdisk /dev/sdf'",
            warn=True,
            pty=False,
            hide=False)
        # Finds out the device name of the volume
        logging.info("Finding device name...")
        device_name = ssh_conn.run("readlink -f /dev/sdf").stdout.strip()
        # formats the 1st partition of disk
        logging.info("Formatting 1st partition...")
        ssh_conn.run("sudo sh -c 'mkfs.ext4 {}1'".format(device_name))
        logging.info("Mounting partition...")
        ssh_conn.run("sudo mkdir /mnt/tmp")
        ssh_conn.run("sudo mount {}1 /mnt/tmp".format(device_name))
        logging.info("Writing test data...")
        ssh_conn.run("echo 'hello world' | sudo tee -a /mnt/tmp/test.txt")
        logging.info("Device ready")

    def _open_ssh_connection(self):
        tries = 5
        logging.info("Connecting to instance %s " %
                     self.instance.public_ip_address)
        logging.info("ssh_key: %s " % self.config.ssh_key)
        ssh_conn = None

        while tries > 0:
            try:
                ssh_conn = Connection(
                    host=self.instance.public_ip_address,
                    user="******",
                    forward_agent=False,
                    connect_kwargs={"key_filename": [self.config.ssh_key]},
                )
                ssh_conn.open()
                tries = 0
            except BaseException:
                logging.info("SSH connection error - retrying...")
                tries -= 1
                time.sleep(20)

        if (ssh_conn is None) or (not ssh_conn.is_connected):
            raise ConnectionError()
        return ssh_conn

    @retry(retry_on_result=lambda state: state != "attached",
           wait_fixed=seconds(2),
           stop_max_delay=minutes(5))
    def _wait_volume_attached(self):
        vol = self.ec2.Volume(self.volume.id)
        attachment_state = next(
            (attachment["State"] for attachment in vol.attachments
             if attachment["InstanceId"] == self.instance.id), "")
        return attachment_state

    def _attach_volume(self):
        result = self.volume.attach_to_instance(InstanceId=self.instance.id,
                                                Device="/dev/sdf")
        logging.info("Attach Volume Result: %s", result)
        self._wait_volume_attached()
        logging.info("Volume attached")

    def _create_volume(self, subnet):
        vol = self.ec2.create_volume(
            Size=10,
            Encrypted=False,
            AvailabilityZone=subnet.availability_zone,
            TagSpecifications=[{
                "ResourceType":
                "volume",
                "Tags": [{
                    "Key": "name",
                    "Value": "parallel-cluster-test-volume"
                }]
            }],
        )
        logging.info("Volume Id: %s" % vol.id)
        # We can check if the volume is now ready and available:
        logging.info("Waiting for the volume to be ready...")
        while vol.state == "creating":
            vol = self.ec2.Volume(vol.id)
            time.sleep(2)
        logging.info("Volume ready")
        return vol

    def _get_security_group_id(self):
        security_group_id = self.boto_client.create_security_group(
            Description="security group for snapshot instance node",
            GroupName="snapshot-" + random_alphanumeric(),
            VpcId=self.config.vpc_id,
        )["GroupId"]

        self.boto_client.authorize_security_group_ingress(
            GroupId=security_group_id,
            IpPermissions=[{
                "IpProtocol": "tcp",
                "FromPort": 22,
                "ToPort": 22,
                "IpRanges": [{
                    "CidrIp": "0.0.0.0/0"
                }]
            }],
        )

        return security_group_id

    def _launch_instance(self, ami_id, subnet):
        instance = self.ec2.create_instances(
            ImageId=ami_id,
            KeyName=self.config.key_name,
            MinCount=1,
            MaxCount=1,
            InstanceType="t2.micro",
            NetworkInterfaces=[{
                "SubnetId": subnet.id,
                "DeviceIndex": 0,
                "AssociatePublicIpAddress": True,
                "Groups": [self.security_group_id],
            }],
            TagSpecifications=[{
                "ResourceType":
                "instance",
                "Tags": [{
                    "Key": "Name",
                    "Value": "pcluster-snapshot-instance"
                }]
            }],
        )[0]
        logging.info("Waiting for instance to be running...")
        while instance.state["Name"] == "pending":
            time.sleep(10)
            instance = self.ec2.Instance(instance.id)

        logging.info("Instance state: %s" % instance.state)
        logging.info("Public dns: %s" % instance.public_dns_name)
        return instance

    def _get_amazonlinux2_ami(self):
        # Finds most recent alinux2 ami in region
        response = self.boto_client.describe_images(
            Owners=["amazon"],
            Filters=[
                {
                    "Name": "name",
                    "Values": ["amzn2-ami-hvm-*"]
                },
                {
                    "Name": "description",
                    "Values": ["Amazon Linux 2 AMI*"]
                },
                {
                    "Name": "architecture",
                    "Values": ["x86_64"]
                },
                {
                    "Name": "root-device-type",
                    "Values": ["ebs"]
                },
                {
                    "Name": "state",
                    "Values": ["available"]
                },
            ],
        )

        amis = sorted(response["Images"],
                      key=lambda x: x["CreationDate"],
                      reverse=True)
        return amis[0]["ImageId"]

    def release_all(self):
        """Release all resources"""
        self._release_instance()
        self._release_volume()
        self._release_snapshot()
        self._release_security_group()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_snapshot(self):
        if self.snapshot:
            logging.info("Deleting snapshot %s" % self.snapshot.id)
            self.snapshot.delete()

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_instance(self):
        if self.instance:
            self.instance.terminate()
            logging.info("Waiting for instance to be terminated...")
            while self.instance.state["Name"] != "terminated":
                time.sleep(10)
                self.instance = self.ec2.Instance(self.instance.id)
            logging.info("Instance terminated")
        self.instance = None

    @retry(stop_max_attempt_number=5, wait_fixed=5000)
    def _release_volume(self):
        if self.volume:
            logging.info("Deleting volume %s" % self.volume.id)
            self.volume.delete()
        self.volume = None

    def _release_security_group(self):
        if self.security_group_id:
            logging.info("Deleting security group %s" % self.security_group_id)
            self.boto_client.delete_security_group(
                GroupId=self.security_group_id)
        self.security_group_id = None
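
A hypothetical sketch of how the factory might be driven from a test, assuming a pytest request fixture plus subnet_id and region values supplied by the test infrastructure:

# Illustrative usage of EBSSnapshotsFactory; request, subnet_id and region are assumed.
snapshots_factory = EBSSnapshotsFactory()
try:
    snapshot_id = snapshots_factory.create_snapshot(request, subnet_id, region)
    # ... create a cluster whose EBS volume is restored from snapshot_id ...
finally:
    # Terminates the helper instance and deletes volume, snapshot and security group.
    snapshots_factory.release_all()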
Example #27
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
from retrying import retry

from remote_command_executor import RemoteCommandExecutionError
from time_utils import minutes, seconds


@retry(
    retry_on_exception=lambda exception: isinstance(
        exception, RemoteCommandExecutionError),
    wait_fixed=seconds(30),
    stop_max_delay=minutes(10),
)
def wait_compute_log(remote_command_executor):
    remote_command_executor.run_remote_command("test -d /home/logs/compute",
                                               log_error=False)
    # return instance-id
    return remote_command_executor.run_remote_command(
        "find /home/logs/compute/ -type f -printf '%f\\n' -quit  | head -1 | cut -d. -f1",
        log_error=False).stdout
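
The same retrying decorator pattern drives most of the polling helpers in these examples (the seconds/minutes helpers simply convert to the millisecond values the library expects). A self-contained illustration using a local stand-in instead of a remote command:

from retrying import retry

attempts = {"count": 0}


# Keep retrying while the result is falsy, polling every 100 ms for at most 5 seconds.
@retry(retry_on_result=lambda result: not result, wait_fixed=100, stop_max_delay=5000)
def _poll_until_ready():
    attempts["count"] += 1
    return attempts["count"] >= 3  # becomes truthy on the third call


_poll_until_ready()
print(attempts["count"])  # prints 3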
Example #28
class SgeCommands(SchedulerCommands):
    """Implement commands for sge scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: result != 0,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(7))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id), raise_on_error=False)
        return result.return_code

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qacct -j {0}".format(job_id))
        match = re.search(r"exit_status\s+([0-9]+)", result.stdout)
        assert_that(match).is_not_none()
        return match.group(1)

    def assert_job_submitted(self, qsub_output, is_array=False):  # noqa: D102
        __tracebackhide__ = True
        if is_array:
            regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted"
        else:
            regex = r"Your job ([0-9]+) \(.+\) has been submitted"
        match = re.search(regex, qsub_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       hold=False,
                       after_ok=None):  # noqa: D102
        flags = ""
        if nodes > 1:
            slots = nodes * slots
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        if hold:
            flags += "-h "
        if after_ok:
            flags += "-hold_jid {0} ".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags),
            raise_on_error=False)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        flags = ""
        if slots:
            flags += "-pe mpi {0} ".format(slots)
        script_name = os.path.basename(script)
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1} {2}".format(flags, script_name,
                                      " ".join(script_args)),
            additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep -o ip- | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qhost | grep ip- | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(
        retry_on_result=lambda result: "<state>d</state>" not in result,
        wait_fixed=seconds(3),
        stop_max_delay=minutes(5),
    )
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "qstat -f -xml").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "qhost -F | grep hl:m_core")
        return re.search(r"hl:m_core=(\d+).000000", result.stdout).group(1)
Example #29
0
class SlurmCommands(SchedulerCommands):
    """Implement commands for slurm scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(
        retry_on_result=lambda result: "JobState" not in result or
        any(value in result for value in
            ["EndTime=Unknown", "JobState=RUNNING", "JobState=COMPLETING"]),
        wait_fixed=seconds(3),
        stop_max_delay=minutes(7),
    )
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id), raise_on_error=False)
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        match = re.search(r"ExitCode=(.+?) ", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, sbatch_output):  # noqa: D102
        __tracebackhide__ = True
        match = re.search(r"Submitted batch job ([0-9]+)", sbatch_output)
        assert_that(match).is_not_none()
        return match.group(1)

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       host=None,
                       after_ok=None,
                       other_options=None):  # noqa: D102
        submission_command = "sbatch --wrap='{0}'".format(command)
        if nodes > 0:
            submission_command += " -N {0}".format(nodes)
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if after_ok:
            submission_command += " -d afterok:{0}".format(after_ok)
        if other_options:
            submission_command += " {0}".format(other_options)
        return self._remote_command_executor.run_remote_command(
            submission_command)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      host=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        if not script_args:
            script_args = []
        additional_files.append(script)
        script_name = os.path.basename(script)
        submission_command = "sbatch"
        if host:
            submission_command += " --nodelist={0}".format(host)
        if slots:
            submission_command += " -n {0}".format(slots)
        if nodes > 1:
            submission_command += " -N {0}".format(nodes)
        submission_command += " {1} {2}".format(nodes, script_name,
                                                " ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            submission_command, additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id))
        assert_that(result.stdout).contains("JobState=COMPLETED")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | wc -l")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "sinfo --Node --noheader | grep compute | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "drain" not in result,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        return self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -h -o '%t'").stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "/opt/slurm/bin/sinfo -o '%c' -h")
        return re.search(r"(\d+)", result.stdout).group(1)

    def get_job_info(self, job_id):
        """Return job details from slurm"""
        return self._remote_command_executor.run_remote_command(
            "scontrol show jobs -o {0}".format(job_id)).stdout
Example #30
0
class TorqueCommands(SchedulerCommands):
    """Implement commands for torque scheduler."""
    def __init__(self, remote_command_executor):
        super().__init__(remote_command_executor)

    @retry(retry_on_result=lambda result: "job_state = C" not in result,
           wait_fixed=seconds(3),
           stop_max_delay=minutes(12))
    def wait_job_completed(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(job_id))
        return result.stdout

    def get_job_exit_status(self, job_id):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(job_id))
        match = re.search(r"exit_status = (\d+)", result.stdout)
        return match.group(1)

    def assert_job_submitted(self, qsub_output):  # noqa: D102
        __tracebackhide__ = True
        # on successful submission, qsub prints only the job id
        job_id = qsub_output
        # check that the job exists
        self._remote_command_executor.run_remote_command(
            "qstat -f {0}".format(job_id))
        return job_id

    def submit_command(self,
                       command,
                       nodes=1,
                       slots=None,
                       after_ok=None):  # noqa: D102
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if after_ok:
            flags += " -W depend=afterok:{0}".format(after_ok)
        return self._remote_command_executor.run_remote_command(
            "echo '{0}' | qsub {1}".format(command, flags),
            raise_on_error=False)

    def submit_script(self,
                      script,
                      script_args=None,
                      nodes=1,
                      slots=None,
                      additional_files=None):  # noqa: D102
        if not additional_files:
            additional_files = []
        script_name = os.path.basename(script)
        additional_files.append(script)
        flags = "-l nodes={0}:ppn={1}".format(nodes or 1, slots or 1)
        if script_args:
            flags += ' -F "{0}"'.format(" ".join(script_args))
        return self._remote_command_executor.run_remote_command(
            "qsub {0} {1}".format(flags, script_name),
            additional_files=additional_files)

    def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
        __tracebackhide__ = True
        status = self.get_job_exit_status(job_id)
        assert_that(status).is_equal_to("0")

    def compute_nodes_count(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "echo $(( $(/opt/torque/bin/pbsnodes -l all | wc -l) - 1))")
        # split()[-1] to extract last line and trim whitespaces
        return int(result.stdout.split()[-1])

    def get_compute_nodes(self):  # noqa: D102
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes -l all | grep -v $(hostname) | awk '{print $1}'")
        return result.stdout.splitlines()

    @retry(retry_on_result=lambda result: "offline" not in result,
           wait_fixed=seconds(5),
           stop_max_delay=minutes(5))
    def wait_for_locked_node(self):  # noqa: D102
        # discard the first node since that is the master server
        return self._remote_command_executor.run_remote_command(
            r'pbsnodes | grep -e "\sstate = " | tail -n +2').stdout

    def get_node_cores(self):
        """Return number of slots from the scheduler."""
        result = self._remote_command_executor.run_remote_command(
            "pbsnodes | tail -n +10")
        return re.search(r"np = (\d+)", result.stdout).group(1)
    if auto_import_policy in ("NEW", "NEW_CHANGED"):
        result = remote_command_executor.run_remote_command(
            f"cat {mount_dir}/{filename}")
        expected_body = (modified_file_body
                         if auto_import_policy == "NEW_CHANGED" else new_file_body)
        assert_that(result.stdout).is_equal_to(expected_body)
    else:
        result = remote_command_executor.run_remote_command(f"ls {mount_dir}/")
        assert_that(result.stdout).does_not_contain(filename)


@retry(
    retry_on_result=lambda result: result.get("Lifecycle") in
    ["PENDING", "EXECUTING", "CANCELLING"],
    wait_fixed=seconds(5),
    stop_max_delay=minutes(7),
)
def poll_on_data_export(task, fsx):
    logging.info("Data Export Task {task_id}: {status}".format(
        task_id=task.get("TaskId"), status=task.get("Lifecycle")))
    return fsx.describe_data_repository_tasks(
        TaskIds=[task.get("TaskId")]).get("DataRepositoryTasks")[0]


def _test_data_repository_task(remote_command_executor, mount_dir, bucket_name,
                               fsx_fs_id, region):
    logging.info("Testing fsx lustre data repository task")
    file_contents = "Exported by FSx Lustre"
    remote_command_executor.run_remote_command(
        "echo '{file_contents}' > {mount_dir}/file_to_export".format(
            file_contents=file_contents, mount_dir=mount_dir))