Esempio n. 1
0
def _execute_test_jobs_on_cluster(cluster_config, log_file):
    """
    Executes test jobs defined in cluster-check.sh on a given cluster.

    Args:
        cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster.
        log_file: file where to write logs.
    """
    scp_options = list(ssh_config_options)
    if cluster_config.key_path:
        scp_options += ["-i", cluster_config.key_path]

    # Copy the check script next to the remote home dir, then launch it.
    script_source = os.path.join(_dirname(), "cluster-check.sh")
    script_target = "%s@%s:." % (cluster_config.username,
                                 cluster_config.master_node)
    prochelp.exec_command(
        ["scp"] + scp_options + [script_source, script_target],
        stdout=log_file,
        stderr=sub.STDOUT,
        universal_newlines=True,
    )
    _exec_ssh_command(
        command="/bin/bash --login cluster-check.sh submit %s" %
        cluster_config.scheduler,
        username=cluster_config.username,
        host=cluster_config.master_node,
        key_path=cluster_config.key_path,
        stdout=log_file,
    )
Esempio n. 2
0
def _exec_ssh_command(command,
                      host,
                      username,
                      key_path,
                      stdout=sub.PIPE,
                      stderr=sub.STDOUT):
    """
    Executes an ssh command on a remote host.

    Args:
        command: command to execute.
        host: host where the command is executed.
        username: username used to ssh into the host.
        key_path: key used to ssh into the host.
        stdout: stdout redirection. Defaults to sub.PIPE.
        stderr: stderr redirection. Defaults to sub.STDOUT.

    Returns:
        the stdout for the executed command.
    """
    options = list(ssh_config_options)
    if key_path:
        options += ["-i", key_path]

    remote = "%s@%s" % (username, host)
    # -n redirects ssh's stdin from /dev/null so it never consumes ours.
    return prochelp.exec_command(
        ["ssh", "-n"] + options + [remote, command],
        stdout=stdout,
        stderr=stderr,
        universal_newlines=True,
    )
Esempio n. 3
0
def _get_master_ip(cluster_config_file, cluster_name, log_file):
    """
    Retrieves the ip of the master node for a given cluster.

    Args:
        cluster_config_file: file containing the config of the cluster.
        cluster_name: name of the cluster.
        log_file: file where to write logs.

    Returns:
        master_ip: the ip of the master node.

    Raises:
        ReleaseCheckException: when no MasterPublicIP line is found in the
            pcluster status output.
    """
    # The ip is only exposed in the human-readable pcluster status output,
    # so grep it line by line.
    status_output = prochelp.exec_command(
        ["pcluster", "status", "--config", cluster_config_file, cluster_name],
        stderr=sub.STDOUT,
        universal_newlines=True,
    )

    master_ip = ""
    for status_line in status_output.splitlines():
        match = re.search("MasterPublicIP: (.+)$", status_line)
        if match:
            master_ip = match.group(1)
            break

    # An empty ip usually means the stack never reached CREATE_COMPLETE.
    if not master_ip:
        _double_writeln(
            log_file,
            "!! %s: Master IP not found. This usually occurs when cluster creation failed."
            % cluster_name)
        raise ReleaseCheckException("--> %s: Master IP not found!" %
                                    cluster_name)

    _double_writeln(log_file,
                    "--> %s Master IP: %s" % (cluster_name, master_ip))
    return master_ip
Esempio n. 4
0
                         "r").read().split("\n")
            for index, line in enumerate(rfile):
                m = re.search("efs_fs_id", line)
                if m:
                    rfile[index] = "efs_fs_id = %s" % fsrelated["fsid"]

            wfile = open("./config-%s-%s" % (region, distro), "w")
            wfile.write("\n".join(rfile))
            wfile.close()

        print("Creating cluster...")
        prochelp.exec_command(
            [
                "pcluster", "create",
                "autoTest-%s" % testname, "--config",
                "./config-%s-%s" % (region, distro)
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        dump = prochelp.exec_command(
            [
                "pcluster", "status",
                "autoTest-%s" % testname, "--config",
                "./config-%s-%s" % (region, distro)
            ],
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        dump_array = dump.splitlines()
def run_test(distro, clustername, mastersubnet, region):
    """
    Creates an EBS volume and snapshot, builds a cluster configured with them,
    and runs ebs-check.sh on the cluster's master node.

    Args:
        distro: base OS of the cluster; key into username_map.
        clustername: cluster template under test; also keys into testargs_map.
        mastersubnet: subnet id used to derive the availability zone.
        region: AWS region name; re-derived below from the AZ.
    """
    # Random 8-char suffix keeps concurrent runs of the same combo distinct.
    testname = ("%s-%s" % (distro, clustername)) + "".join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(8))
    print(testname)
    out_f = open("%s-out.txt" % testname, "w")
    username = username_map[distro]
    create_done = False
    create_interrupted = False
    volume_id = ""
    az = _get_az(mastersubnet, region)
    # Region is re-derived as the AZ name minus its trailing letter.
    region = az[:-1]

    try:
        ec2 = boto3.client("ec2", region_name=region)
        # Create a throwaway volume; its id is patched into the config below.
        response_vol = ec2.create_volume(AvailabilityZone=az, Size=10)
        volume_id = response_vol["VolumeId"]
        print("Volume created: %s" % volume_id)

        if volume_id == "":
            _double_writeln(
                out_f, "!! %s: Volume ID not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Volume ID not found!" %
                                        testname)
        _double_writeln(out_f, "--> %s Volume ID: %s" % (testname, volume_id))
        print("Preparing volume...")
        # Poll until the volume leaves the creating state.
        while True:
            response_vol = ec2.describe_volumes(VolumeIds=[volume_id])
            vol_state = response_vol["Volumes"][0]["State"]
            if vol_state == "available":
                print("Volume is good to go!")
                break
            time.sleep(5)

        # Snapshot the volume; the snapshot id is patched into the config too.
        response_snapshot = ec2.create_snapshot(VolumeId=volume_id)
        snap_id = response_snapshot["SnapshotId"]
        print("Snapshot created: %s" % snap_id)

        if snap_id == "":
            _double_writeln(
                out_f, "!! %s: Snapshot ID not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Snapshot ID not found!" %
                                        testname)
        _double_writeln(out_f, "--> %s Snapshot ID: %s" % (testname, snap_id))
        print("Preparing snapshot...")
        # Poll until the snapshot completes.
        while True:
            response_snap = ec2.describe_snapshots(SnapshotIds=[snap_id])
            snap_state = response_snap["Snapshots"][0]["State"]
            if snap_state == "completed":
                print("Snapshot is good to go!")
                break
            time.sleep(5)

        # Rewrite the per-region/distro config in place, substituting the
        # volume and snapshot ids created above.
        # NOTE(review): this open() handle is read and discarded without being
        # closed — consider a with-statement.
        rfile = open("./config-%s-%s" % (region, distro),
                     "r").read().split("\n")
        for index, line in enumerate(rfile):
            m = re.search("ebs_volume_id", line)
            if m:
                rfile[index] = "ebs_volume_id = %s" % volume_id
            n = re.search("ebs_snapshot_id", line)
            if n:
                rfile[index] = "ebs_snapshot_id = %s" % snap_id

        wfile = open("./config-%s-%s" % (region, distro), "w")
        wfile.write("\n".join(rfile))
        wfile.close()

        print("Creating cluster...")
        prochelp.exec_command(
            [
                "pcluster",
                "create",
                "autoTest-%s" % testname,
                "--config",
                "./config-%s-%s" % (region, distro),
                "--cluster-template",
                "%s" % clustername,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        create_done = True
        dump = prochelp.exec_command(
            [
                "pcluster", "status",
                "autoTest-%s" % testname, "--config",
                "./config-%s-%s" % (region, distro)
            ],
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        # Grep the status output for the master node's public ip.
        master_ip = ""
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(
                out_f, "!! %s: Master IP not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Master IP not found!" %
                                        testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        time.sleep(10)

        # run test on the cluster...
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]

        print("Running tests...")
        # Copy the check script onto the master node.
        prochelp.exec_command(
            ["scp"] + ssh_params + [
                os.path.join(_dirname(), "ebs-check.sh"),
                "%s@%s:." % (username, master_ip)
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        time.sleep(5)

        # Multi-volume templates also get the volume/snapshot ids as script args.
        if clustername == "custom3Vol" or clustername == "custom5Vol":
            prochelp.exec_command(
                ["ssh", "-n"] + ssh_params + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login ebs-check.sh %s %s %s %s" %
                    (testargs_map[clustername], region, volume_id, snap_id),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        else:
            prochelp.exec_command(
                ["ssh", "-n"] + ssh_params + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login ebs-check.sh %s %s" %
                    (testargs_map[clustername], region),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )

    except prochelp.ProcessHelperError as exc:
        # The process helper killed/aborted a child: mark the run as aborted.
        if not create_done and isinstance(exc, prochelp.KilledProcessError):
            create_interrupted = True
            _double_writeln(
                out_f,
                "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % testname)
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not create_done:
            create_interrupted = True
        _double_writeln(
            out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc

    finally:
        print("Cleaning up!")
        if create_interrupted or create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(
                out_f, "--> %s: Deleting - max iterations: %s" %
                (testname, _del_iters))
            # Retry the delete until pcluster acknowledges it or we run out
            # of iterations.
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster",
                            "delete",
                            "autoTest-%s" % testname,
                            "--config",
                            "./config-%s-%s" % (region, distro),
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname,
                           (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1

            try:
                prochelp.exec_command(
                    [
                        "pcluster", "status",
                        "autoTest-%s" % testname, "--config",
                        "./config-%s-%s" % (region, distro)
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation
                # the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        # NOTE(review): if an exception fires before ec2/snap_id are assigned
        # (e.g. boto3.client or create_volume fails), these two calls raise
        # NameError here and mask the original error — confirm and guard.
        ec2.delete_snapshot(SnapshotId=snap_id)
        ec2.delete_volume(VolumeId=volume_id)
        out_f.close()
    print("--> %s: Finished" % testname)
Esempio n. 6
0
def run_test(distro, clustername, mastersubnet, region):
    """
    Creates an AWS ParallelCluster cluster and runs raid-check.sh on its
    master node, deleting the cluster afterwards.

    Args:
        distro: base OS of the cluster; key into username_map.
        clustername: cluster template under test ("default" or a custom one).
        mastersubnet: subnet id used to derive the availability zone.
        region: AWS region name; the region actually used for pcluster calls
            is re-derived from the subnet's availability zone.
    """
    # Random 8-char suffix keeps concurrent runs of the same combo distinct.
    testname = ("%s-%s" % (distro, clustername)) + "".join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(8))
    print(testname)
    out_f = open("%s-out.txt" % testname, "w")
    username = username_map[distro]
    _create_done = False
    _create_interrupted = False
    _az = _get_az(mastersubnet, region)
    # Region is re-derived as the AZ name minus its trailing letter.
    _region = _az[:-1]

    try:

        print("Creating cluster...")
        prochelp.exec_command(
            [
                "pcluster",
                "create",
                "autoTest-%s" % testname,
                "--config",
                "./config-%s-%s" % (_region, distro),
                "--cluster-template",
                "%s" % clustername,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        dump = prochelp.exec_command(
            [
                "pcluster", "status",
                "autoTest-%s" % testname, "--config",
                "./config-%s-%s" % (_region, distro)
            ],
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        # Fix: initialize master_ip so the check below cannot raise NameError
        # when the status output contains no MasterPublicIP line.
        master_ip = ""
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(
                out_f, "!! %s: Master IP not found; exiting !!" % (testname))
            raise ReleaseCheckException("--> %s: Master IP not found!" %
                                        testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        time.sleep(10)

        # run test on the cluster...
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]

        print("Running tests...")
        # Copy the check script onto the master node.
        prochelp.exec_command(
            ["scp"] + ssh_params + [
                os.path.join(_dirname(), "raid-check.sh"),
                "%s@%s:." % (username, master_ip)
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        time.sleep(5)

        # Custom templates pass the raid mount path to the check script.
        # NOTE(review): raid_path is not defined in this function — presumably
        # a module-level global; confirm.
        if clustername == "default":
            prochelp.exec_command(
                ["ssh", "-n"] + ssh_params + [
                    "%s@%s" %
                    (username, master_ip), "/bin/bash --login raid-check.sh"
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        else:
            prochelp.exec_command(
                ["ssh", "-n"] + ssh_params + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login raid-check.sh %s" % raid_path
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        print("Test passed...")

    except prochelp.ProcessHelperError as exc:
        # The process helper killed/aborted a child: mark the run as aborted.
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(
                out_f,
                "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % (testname))
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(
            out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % (testname))
        open("%s.failed" % testname, "w").close()
        raise exc

    finally:
        print("Cleaning up!")
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(
                out_f, "--> %s: Deleting - max iterations: %s" %
                (testname, _del_iters))
            # Retry the delete until pcluster acknowledges it or we run out
            # of iterations.
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster",
                            "delete",
                            "autoTest-%s" % testname,
                            "--config",
                            "./config-%s-%s" % (_region, distro),
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname,
                           (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1

            try:
                # Fix: use _region (the AZ-derived region used by every other
                # pcluster call above) instead of the raw region argument, so
                # the status check reads the same config file.
                prochelp.exec_command(
                    [
                        "pcluster", "status",
                        "autoTest-%s" % testname, "--config",
                        "./config-%s-%s" % (_region, distro)
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually terminates with exit status 1 since at the end of the delete operation the stack is not found
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
    print("--> %s: Finished" % (testname))
Esempio n. 7
0
def run_test(region, distro, scheduler, instance_type, key_name,
             expected_asg_capacity, expected_compute_nodes, extra_args):
    """
    Builds a cluster, submits test jobs, monitors ASG capacity and compute
    nodes while they run, and asserts that jobs complete and scaling behaves
    as expected. Always deletes the cluster in the finally block.

    Args:
        region: AWS region name.
        distro: base OS of the cluster; key into username_map.
        scheduler: scheduler to configure on the cluster.
        instance_type: EC2 instance type for master and compute nodes.
        key_name: EC2 key pair name.
        expected_asg_capacity: expected ASG capacity series bounds.
        expected_compute_nodes: expected number of compute nodes.
        extra_args: dict with at least a "key_path" entry.
    """
    _create_interrupted = False
    _create_done = False
    testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler,
                                   instance_type.replace(".", ""), _timestamp)
    test_filename = "%s-config.cfg" % testname
    # NOTE(review): buffering=0 with text mode ("w") is Python 2 only; on
    # Python 3 this raises ValueError — confirm the target interpreter.
    out_f = open("%s-out.txt" % testname, "w", 0)
    # Test jobs should take at most 9 minutes to be executed.
    # These guarantees that the jobs are executed in parallel.
    max_jobs_exec_time = 9 * 60

    try:
        _double_writeln(out_f, "--> %s: Starting" % testname)

        cluster_config = ClusterConfig(
            config_file=test_filename,
            stack_name="parallelcluster-" + testname,
            region=region,
            distro=distro,
            instance_type=instance_type,
            scheduler=scheduler,
            username=username_map[distro],
            key_path=extra_args["key_path"],
            key_name=key_name,
            master_node="",
            scaledown_idletime=4,
        )

        _write_pcluster_config(cluster_config=cluster_config,
                               extra_args=extra_args)
        _double_writeln(
            out_f, "--> %s: Created pcluster config file %s" %
            (testname, test_filename))

        # build the cluster
        _double_writeln(out_f, "--> %s: Creating the cluster" % testname)
        prochelp.exec_command(
            ["pcluster", "create", "--config", test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        _double_writeln(out_f,
                        "--> %s: Cluster created successfully" % testname)

        # ClusterConfig is a namedtuple, so fill in the master ip via _replace.
        cluster_config = cluster_config._replace(
            master_node=_get_master_ip(cluster_config_file=test_filename,
                                       cluster_name=testname,
                                       log_file=out_f))

        _double_writeln(out_f,
                        "--> %s: Executing test jobs on cluster." % testname)
        _execute_test_jobs_on_cluster(cluster_config=cluster_config,
                                      log_file=out_f)
        _double_writeln(out_f,
                        "--> %s: Test jobs successfully started" % testname)

        _double_writeln(
            out_f,
            "--> %s: Monitoring asg capacity and compute nodes" % testname)
        # Watch long enough for the jobs to finish plus the scaledown idle
        # time plus a safety margin.
        additional_watching_time = 5 * 60
        asg_capacity_time_series, compute_nodes_time_series, timestamps = _watch_compute_nodes_allocation(
            duration=max_jobs_exec_time +
            cluster_config.scaledown_idletime * 60 + additional_watching_time,
            frequency=20,
            cluster_config=cluster_config,
        )
        _double_writeln(
            out_f,
            "--> %s: Monitoring completed: %s, %s, %s" % (
                testname,
                "asg_capacity_time_series [" +
                " ".join(map(str, asg_capacity_time_series)) + "]",
                "compute_nodes_time_series [" +
                " ".join(map(str, compute_nodes_time_series)) + "]",
                "timestamps [" + " ".join(map(str, timestamps)) + "]",
            ),
        )

        _double_writeln(
            out_f,
            "--> %s: Verifying test jobs completed successfully" % testname)
        # jobs need to complete in 9 mins in order to verify parallelism
        _assert_test_jobs_completed(cluster_config=cluster_config,
                                    max_jobs_exec_time=max_jobs_exec_time,
                                    log_file=out_f)
        _double_writeln(out_f,
                        "--> %s: Test jobs completed successfully" % testname)

        _double_writeln(
            out_f,
            "--> %s: Verifying auto-scaling worked correctly" % testname)
        _assert_scaling_works(
            asg_capacity_time_series=asg_capacity_time_series,
            compute_nodes_time_series=compute_nodes_time_series,
            expected_asg_capacity=expected_asg_capacity,
            expected_compute_nodes=expected_compute_nodes,
        )
        _double_writeln(out_f,
                        "--> %s: Autoscaling worked as expected" % testname)

        _double_writeln(out_f, "SUCCESS:  %s!!" % testname)
        # Marker files (.success/.aborted/.failed) signal the outcome to the
        # process that collects results.
        open("%s.success" % testname, "w").close()
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f,
                            "--> %s: Interrupting pcluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % (testname))
        open("%s.aborted" % testname, "w").close()
        raise exc
    except AssertionError as err:
        # NOTE(review): err.message is Python 2 only; on Python 3 use
        # str(err) — confirm the target interpreter.
        _double_writeln(
            out_f,
            "--> %s: Test assertion failed: %s" % (testname, err.message))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise err
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(
            out_f, "--> %s: Unexpected exception %s: %s" %
            (testname, str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(
                out_f, "--> %s: Deleting - max iterations: %s" %
                (testname, _del_iters))
            # Retry the delete until pcluster acknowledges it or we run out
            # of iterations.
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster", "delete", "--config", test_filename,
                            "-nw", testname
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname,
                           (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1

            try:
                prochelp.exec_command(
                    [
                        "pcluster", "status", "--config", test_filename,
                        testname
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "--> %s: Finished" % testname)
        out_f.close()
Esempio n. 8
0
def run_test(region, distro, scheduler, instance_type, key_name, extra_args):
    """
    Creates a ParallelCluster cluster, runs cluster-check.sh on its master
    node (submit + scaledown_check phases) and tears the cluster down.

    Progress is logged to "<testname>-out.txt" and a marker file is left on
    exit: "<testname>.success", "<testname>.aborted" (killed/interrupted) or
    "<testname>.failed" (unexpected exception). Exceptions are re-raised
    after the marker is written; cluster deletion is attempted in `finally`.

    Args:
        region: AWS region the cluster is created in.
        distro: base OS; also selects the ssh username via `username_map`.
        scheduler: scheduler name, forwarded to cluster-check.sh.
        instance_type: EC2 instance type used for master and compute nodes.
        key_name: EC2 key pair name written into the cluster config.
        extra_args: dict with keys "key_path", "custom_cookbook_url",
            "custom_node_url" and "custom_template_url" (empty/None entries
            are skipped).
    """
    # Idle minutes before compute nodes scale down; also used to size the
    # sleep that waits for the scaledown to happen.
    scaledown_idletime = 2
    testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler,
                                   instance_type.replace(".", ""), _timestamp)
    test_filename = "%s-config.cfg" % testname
    key_path = extra_args["key_path"]
    custom_cookbook = extra_args["custom_cookbook_url"]
    custom_node = extra_args["custom_node_url"]
    custom_template = extra_args["custom_template_url"]

    print("--> %s: Starting" % (testname))

    # Write a per-test pcluster config file.
    # NOTE(review): `file` shadows the Python 2 builtin of the same name.
    file = open(test_filename, "w")
    file.write("[aws]\n")
    file.write("aws_region_name = %s\n" % region)
    file.write("[cluster default]\n")
    file.write("vpc_settings = public\n")
    file.write("key_name = %s\n" % key_name)
    file.write("base_os = %s\n" % distro)
    file.write("master_instance_type = %s\n" % instance_type)
    file.write("compute_instance_type = %s\n" % instance_type)
    file.write("initial_queue_size = 1\n")
    file.write("maintain_initial_size = false\n")
    file.write("scheduler = %s\n" % (scheduler))
    file.write("scaling_settings = custom\n")
    if custom_template:
        file.write("template_url = %s\n" % custom_template)
    if custom_cookbook:
        file.write("custom_chef_cookbook = %s\n" % custom_cookbook)
    if custom_node:
        file.write(
            'extra_json = { "cluster" : { "custom_node_package" : "%s" } }\n' %
            custom_node)
    file.write("[vpc public]\n")
    file.write("master_subnet_id = %s\n" % (setup[region]["subnet"]))
    file.write("vpc_id = %s\n" % (setup[region]["vpc"]))
    file.write("[global]\n")
    file.write("cluster_template = default\n")
    file.write("[scaling custom]\n")
    file.write("scaledown_idletime = %s\n" % scaledown_idletime)
    file.close()

    # Unbuffered log file so output survives a kill.
    # NOTE(review): buffering=0 with text mode is Python 2 only — it raises
    # ValueError on Python 3.
    out_f = open("%s-out.txt" % testname, "w", 0)

    master_ip = ""
    username = username_map[distro]
    # Flags consumed by the finally-block to decide whether a delete is needed.
    _create_interrupted = False
    _create_done = False
    try:
        # build the cluster
        prochelp.exec_command(
            ["pcluster", "create", "--config", test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        # get the master ip, which means grepping through pcluster status gorp
        dump = prochelp.exec_command(
            ["pcluster", "status", "--config", test_filename, testname],
            stderr=sub.STDOUT,
            universal_newlines=True)
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(
                out_f, "!! %s: Master IP not found; exiting !!" % (testname))
            raise ReleaseCheckException("--> %s: Master IP not found!" %
                                        testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        # run test on the cluster...
        # Non-interactive ssh: fail fast instead of prompting, keep the
        # connection alive during long-running checks.
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]
        if key_path:
            ssh_params.extend(["-i", key_path])

        # Copy the check script to the master node's home directory.
        prochelp.exec_command(
            ["scp"] + ssh_params + [
                os.path.join(_dirname(), "cluster-check.sh"),
                "%s@%s:." % (username, master_ip)
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        # Phase 1: submit test jobs through the scheduler.
        prochelp.exec_command(
            ["ssh", "-n"] + ssh_params + [
                "%s@%s" % (username, master_ip),
                "/bin/bash --login cluster-check.sh submit %s" % scheduler
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        # Sleep for scaledown_idletime to give time for the instances to scale down
        time.sleep(60 * scaledown_idletime)

        check_asg_capacity("parallelcluster-" + testname, region, out_f)

        # Phase 2: verify the compute fleet actually scaled down.
        prochelp.exec_command(
            ["ssh", "-n"] + ssh_params + [
                "%s@%s" % (username, master_ip),
                "/bin/bash --login cluster-check.sh scaledown_check %s" %
                scheduler
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        _double_writeln(out_f, "SUCCESS:  %s!!" % testname)
        # Empty marker file signals success to the harness.
        open("%s.success" % testname, "w").close()
    except prochelp.ProcessHelperError as exc:
        # KilledProcessError before create finished means the stack may or
        # may not exist yet — remember that so cleanup still tries a delete.
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f,
                            "--> %s: Interrupting pcluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % (testname))
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(
            out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % (testname))
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(
                out_f, "--> %s: Deleting - max iterations: %s" %
                (testname, _del_iters))
            # Retry the delete until CloudFormation acknowledges it; the
            # stack id may not be registered yet right after an interrupt.
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster", "delete", "--config", test_filename,
                            "-nw", testname
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname,
                           (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1

            # Log the final stack status for post-mortem inspection.
            try:
                prochelp.exec_command(
                    [
                        "pcluster", "status", "--config", test_filename,
                        testname
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
    print("--> %s: Finished" % (testname))
# Esempio n. 9
# 0
def run_test(region, distro, scheduler, instance_type, key_name, extra_args):
    """
    Legacy cfncluster variant: creates a cluster, runs cluster-check.sh on
    its master node and tears the cluster down.

    Progress is logged to "<testname>-out.txt" and a marker file is left on
    exit: "<testname>.success", "<testname>.aborted" or "<testname>.failed".
    Exceptions are re-raised after the marker is written; cluster deletion is
    attempted in `finally`.

    Args:
        region: AWS region the cluster is created in.
        distro: base OS; also selects the ssh username via `username_map`.
        scheduler: scheduler name, forwarded to cluster-check.sh.
        instance_type: EC2 instance type used for master and compute nodes.
        key_name: EC2 key pair name written into the cluster config.
        extra_args: dict with keys 'key_path', 'custom_cookbook_url',
            'custom_node_url' and 'custom_template_url' (empty/None entries
            are skipped).
    """
    testname = '%s-%s-%s-%s-%s' % (region, distro, scheduler,
                                   instance_type.replace('.', ''), _timestamp)
    test_filename = "%s-config.cfg" % testname
    key_path = extra_args['key_path']
    custom_cookbook = extra_args['custom_cookbook_url']
    custom_node = extra_args['custom_node_url']
    custom_template = extra_args['custom_template_url']

    print("--> %s: Starting" % (testname))

    # Write a per-test cfncluster config file.
    # NOTE(review): `file` shadows the Python 2 builtin of the same name.
    file = open(test_filename, "w")
    file.write("[aws]\n")
    file.write("aws_region_name = %s\n" % region)
    file.write("[cluster default]\n")
    file.write("vpc_settings = public\n")
    file.write("key_name = %s\n" % key_name)
    file.write("base_os = %s\n" % distro)
    file.write("master_instance_type = %s\n" % instance_type)
    file.write("compute_instance_type = %s\n" % instance_type)
    file.write("initial_queue_size = 1\n")
    file.write("maintain_initial_size = true\n")
    file.write("scheduler = %s\n" % (scheduler))
    file.write("scaling_settings = custom\n")
    if custom_template:
        file.write("template_url = %s\n" % custom_template)
    if custom_cookbook:
        file.write("custom_chef_cookbook = %s\n" % custom_cookbook)
    if custom_node:
        file.write(
            'extra_json = { "cfncluster" : { "custom_node_package" : "%s" } }\n'
            % custom_node)
    file.write("[vpc public]\n")
    file.write("master_subnet_id = %s\n" % (setup[region]['subnet']))
    file.write("vpc_id = %s\n" % (setup[region]['vpc']))
    file.write("[global]\n")
    file.write("cluster_template = default\n")
    file.write("[scaling custom]\n")
    file.write("scaling_adjustment = 1\n")
    file.write("scaling_period = 30\n")
    file.write("scaling_evaluation_periods = 1\n")
    file.write("scaling_cooldown = 300\n")
    file.close()

    # Unbuffered log file so output survives a kill.
    # NOTE(review): buffering=0 with text mode is Python 2 only — it raises
    # ValueError on Python 3.
    out_f = open('%s-out.txt' % testname, 'w', 0)

    master_ip = ''
    username = username_map[distro]
    # Flags consumed by the finally-block to decide whether a delete is needed.
    _create_interrupted = False
    _create_done = False
    try:
        # build the cluster
        prochelp.exec_command(
            ['cfncluster', 'create', '--config', test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        _create_done = True
        # get the master ip, which means grepping through cfncluster status gorp
        dump = prochelp.exec_command(
            ['cfncluster', 'status', '--config', test_filename, testname],
            stderr=sub.STDOUT,
            universal_newlines=True)
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search('MasterPublicIP: (.+)$', line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == '':
            _double_writeln(
                out_f, '!! %s: Master IP not found; exiting !!' % (testname))
            raise ReleaseCheckException('--> %s: Master IP not found!' %
                                        testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        # run test on the cluster...
        # Non-interactive ssh: fail fast instead of prompting, keep the
        # connection alive during long-running checks.
        ssh_params = ['-o', 'StrictHostKeyChecking=no']
        ssh_params += ['-o', 'BatchMode=yes']
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ['-o', 'ConnectTimeout=60']
        ssh_params += ['-o', 'ServerAliveCountMax=5']
        ssh_params += ['-o', 'ServerAliveInterval=30']
        if key_path:
            ssh_params.extend(['-i', key_path])

        # Copy the check script to the master node, then run it there.
        prochelp.exec_command(['scp'] + ssh_params + [
            os.path.join(_dirname(), 'cluster-check.sh'),
            '%s@%s:.' % (username, master_ip)
        ],
                              stdout=out_f,
                              stderr=sub.STDOUT,
                              universal_newlines=True)
        prochelp.exec_command(['ssh', '-n'] + ssh_params + [
            '%s@%s' % (username, master_ip),
            '/bin/bash --login cluster-check.sh %s' % scheduler
        ],
                              stdout=out_f,
                              stderr=sub.STDOUT,
                              universal_newlines=True)

        _double_writeln(out_f, 'SUCCESS:  %s!!' % (testname))
        # Empty marker file signals success to the harness.
        open('%s.success' % testname, 'w').close()
    except prochelp.ProcessHelperError as exc:
        # KilledProcessError before create finished means the stack may or
        # may not exist yet — remember that so cleanup still tries a delete.
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(
                out_f, "--> %s: Interrupting cfncluster create!" % testname)
        _double_writeln(out_f, '!! ABORTED: %s!!' % (testname))
        open('%s.aborted' % testname, 'w').close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(
            out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % (testname))
        open('%s.failed' % testname, 'w').close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(
                out_f, "--> %s: Deleting - max iterations: %s" %
                (testname, _del_iters))
            # Retry the delete until CloudFormation acknowledges it; the
            # stack id may not be registered yet right after an interrupt.
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output([
                        'cfncluster', 'delete', '--config', test_filename,
                        '-nw', testname
                    ],
                                                   stderr=sub.STDOUT,
                                                   universal_newlines=True)
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + '\n')
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'cfncluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'cfncluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname,
                           (_max_del_iters - _del_iters + 1), _del_done))
                    _del_iters -= 1

            # Log the final stack status for post-mortem inspection.
            try:
                prochelp.exec_command([
                    'cfncluster', 'status', '--config', test_filename, testname
                ],
                                      stdout=out_f,
                                      stderr=sub.STDOUT,
                                      universal_newlines=True)
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'cfncluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
    print("--> %s: Finished" % (testname))
def run_test(distro, clustername, mastersubnet, region):
    """
    Runs the EBS integration test for one distro/cluster-template pair.

    Creates an EBS volume and a snapshot of it in the subnet's availability
    zone, rewrites the distro config file to reference them, creates the
    cluster with pcluster, runs ebs-check.sh on the master node over ssh and
    finally deletes the cluster, the snapshot and the volume.

    Args:
        distro: OS distribution key; selects the ssh username via
            `username_map` and the "./config-<distro>" file.
        clustername: cluster template name inside the config file; also keys
            `testargs_map` for the ebs-check.sh arguments.
        mastersubnet: subnet id used to resolve the availability zone.
        region: AWS region where the EBS resources are created.

    Raises:
        ReleaseCheckException: when the volume/snapshot/master IP cannot be
            determined; any exception from the create/check phase is
            re-raised after writing the marker file.
    """
    # Random suffix keeps concurrent runs of the same distro/template apart.
    testname = ("%s-%s" % (distro, clustername)) + ''.join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
    print(testname)
    out_f = open('%s-out.txt' % testname, 'w')
    username = username_map[distro]
    _create_done = False
    _create_interrupted = False
    # Initialize everything the cleanup in `finally` depends on, so an early
    # failure (e.g. while creating the boto3 client) cannot raise NameError
    # there. `master_ip` likewise must exist before the "not found" check.
    _volume_id = ''
    _snap_id = ''
    master_ip = ''
    ec2 = None
    _az = _get_az(mastersubnet, region)
    _region = _az[:-1]

    try:
        ec2 = boto3.client('ec2', region_name=region)
        response_vol = ec2.create_volume(AvailabilityZone=_az, Size=10)
        _volume_id = response_vol['VolumeId']
        print("Volume created: %s" % _volume_id)

        if _volume_id == '':
            _double_writeln(out_f, '!! %s: Volume ID not found; exiting !!' % (testname))
            raise ReleaseCheckException('--> %s: Volume ID not found!' % testname)
        _double_writeln(out_f, "--> %s Volume ID: %s" % (testname, _volume_id))

        # Give the new volume a moment before snapshotting it.
        time.sleep(10)

        response_snapshot = ec2.create_snapshot(VolumeId=_volume_id)
        _snap_id = response_snapshot['SnapshotId']
        print("Snapshot created: %s" % _snap_id)

        # Fixed: the original re-checked `_volume_id` here (copy-paste bug),
        # so a missing snapshot id was never caught.
        if _snap_id == '':
            _double_writeln(out_f, '!! %s: Snapshot ID not found; exiting !!' % (testname))
            raise ReleaseCheckException('--> %s: Snapshot ID not found!' % testname)
        _double_writeln(out_f, "--> %s Snapshot ID: %s" % (testname, _snap_id))

        # Point the distro config's ebs_volume_id / ebs_snapshot_id entries at
        # the freshly created resources (in-place rewrite of the file).
        with open("./config-%s" % distro, "r") as config_in:
            rfile = config_in.read().split('\n')
        for index, line in enumerate(rfile):
            if re.search('ebs_volume_id', line):
                rfile[index] = 'ebs_volume_id = %s' % _volume_id
            if re.search('ebs_snapshot_id', line):
                rfile[index] = 'ebs_snapshot_id = %s' % _snap_id
        with open("./config-%s" % distro, "w") as config_out:
            config_out.write('\n'.join(rfile))

        print("Creating cluster...")
        prochelp.exec_command(
            ['pcluster', 'create', 'autoTest-%s' % testname, '--config', './config-%s' % distro,
             '--cluster-template', '%s' % clustername],
            stdout=out_f, stderr=sub.STDOUT, universal_newlines=True)
        _create_done = True
        # Grep the master's public IP out of the pcluster status output.
        dump = prochelp.exec_command(
            ['pcluster', 'status', 'autoTest-%s' % testname, '--config', './config-%s' % distro],
            stderr=sub.STDOUT, universal_newlines=True)
        for line in dump.splitlines():
            m = re.search('MasterPublicIP: (.+)$', line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == '':
            _double_writeln(out_f, '!! %s: Master IP not found; exiting !!' % (testname))
            raise ReleaseCheckException('--> %s: Master IP not found!' % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        time.sleep(10)

        # run test on the cluster...
        # Non-interactive ssh: fail fast instead of prompting, keep the
        # connection alive during long-running checks.
        ssh_params = ['-o', 'StrictHostKeyChecking=no']
        ssh_params += ['-o', 'BatchMode=yes']
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ['-o', 'ConnectTimeout=60']
        ssh_params += ['-o', 'ServerAliveCountMax=5']
        ssh_params += ['-o', 'ServerAliveInterval=30']

        print("Running tests...")
        prochelp.exec_command(
            ['scp'] + ssh_params + [os.path.join(_dirname(), 'ebs-check.sh'),
                                    '%s@%s:.' % (username, master_ip)],
            stdout=out_f, stderr=sub.STDOUT, universal_newlines=True)

        time.sleep(5)

        # The multi-volume templates also pass the volume/snapshot ids so
        # ebs-check.sh can verify the extra attachments.
        if clustername == 'custom3Vol' or clustername == 'custom5Vol':
            check_cmd = ('/bin/bash --login ebs-check.sh %s %s %s %s'
                         % (testargs_map[clustername], _region, _volume_id, _snap_id))
        else:
            check_cmd = ('/bin/bash --login ebs-check.sh %s %s'
                         % (testargs_map[clustername], _region))
        prochelp.exec_command(
            ['ssh', '-n'] + ssh_params + ['%s@%s' % (username, master_ip), check_cmd],
            stdout=out_f, stderr=sub.STDOUT, universal_newlines=True)

    except prochelp.ProcessHelperError as exc:
        # KilledProcessError before create finished means the stack may or
        # may not exist yet — remember that so cleanup still tries a delete.
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, '!! ABORTED: %s!!' % (testname))
        open('%s.aborted' % testname, 'w').close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % (testname))
        open('%s.failed' % testname, 'w').close()
        raise exc

    finally:
        print("Cleaning up!")
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        ['pcluster', 'delete', 'autoTest-%s' % testname, '--config', './config-%s' % distro],
                        stderr=sub.STDOUT, universal_newlines=True)
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + '\n')
                except sub.CalledProcessError as exc:
                    out_f.write("CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n" % (
                        str(exc), exc.output))
                except Exception as exc:
                    out_f.write("Unexpected exception launching 'pcluster delete' %s: %s\n" % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(out_f, "--> %s: Deleting - iteration: %s - successfully submitted: %s" % (
                        testname, (_max_del_iters - _del_iters + 1), _del_done))
                    _del_iters -= 1

            # Log the final stack status for post-mortem inspection.
            try:
                prochelp.exec_command(
                    ['pcluster', 'status', 'autoTest-%s' % testname, '--config', './config-%s' % distro],
                    stdout=out_f, stderr=sub.STDOUT, universal_newlines=True)
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write("Unexpected exception launching 'pcluster status' %s: %s\n" % (str(type(exc)), str(exc)))
        # Only delete EBS resources that were actually created (the original
        # called delete unconditionally, raising NameError / boto errors when
        # setup failed early), and close the log even if a deletion fails.
        try:
            if ec2 is not None:
                if _snap_id:
                    ec2.delete_snapshot(SnapshotId=_snap_id)
                if _volume_id:
                    ec2.delete_volume(VolumeId=_volume_id)
        finally:
            out_f.close()
    print("--> %s: Finished" % (testname))