def _execute_test_jobs_on_cluster(cluster_config, log_file):
    """
    Executes test jobs defined in cluster-check.sh on a given cluster.

    Args:
        cluster_config: named tuple of type ClusterConfig containing the configuration of the cluster.
        log_file: file where to write logs.
    """
    ssh_params = list(ssh_config_options)
    if cluster_config.key_path:
        ssh_params.extend(["-i", cluster_config.key_path])

    prochelp.exec_command(
        ["scp"]
        + ssh_params
        + [
            os.path.join(_dirname(), "cluster-check.sh"),
            "%s@%s:." % (cluster_config.username, cluster_config.master_node),
        ],
        stdout=log_file,
        stderr=sub.STDOUT,
        universal_newlines=True,
    )
    _exec_ssh_command(
        command="/bin/bash --login cluster-check.sh submit %s" % cluster_config.scheduler,
        username=cluster_config.username,
        host=cluster_config.master_node,
        key_path=cluster_config.key_path,
        stdout=log_file,
    )

def _exec_ssh_command(command, host, username, key_path, stdout=sub.PIPE, stderr=sub.STDOUT):
    """
    Executes an ssh command on a remote host.

    Args:
        command: command to execute.
        host: host where the command is executed.
        username: username used to ssh into the host.
        key_path: key used to ssh into the host.
        stdout: stdout redirection. Defaults to sub.PIPE.
        stderr: stderr redirection. Defaults to sub.STDOUT.

    Returns:
        the stdout for the executed command.
    """
    ssh_params = list(ssh_config_options)
    if key_path:
        ssh_params.extend(["-i", key_path])

    return prochelp.exec_command(
        ["ssh", "-n"] + ssh_params + ["%s@%s" % (username, host), command],
        stdout=stdout,
        stderr=stderr,
        universal_newlines=True,
    )

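
# A minimal usage sketch for _exec_ssh_command. The host, username, and key
# path below are hypothetical placeholders, and it assumes prochelp.exec_command
# returns the command's stdout as a string when stdout=sub.PIPE:
#
#   output = _exec_ssh_command(
#       command="/bin/bash --login cluster-check.sh submit sge",
#       host="203.0.113.10",                 # master node public IP (example)
#       username="ec2-user",
#       key_path="/home/user/.ssh/test-key.pem",
#   )
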
def _get_master_ip(cluster_config_file, cluster_name, log_file):
    """
    Retrieves the ip of the master node for a given cluster.

    Args:
        cluster_config_file: file containing the config of the cluster.
        cluster_name: name of the cluster.
        log_file: file where to write logs.

    Returns:
        master_ip: the ip of the master node.
    """
    master_ip = ""
    # get the master ip, which means grepping through pcluster status output
    dump = prochelp.exec_command(
        ["pcluster", "status", "--config", cluster_config_file, cluster_name],
        stderr=sub.STDOUT,
        universal_newlines=True,
    )
    dump_array = dump.splitlines()
    for line in dump_array:
        m = re.search("MasterPublicIP: (.+)$", line)
        if m:
            master_ip = m.group(1)
            break

    # Check master ip was correctly retrieved
    if master_ip == "":
        _double_writeln(
            log_file,
            "!! %s: Master IP not found. This usually occurs when cluster creation failed." % cluster_name,
        )
        raise ReleaseCheckException("--> %s: Master IP not found!" % cluster_name)

    _double_writeln(log_file, "--> %s Master IP: %s" % (cluster_name, master_ip))
    return master_ip

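
# Illustrative `pcluster status` output line matched by the regex above; the
# IP value is an example, and only this field is parsed from the output:
#
#   MasterPublicIP: 203.0.113.10
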
"r").read().split("\n") for index, line in enumerate(rfile): m = re.search("efs_fs_id", line) if m: rfile[index] = "efs_fs_id = %s" % fsrelated["fsid"] wfile = open("./config-%s-%s" % (region, distro), "w") wfile.write("\n".join(rfile)) wfile.close() print("Creating cluster...") prochelp.exec_command( [ "pcluster", "create", "autoTest-%s" % testname, "--config", "./config-%s-%s" % (region, distro) ], stdout=out_f, stderr=sub.STDOUT, universal_newlines=True, ) _create_done = True dump = prochelp.exec_command( [ "pcluster", "status", "autoTest-%s" % testname, "--config", "./config-%s-%s" % (region, distro) ], stderr=sub.STDOUT, universal_newlines=True, ) dump_array = dump.splitlines()
def run_test(distro, clustername, mastersubnet, region):
    testname = ("%s-%s" % (distro, clustername)) + "".join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
    print(testname)
    out_f = open("%s-out.txt" % testname, "w")
    username = username_map[distro]
    create_done = False
    create_interrupted = False
    volume_id = ""
    az = _get_az(mastersubnet, region)
    region = az[:-1]
    try:
        ec2 = boto3.client("ec2", region_name=region)
        response_vol = ec2.create_volume(AvailabilityZone=az, Size=10)
        volume_id = response_vol["VolumeId"]
        print("Volume created: %s" % volume_id)
        if volume_id == "":
            _double_writeln(out_f, "!! %s: Volume ID not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Volume ID not found!" % testname)
        _double_writeln(out_f, "--> %s Volume ID: %s" % (testname, volume_id))

        print("Preparing volume...")
        while True:
            response_vol = ec2.describe_volumes(VolumeIds=[volume_id])
            vol_state = response_vol["Volumes"][0]["State"]
            if vol_state == "available":
                print("Volume is good to go!")
                break
            time.sleep(5)

        response_snapshot = ec2.create_snapshot(VolumeId=volume_id)
        snap_id = response_snapshot["SnapshotId"]
        print("Snapshot created: %s" % snap_id)
        if snap_id == "":
            _double_writeln(out_f, "!! %s: Snapshot ID not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Snapshot ID not found!" % testname)
        _double_writeln(out_f, "--> %s Snapshot ID: %s" % (testname, snap_id))

        print("Preparing snapshot...")
        while True:
            response_snap = ec2.describe_snapshots(SnapshotIds=[snap_id])
            snap_state = response_snap["Snapshots"][0]["State"]
            if snap_state == "completed":
                print("Snapshot is good to go!")
                break
            time.sleep(5)

        rfile = open("./config-%s-%s" % (region, distro), "r").read().split("\n")
        for index, line in enumerate(rfile):
            m = re.search("ebs_volume_id", line)
            if m:
                rfile[index] = "ebs_volume_id = %s" % volume_id
            n = re.search("ebs_snapshot_id", line)
            if n:
                rfile[index] = "ebs_snapshot_id = %s" % snap_id
        wfile = open("./config-%s-%s" % (region, distro), "w")
        wfile.write("\n".join(rfile))
        wfile.close()

        print("Creating cluster...")
        prochelp.exec_command(
            [
                "pcluster", "create", "autoTest-%s" % testname,
                "--config", "./config-%s-%s" % (region, distro),
                "--cluster-template", "%s" % clustername,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        create_done = True
        dump = prochelp.exec_command(
            [
                "pcluster", "status", "autoTest-%s" % testname,
                "--config", "./config-%s-%s" % (region, distro),
            ],
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        master_ip = ""
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(out_f, "!! %s: Master IP not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Master IP not found!" % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))
        time.sleep(10)

        # run test on the cluster...
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]

        print("Running tests...")
        prochelp.exec_command(
            ["scp"]
            + ssh_params
            + [
                os.path.join(_dirname(), "ebs-check.sh"),
                "%s@%s:." % (username, master_ip),
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        time.sleep(5)
        if clustername == "custom3Vol" or clustername == "custom5Vol":
            prochelp.exec_command(
                ["ssh", "-n"]
                + ssh_params
                + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login ebs-check.sh %s %s %s %s"
                    % (testargs_map[clustername], region, volume_id, snap_id),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        else:
            prochelp.exec_command(
                ["ssh", "-n"]
                + ssh_params
                + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login ebs-check.sh %s %s"
                    % (testargs_map[clustername], region),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
    except prochelp.ProcessHelperError as exc:
        if not create_done and isinstance(exc, prochelp.KilledProcessError):
            create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % testname)
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not create_done:
            create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        print("Cleaning up!")
        if create_interrupted or create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster", "delete", "autoTest-%s" % testname,
                            "--config", "./config-%s-%s" % (region, distro),
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    [
                        "pcluster", "status", "autoTest-%s" % testname,
                        "--config", "./config-%s-%s" % (region, distro),
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation
                # the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        ec2.delete_snapshot(SnapshotId=snap_id)
        ec2.delete_volume(VolumeId=volume_id)
        out_f.close()
        print("--> %s: Finished" % testname)

def run_test(distro, clustername, mastersubnet, region):
    testname = ("%s-%s" % (distro, clustername)) + "".join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
    print(testname)
    out_f = open("%s-out.txt" % testname, "w")
    username = username_map[distro]
    _create_done = False
    _create_interrupted = False
    _volume_id = ""
    _az = _get_az(mastersubnet, region)
    _region = _az[:-1]
    try:
        print("Creating cluster...")
        prochelp.exec_command(
            [
                "pcluster", "create", "autoTest-%s" % testname,
                "--config", "./config-%s-%s" % (_region, distro),
                "--cluster-template", "%s" % clustername,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        dump = prochelp.exec_command(
            [
                "pcluster", "status", "autoTest-%s" % testname,
                "--config", "./config-%s-%s" % (_region, distro),
            ],
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        # initialize before the loop so the emptiness check below is safe
        master_ip = ""
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(out_f, "!! %s: Master IP not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Master IP not found!" % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))
        time.sleep(10)

        # run test on the cluster...
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]

        print("Running tests...")
        prochelp.exec_command(
            ["scp"]
            + ssh_params
            + [
                os.path.join(_dirname(), "raid-check.sh"),
                "%s@%s:." % (username, master_ip),
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        time.sleep(5)
        if clustername == "default":
            prochelp.exec_command(
                ["ssh", "-n"]
                + ssh_params
                + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login raid-check.sh",
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        else:
            prochelp.exec_command(
                ["ssh", "-n"]
                + ssh_params
                + [
                    "%s@%s" % (username, master_ip),
                    "/bin/bash --login raid-check.sh %s" % raid_path,
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True,
            )
        print("Test passed...")
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % testname)
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        print("Cleaning up!")
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        [
                            "pcluster", "delete", "autoTest-%s" % testname,
                            "--config", "./config-%s-%s" % (_region, distro),
                        ],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    [
                        "pcluster", "status", "autoTest-%s" % testname,
                        "--config", "./config-%s-%s" % (_region, distro),
                    ],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually terminates with exit status 1 since at the end of the delete operation the stack is not found
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
        print("--> %s: Finished" % testname)

def run_test(region, distro, scheduler, instance_type, key_name, expected_asg_capacity,
             expected_compute_nodes, extra_args):
    _create_interrupted = False
    _create_done = False
    testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler, instance_type.replace(".", ""), _timestamp)
    test_filename = "%s-config.cfg" % testname
    out_f = open("%s-out.txt" % testname, "w", 0)

    # Test jobs should take at most 9 minutes to be executed.
    # This guarantees that the jobs are executed in parallel.
    max_jobs_exec_time = 9 * 60

    try:
        _double_writeln(out_f, "--> %s: Starting" % testname)

        cluster_config = ClusterConfig(
            config_file=test_filename,
            stack_name="parallelcluster-" + testname,
            region=region,
            distro=distro,
            instance_type=instance_type,
            scheduler=scheduler,
            username=username_map[distro],
            key_path=extra_args["key_path"],
            key_name=key_name,
            master_node="",
            scaledown_idletime=4,
        )
        _write_pcluster_config(cluster_config=cluster_config, extra_args=extra_args)
        _double_writeln(out_f, "--> %s: Created pcluster config file %s" % (testname, test_filename))

        # build the cluster
        _double_writeln(out_f, "--> %s: Creating the cluster" % testname)
        prochelp.exec_command(
            ["pcluster", "create", "--config", test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        _double_writeln(out_f, "--> %s: Cluster created successfully" % testname)

        cluster_config = cluster_config._replace(
            master_node=_get_master_ip(cluster_config_file=test_filename, cluster_name=testname, log_file=out_f))

        _double_writeln(out_f, "--> %s: Executing test jobs on cluster." % testname)
        _execute_test_jobs_on_cluster(cluster_config=cluster_config, log_file=out_f)
        _double_writeln(out_f, "--> %s: Test jobs successfully started" % testname)

        _double_writeln(out_f, "--> %s: Monitoring asg capacity and compute nodes" % testname)
        additional_watching_time = 5 * 60
        asg_capacity_time_series, compute_nodes_time_series, timestamps = _watch_compute_nodes_allocation(
            duration=max_jobs_exec_time + cluster_config.scaledown_idletime * 60 + additional_watching_time,
            frequency=20,
            cluster_config=cluster_config,
        )
        _double_writeln(
            out_f,
            "--> %s: Monitoring completed: %s, %s, %s"
            % (
                testname,
                "asg_capacity_time_series [" + " ".join(map(str, asg_capacity_time_series)) + "]",
                "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]",
                "timestamps [" + " ".join(map(str, timestamps)) + "]",
            ),
        )

        _double_writeln(out_f, "--> %s: Verifying test jobs completed successfully" % testname)
        # jobs need to complete in 9 mins in order to verify parallelism
        _assert_test_jobs_completed(
            cluster_config=cluster_config, max_jobs_exec_time=max_jobs_exec_time, log_file=out_f)
        _double_writeln(out_f, "--> %s: Test jobs completed successfully" % testname)

        _double_writeln(out_f, "--> %s: Verifying auto-scaling worked correctly" % testname)
        _assert_scaling_works(
            asg_capacity_time_series=asg_capacity_time_series,
            compute_nodes_time_series=compute_nodes_time_series,
            expected_asg_capacity=expected_asg_capacity,
            expected_compute_nodes=expected_compute_nodes,
        )
        _double_writeln(out_f, "--> %s: Autoscaling worked as expected" % testname)

        _double_writeln(out_f, "SUCCESS: %s!!" % testname)
        open("%s.success" % testname, "w").close()
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting pcluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % testname)
        open("%s.aborted" % testname, "w").close()
        raise exc
    except AssertionError as err:
        _double_writeln(out_f, "--> %s: Test assertion failed: %s" % (testname, str(err)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise err
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "--> %s: Unexpected exception %s: %s" % (testname, str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        ["pcluster", "delete", "--config", test_filename, "-nw", testname],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    ["pcluster", "status", "--config", test_filename, testname],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "--> %s: Finished" % testname)
        out_f.close()

def run_test(region, distro, scheduler, instance_type, key_name, extra_args):
    scaledown_idletime = 2
    testname = "%s-%s-%s-%s-%s" % (region, distro, scheduler, instance_type.replace(".", ""), _timestamp)
    test_filename = "%s-config.cfg" % testname
    key_path = extra_args["key_path"]
    custom_cookbook = extra_args["custom_cookbook_url"]
    custom_node = extra_args["custom_node_url"]
    custom_template = extra_args["custom_template_url"]

    print("--> %s: Starting" % testname)

    file = open(test_filename, "w")
    file.write("[aws]\n")
    file.write("aws_region_name = %s\n" % region)
    file.write("[cluster default]\n")
    file.write("vpc_settings = public\n")
    file.write("key_name = %s\n" % key_name)
    file.write("base_os = %s\n" % distro)
    file.write("master_instance_type = %s\n" % instance_type)
    file.write("compute_instance_type = %s\n" % instance_type)
    file.write("initial_queue_size = 1\n")
    file.write("maintain_initial_size = false\n")
    file.write("scheduler = %s\n" % scheduler)
    file.write("scaling_settings = custom\n")
    if custom_template:
        file.write("template_url = %s\n" % custom_template)
    if custom_cookbook:
        file.write("custom_chef_cookbook = %s\n" % custom_cookbook)
    if custom_node:
        file.write('extra_json = { "cluster" : { "custom_node_package" : "%s" } }\n' % custom_node)
    file.write("[vpc public]\n")
    file.write("master_subnet_id = %s\n" % setup[region]["subnet"])
    file.write("vpc_id = %s\n" % setup[region]["vpc"])
    file.write("[global]\n")
    file.write("cluster_template = default\n")
    file.write("[scaling custom]\n")
    file.write("scaledown_idletime = %s\n" % scaledown_idletime)
    file.close()

    out_f = open("%s-out.txt" % testname, "w", 0)
    master_ip = ""
    username = username_map[distro]
    _create_interrupted = False
    _create_done = False
    try:
        # build the cluster
        prochelp.exec_command(
            ["pcluster", "create", "--config", test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        _create_done = True
        # get the master ip, which means grepping through pcluster status output
        dump = prochelp.exec_command(
            ["pcluster", "status", "--config", test_filename, testname],
            stderr=sub.STDOUT,
            universal_newlines=True)
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search("MasterPublicIP: (.+)$", line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == "":
            _double_writeln(out_f, "!! %s: Master IP not found; exiting !!" % testname)
            raise ReleaseCheckException("--> %s: Master IP not found!" % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        # run test on the cluster...
        ssh_params = ["-o", "StrictHostKeyChecking=no"]
        ssh_params += ["-o", "BatchMode=yes"]
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ["-o", "ConnectTimeout=60"]
        ssh_params += ["-o", "ServerAliveCountMax=5"]
        ssh_params += ["-o", "ServerAliveInterval=30"]
        if key_path:
            ssh_params.extend(["-i", key_path])

        prochelp.exec_command(
            ["scp"]
            + ssh_params
            + [
                os.path.join(_dirname(), "cluster-check.sh"),
                "%s@%s:." % (username, master_ip),
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )
        prochelp.exec_command(
            ["ssh", "-n"]
            + ssh_params
            + [
                "%s@%s" % (username, master_ip),
                "/bin/bash --login cluster-check.sh submit %s" % scheduler,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        # Sleep for scaledown_idletime to give time for the instances to scale down
        time.sleep(60 * scaledown_idletime)

        check_asg_capacity("parallelcluster-" + testname, region, out_f)

        prochelp.exec_command(
            ["ssh", "-n"]
            + ssh_params
            + [
                "%s@%s" % (username, master_ip),
                "/bin/bash --login cluster-check.sh scaledown_check %s" % scheduler,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True,
        )

        _double_writeln(out_f, "SUCCESS: %s!!" % testname)
        open("%s.success" % testname, "w").close()
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting pcluster create!" % testname)
        _double_writeln(out_f, "!! ABORTED: %s!!" % testname)
        open("%s.aborted" % testname, "w").close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open("%s.failed" % testname, "w").close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        ["pcluster", "delete", "--config", test_filename, "-nw", testname],
                        stderr=sub.STDOUT,
                        universal_newlines=True,
                    )
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + "\n")
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done),
                    )
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    ["pcluster", "status", "--config", test_filename, testname],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True,
                )
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
        print("--> %s: Finished" % testname)

def run_test(region, distro, scheduler, instance_type, key_name, extra_args):
    testname = '%s-%s-%s-%s-%s' % (region, distro, scheduler, instance_type.replace('.', ''), _timestamp)
    test_filename = "%s-config.cfg" % testname
    key_path = extra_args['key_path']
    custom_cookbook = extra_args['custom_cookbook_url']
    custom_node = extra_args['custom_node_url']
    custom_template = extra_args['custom_template_url']

    print("--> %s: Starting" % testname)

    file = open(test_filename, "w")
    file.write("[aws]\n")
    file.write("aws_region_name = %s\n" % region)
    file.write("[cluster default]\n")
    file.write("vpc_settings = public\n")
    file.write("key_name = %s\n" % key_name)
    file.write("base_os = %s\n" % distro)
    file.write("master_instance_type = %s\n" % instance_type)
    file.write("compute_instance_type = %s\n" % instance_type)
    file.write("initial_queue_size = 1\n")
    file.write("maintain_initial_size = true\n")
    file.write("scheduler = %s\n" % scheduler)
    file.write("scaling_settings = custom\n")
    if custom_template:
        file.write("template_url = %s\n" % custom_template)
    if custom_cookbook:
        file.write("custom_chef_cookbook = %s\n" % custom_cookbook)
    if custom_node:
        file.write('extra_json = { "cfncluster" : { "custom_node_package" : "%s" } }\n' % custom_node)
    file.write("[vpc public]\n")
    file.write("master_subnet_id = %s\n" % setup[region]['subnet'])
    file.write("vpc_id = %s\n" % setup[region]['vpc'])
    file.write("[global]\n")
    file.write("cluster_template = default\n")
    file.write("[scaling custom]\n")
    file.write("scaling_adjustment = 1\n")
    file.write("scaling_period = 30\n")
    file.write("scaling_evaluation_periods = 1\n")
    file.write("scaling_cooldown = 300\n")
    file.close()

    out_f = open('%s-out.txt' % testname, 'w', 0)
    master_ip = ''
    username = username_map[distro]
    _create_interrupted = False
    _create_done = False
    try:
        # build the cluster
        prochelp.exec_command(
            ['cfncluster', 'create', '--config', test_filename, testname],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        _create_done = True
        # get the master ip, which means grepping through cfncluster status output
        dump = prochelp.exec_command(
            ['cfncluster', 'status', '--config', test_filename, testname],
            stderr=sub.STDOUT,
            universal_newlines=True)
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search('MasterPublicIP: (.+)$', line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == '':
            _double_writeln(out_f, '!! %s: Master IP not found; exiting !!' % testname)
            raise ReleaseCheckException('--> %s: Master IP not found!' % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))

        # run test on the cluster...
        ssh_params = ['-o', 'StrictHostKeyChecking=no']
        ssh_params += ['-o', 'BatchMode=yes']
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ['-o', 'ConnectTimeout=60']
        ssh_params += ['-o', 'ServerAliveCountMax=5']
        ssh_params += ['-o', 'ServerAliveInterval=30']
        if key_path:
            ssh_params.extend(['-i', key_path])

        prochelp.exec_command(
            ['scp'] + ssh_params + [
                os.path.join(_dirname(), 'cluster-check.sh'),
                '%s@%s:.' % (username, master_ip),
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        prochelp.exec_command(
            ['ssh', '-n'] + ssh_params + [
                '%s@%s' % (username, master_ip),
                '/bin/bash --login cluster-check.sh %s' % scheduler,
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        _double_writeln(out_f, 'SUCCESS: %s!!' % testname)
        open('%s.success' % testname, 'w').close()
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting cfncluster create!" % testname)
        _double_writeln(out_f, '!! ABORTED: %s!!' % testname)
        open('%s.aborted' % testname, 'w').close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open('%s.failed' % testname, 'w').close()
        raise exc
    finally:
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        ['cfncluster', 'delete', '--config', test_filename, '-nw', testname],
                        stderr=sub.STDOUT,
                        universal_newlines=True)
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + '\n')
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'cfncluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'cfncluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done))
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    ['cfncluster', 'status', '--config', test_filename, testname],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True)
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'cfncluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        out_f.close()
        print("--> %s: Finished" % testname)

def run_test(distro, clustername, mastersubnet, region):
    testname = ("%s-%s" % (distro, clustername)) + ''.join(
        random.choice(string.ascii_uppercase + string.digits) for _ in range(8))
    print(testname)
    out_f = open('%s-out.txt' % testname, 'w')
    username = username_map[distro]
    _create_done = False
    _create_interrupted = False
    _volume_id = ''
    _az = _get_az(mastersubnet, region)
    _region = _az[:-1]
    try:
        ec2 = boto3.client('ec2', region_name=region)
        response_vol = ec2.create_volume(AvailabilityZone=_az, Size=10)
        _volume_id = response_vol['VolumeId']
        print("Volume created: %s" % _volume_id)
        if _volume_id == '':
            _double_writeln(out_f, '!! %s: Volume ID not found; exiting !!' % testname)
            raise ReleaseCheckException('--> %s: Volume ID not found!' % testname)
        _double_writeln(out_f, "--> %s Volume ID: %s" % (testname, _volume_id))
        time.sleep(10)

        response_snapshot = ec2.create_snapshot(VolumeId=_volume_id)
        _snap_id = response_snapshot['SnapshotId']
        print("Snapshot created: %s" % _snap_id)
        if _snap_id == '':
            _double_writeln(out_f, '!! %s: Snapshot ID not found; exiting !!' % testname)
            raise ReleaseCheckException('--> %s: Snapshot ID not found!' % testname)
        _double_writeln(out_f, "--> %s Snapshot ID: %s" % (testname, _snap_id))

        rfile = open("./config-%s" % distro, "r").read().split('\n')
        for index, line in enumerate(rfile):
            m = re.search('ebs_volume_id', line)
            if m:
                rfile[index] = 'ebs_volume_id = %s' % _volume_id
            n = re.search('ebs_snapshot_id', line)
            if n:
                rfile[index] = 'ebs_snapshot_id = %s' % _snap_id
        wfile = open("./config-%s" % distro, "w")
        wfile.write('\n'.join(rfile))
        wfile.close()

        print("Creating cluster...")
        prochelp.exec_command(
            ['pcluster', 'create', 'autoTest-%s' % testname,
             '--config', './config-%s' % distro,
             '--cluster-template', '%s' % clustername],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        _create_done = True
        dump = prochelp.exec_command(
            ['pcluster', 'status', 'autoTest-%s' % testname,
             '--config', './config-%s' % distro],
            stderr=sub.STDOUT,
            universal_newlines=True)
        # initialize before the loop so the emptiness check below is safe
        master_ip = ''
        dump_array = dump.splitlines()
        for line in dump_array:
            m = re.search('MasterPublicIP: (.+)$', line)
            if m:
                master_ip = m.group(1)
                break
        if master_ip == '':
            _double_writeln(out_f, '!! %s: Master IP not found; exiting !!' % testname)
            raise ReleaseCheckException('--> %s: Master IP not found!' % testname)
        _double_writeln(out_f, "--> %s Master IP: %s" % (testname, master_ip))
        time.sleep(10)

        # run test on the cluster...
        ssh_params = ['-o', 'StrictHostKeyChecking=no']
        ssh_params += ['-o', 'BatchMode=yes']
        # ssh_params += ['-o', 'ConnectionAttempts=30']
        ssh_params += ['-o', 'ConnectTimeout=60']
        ssh_params += ['-o', 'ServerAliveCountMax=5']
        ssh_params += ['-o', 'ServerAliveInterval=30']

        print("Running tests...")
        prochelp.exec_command(
            ['scp'] + ssh_params + [
                os.path.join(_dirname(), 'ebs-check.sh'),
                '%s@%s:.' % (username, master_ip),
            ],
            stdout=out_f,
            stderr=sub.STDOUT,
            universal_newlines=True)
        time.sleep(5)
        if clustername == 'custom3Vol' or clustername == 'custom5Vol':
            prochelp.exec_command(
                ['ssh', '-n'] + ssh_params + [
                    '%s@%s' % (username, master_ip),
                    '/bin/bash --login ebs-check.sh %s %s %s %s'
                    % (testargs_map[clustername], _region, _volume_id, _snap_id),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True)
        else:
            prochelp.exec_command(
                ['ssh', '-n'] + ssh_params + [
                    '%s@%s' % (username, master_ip),
                    '/bin/bash --login ebs-check.sh %s %s'
                    % (testargs_map[clustername], _region),
                ],
                stdout=out_f,
                stderr=sub.STDOUT,
                universal_newlines=True)
    except prochelp.ProcessHelperError as exc:
        if not _create_done and isinstance(exc, prochelp.KilledProcessError):
            _create_interrupted = True
            _double_writeln(out_f, "--> %s: Interrupting AWS ParallelCluster create!" % testname)
        _double_writeln(out_f, '!! ABORTED: %s!!' % testname)
        open('%s.aborted' % testname, 'w').close()
        raise exc
    except Exception as exc:
        if not _create_done:
            _create_interrupted = True
        _double_writeln(out_f, "Unexpected exception %s: %s" % (str(type(exc)), str(exc)))
        _double_writeln(out_f, "!! FAILURE: %s!!" % testname)
        open('%s.failed' % testname, 'w').close()
        raise exc
    finally:
        print("Cleaning up!")
        if _create_interrupted or _create_done:
            # if the create process was interrupted it may take a few seconds for the stack id to be actually registered
            _max_del_iters = _del_iters = 10
        else:
            # No delete is necessary if cluster creation wasn't started (process_helper.AbortedProcessError)
            _del_iters = 0
        if _del_iters > 0:
            _del_done = False
            _double_writeln(out_f, "--> %s: Deleting - max iterations: %s" % (testname, _del_iters))
            while not _del_done and _del_iters > 0:
                try:
                    time.sleep(2)
                    # clean up the cluster
                    _del_output = sub.check_output(
                        ['pcluster', 'delete', 'autoTest-%s' % testname,
                         '--config', './config-%s' % distro],
                        stderr=sub.STDOUT,
                        universal_newlines=True)
                    _del_done = "DELETE_IN_PROGRESS" in _del_output or "DELETE_COMPLETE" in _del_output
                    out_f.write(_del_output + '\n')
                except sub.CalledProcessError as exc:
                    out_f.write(
                        "CalledProcessError exception launching 'pcluster delete': %s - Output:\n%s\n"
                        % (str(exc), exc.output))
                except Exception as exc:
                    out_f.write(
                        "Unexpected exception launching 'pcluster delete' %s: %s\n"
                        % (str(type(exc)), str(exc)))
                finally:
                    _double_writeln(
                        out_f,
                        "--> %s: Deleting - iteration: %s - successfully submitted: %s"
                        % (testname, (_max_del_iters - _del_iters + 1), _del_done))
                    _del_iters -= 1
            try:
                prochelp.exec_command(
                    ['pcluster', 'status', 'autoTest-%s' % testname,
                     '--config', './config-%s' % distro],
                    stdout=out_f,
                    stderr=sub.STDOUT,
                    universal_newlines=True)
            except (prochelp.ProcessHelperError, sub.CalledProcessError):
                # Usually it terminates with exit status 1 since at the end of the delete operation the stack is not found.
                pass
            except Exception as exc:
                out_f.write(
                    "Unexpected exception launching 'pcluster status' %s: %s\n"
                    % (str(type(exc)), str(exc)))
        ec2.delete_snapshot(SnapshotId=_snap_id)
        ec2.delete_volume(VolumeId=_volume_id)
        out_f.close()
        print("--> %s: Finished" % testname)