Beispiel #1
0
def fatal_warn(msg, exit=True):
    """Report a (possibly multi-line) failure and optionally abort.

    Every line of ``msg`` is emitted as its own ``agenda.subfailure`` entry,
    framed by one blank line before and one after.  When ``exit`` is true the
    process is terminated with status 1 through ``sys.exit``; otherwise the
    function simply returns.
    """
    print()
    lines = msg.split("\n")
    for line in lines:
        agenda.subfailure(line)
    print()
    if not exit:
        return
    sys.exit(1)
Beispiel #2
0
def expect(res, msg):
    """Report a failed command result and hand the result back.

    When ``res`` is truthy and its ``exited`` attribute is truthy (i.e. a
    non-zero exit code), ``msg`` is logged as a sub-failure and the exit
    code, command, stdout and stderr of the result are printed.  ``res`` is
    returned unchanged in every case.
    """
    # Nothing to report for a missing result or a zero exit code.
    if not res or not res.exited:
        return res

    agenda.subfailure(msg)
    report = "exit code: {}\ncommand: {}\nstdout: {}\nstderr: {}"
    print(report.format(res.exited, res.command, res.stdout, res.stderr))
    return res
Beispiel #3
0
 def check_file(self, grep, where):
     """Exit the program unless ``grep`` is found in the file ``where``.

     The search is done by running ``grep "<grep>" <where>`` through
     ``self.run``.  On a non-zero exit code a sub-failure is logged, the
     output of ``tail <where>`` is printed (only if that command succeeds)
     and the process exits with status 1.
     """
     found = self.run(f'grep "{grep}" {where}')
     if found.exited == 0:
         return

     agenda.subfailure(
         f'Unable to find search string ("{grep}") in process output file {where}'
     )
     # Show the end of the file to help diagnose the missing string.
     tail = self.run(f"tail {where}")
     if tail.exited == 0:
         print(tail.command)
         print(tail.stdout)
     sys.exit(1)
Beispiel #4
0
 def check_proc(self, proc_name, proc_out):
     """Exit the program unless a process named ``proc_name`` is running.

     ``pgrep <proc_name>`` is run through ``self.run``.  On a non-zero exit
     code a sub-failure mentioning ``self.addr`` is logged and
     ``tail <proc_out>`` is run: its command and stdout are printed when it
     succeeds, the raw result object otherwise.  The process then exits with
     status 1.
     """
     lookup = self.run(f"pgrep {proc_name}")
     if lookup.exited == 0:
         return

     agenda.subfailure(
         f'failed to find running process with name "{proc_name}" on {self.addr}'
     )
     tail = self.run(f"tail {proc_out}")
     if tail.exited != 0:
         print(tail)
     else:
         print(tail.command)
         print(tail.stdout)
     sys.exit(1)
Beispiel #5
0
def check_existing_experiment(driver):
    """Open the first experiment listed on the CloudLab dashboard, if any.

    Args:
        driver: a Selenium WebDriver (uses the ``find_element_by_*`` API).

    Returns:
        Whatever ``get_machines_from_experiment(driver)`` returns after the
        first listed experiment has been opened, or ``None`` when the
        experiments table is absent or lists nothing.
    """
    agenda.task("Check for existing experiment")
    driver.get("https://www.cloudlab.us/user-dashboard.php#experiments")
    try:
        table = driver.find_element_by_id("experiments_table")
    except Exception:
        # Only ordinary lookup errors mean "no experiment"; Ctrl-C and
        # SystemExit must propagate instead of being reported as a miss.
        agenda.subfailure("No existing experiment found")
        return None

    # Keep the first whitespace-separated token of every non-empty table body.
    names = []
    for body in table.find_elements_by_xpath("//table/tbody"):
        tokens = body.text.split()
        if tokens:
            names.append(tokens[0])

    if not names:
        # The table exists but has no usable rows: treat it like a missing
        # table rather than failing with an IndexError below.
        agenda.subfailure("No existing experiment found")
        return None

    agenda.subtask("Existing experiment found")
    driver.find_element_by_link_text(names[0]).click()
    # Presumably gives the experiment page time to load before it is scraped.
    time.sleep(6)
    return get_machines_from_experiment(driver)
Beispiel #6
0
def run_loads(conn, server, redis_addr, outf, wrkfile):
    """Run the ycsb ``--loads-only`` phase on ``conn`` until it succeeds.

    Each attempt starts ``iokerneld`` in the background, runs the ycsb
    client with a 30 second timeout and then always sends SIGINT to
    ``iokerneld`` again.  The attempt is repeated for as long as the client
    raises, returns no result, or exits with a non-zero status.

    Args:
        conn: connection on which the commands are run.
        server: address passed to ycsb as ``--addr <server>:4242``.
        redis_addr: value passed to ycsb as ``--redis-addr``.
        outf: prefix of the ``-loads.out`` / ``-loads.err`` output files.
        wrkfile: value passed to ycsb as ``--accesses``.
    """
    conn.run("sudo pkill -INT iokerneld")

    write_shenango_config(conn)
    while True:
        conn.run("./iokerneld",
                 wd="~/burrito/shenango-chunnel/caladan",
                 sudo=True,
                 background=True)
        time.sleep(2)
        loads_start = time.time()
        agenda.subtask("loads client starting")
        try:
            ok = conn.run(
                f"RUST_LOG=info,ycsb=debug,bertha=debug ./target/release/ycsb \
                    --addr {server}:4242 \
                    --redis-addr={redis_addr} \
                    -i 1000 \
                    --accesses {wrkfile} \
                    -s host.config \
                    --logging --tracing --loads-only",
                wd="~/burrito",
                stdout=f"{outf}-loads.out",
                stderr=f"{outf}-loads.err",
                timeout=30,
            )
        except Exception as e:
            # Catch only ordinary errors so Ctrl-C can still break out of the
            # retry loop; report the failure once, with its cause, and retry.
            agenda.subfailure(
                f"loads failed ({e}), retrying after {time.time() - loads_start} s")
            continue
        finally:
            # Runs on success, failure and `continue` alike.
            conn.run("sudo pkill -INT iokerneld")

        if ok is None or ok.exited != 0:
            agenda.subfailure(
                f"loads failed, retrying after {time.time() - loads_start} s")
            continue

        agenda.subtask(f"loads client done: {time.time() - loads_start} s")
        break
Beispiel #7
0
def check(ok, msg, addr, allowed=None):
    """Exit with status 1 if a command result has an unacceptable exit code.

    Args:
        ok: command result exposing ``exited``, ``stdout`` and ``stderr``.
        msg: description of the step, used in the failure message.
        addr: address of the machine the command ran on.
        allowed: extra exit codes to accept in addition to 0.

    Raises:
        SystemExit: when the exit code is neither 0 nor in ``allowed``.
    """
    # Without this declaration the assignment below would only create a
    # local that is discarded when sys.exit() unwinds the function.
    global thread_ok

    # exit code 0 is always ok, allowed is in addition
    allowed = [] if allowed is None else allowed
    if ok.exited == 0 or ok.exited in allowed:
        return

    agenda.subfailure(f"{msg} on {addr}: {ok.exited} not in {allowed}")
    agenda.subfailure("stdout")
    print(ok.stdout)
    agenda.subfailure("stderr")
    print(ok.stderr)
    thread_ok = False  # something went wrong.
    sys.exit(1)
def get_asn(ip):
    """Return ``ip2asn_bgp(ip)``, or ``None`` if the lookup raises.

    Any ``Exception`` raised by the lookup is logged as a sub-failure
    together with the offending address.
    """
    try:
        asn = ip2asn_bgp(ip)
    except Exception as err:
        agenda.subfailure(f"ip {ip}: error {err}")
        asn = None
    return asn
Beispiel #9
0
#!/usr/bin/env python3
import agenda

# Demonstration of agenda's output levels: the calls below only emit
# section / task / subtask / subfailure lines interleaved with plain prints;
# no other work is performed by these statements.
agenda.section("Set up network")
agenda.task("Create Virtual Private Cloud")
agenda.task("Attach internet gateway")
agenda.task("Allocate subnet #1")
agenda.subtask("Hook in internet-enabled route table")
agenda.task("Allocate subnet #2")
agenda.task("Generate VPC key-pair")
# A sub-failure followed by the subtasks describing the recovery steps.
agenda.subfailure("Could not create key-pair")
agenda.subtask("Attempting to delete old key-pair")
agenda.subtask("Attempting to generate new key-pair")

agenda.section("Launch instances")
agenda.task("Launch instances in cluster #1")
agenda.task("Launch instances in cluster #2")
agenda.task("Wait for HQ to start running")
agenda.subtask("Still in 'pending' state")
agenda.subtask("Still in 'pending' state")
agenda.task("Wait for workers to reach 'running' state")
agenda.task("Wait for HQ to become pingable")
# Plain prints show how ordinary command output looks between agenda lines.
print("54.84.179.156 | UNREACHABLE!")
print("54.84.179.156 | UNREACHABLE!")
print('54.84.179.156 | SUCCESS => {"changed": false, "ping": "pong"}')
agenda.task("Wait for workers to become pingable")
print('10.0.1.237 | SUCCESS => {"changed": false, "ping": "pong"}')

agenda.section("Deploy application")
print("""\
PLAY [ansible-playbook]
Beispiel #10
0
# Create the output directory up front (no error if it already exists).
os.makedirs(args.outdir, exist_ok = True)

# Sweep every combination of negotiate setting, server address and mode.
for n in args.negotiate:
    for srv in args.server:
        # A server whose address does not contain 127.0.0.1 counts as remote.
        is_remote = '127.0.0.1' not in srv
        for m in args.mode:
            if m not in modes:
                agenda.failure(f"unknown mode {m}")
                # NOTE: `break` leaves only this innermost (mode) loop; the
                # sweep carries on with the next server.
                break
            if m != 'rel' and args.ghostunnel is None:
                agenda.failure("need ghostunnel arg for non-rel exp")
                break
            agenda.task(f"mode: {m}, negotiate {n}")
            if is_remote and m == 'rel-ux':
                agenda.subfailure("No remote for unix mode")
                continue
            # burrito_root is only passed along for modes containing 'fp'.
            start_localnamectl(srv, args.burrito_root if 'fp' in m else None)
            if m == 'rel-ux':
                start_server_unix(n)
                # Fixed pause; presumably lets the server finish starting.
                time.sleep(15)
                exp_unix(args, n)
            else:
                # ghostunnel is only passed for modes that do not contain 'rel'.
                start_server(
                    srv,
                    args.server_port,
                    args.ghostunnel if 'rel' not in m else None,
                    args.burrito_root if 'fp' in m else None,
                    n)
                agenda.task("waiting for server")
                time.sleep(15)
                # NOTE(review): nothing visible here runs an experiment after
                # the wait in this branch; the fragment may be cut off.
Beispiel #11
0
def do_exp(outdir, lb, shards, clients, shardtype, ops_per_sec, wrkload):
    """Run one sharded experiment and collect its output files.

    The run is skipped when the first client's ``.data`` file already exists
    locally.  Otherwise redis is started on ``lb``, ``start_shard`` is called
    for every machine in ``shards``, the load balancer is started on ``lb``,
    the store is primed from ``clients[0]`` and ``run_client`` is executed
    for every client in its own thread.  Afterwards the server processes are
    killed, ``~/burrito/*.config`` is removed everywhere and the log and data
    files are fetched; each client's ``.data1`` file is rewritten by awk into
    a ``.data`` file with experiment columns prepended.

    Args:
        outdir: directory (relative path) the result files are named under.
        lb: connection to the load-balancer machine.
        shards: connections to the shard machines.
        clients: connections to the client machines; ``clients[0]`` also
            runs the load phase.
        shardtype: label used in file names and passed to ``run_client``.
        ops_per_sec: total offered load, used to derive the inter-arrival
            time per client thread.
        wrkload: path of the workload file; its base name must end in
            ``-<number of client threads>``.

    Returns:
        True if the experiment was skipped or every client file was fetched
        and post-processed, False otherwise.
    """
    wrkname = wrkload.split("/")[-1].split(".")[0]
    num_shards = len(shards)
    server_prefix = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-lb"
    shard_prefix = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-shard"
    outf = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-client"

    agenda.task(f"checking {outf}0-{clients[0].addr}.data")
    if os.path.exists(f"{outf}0-{clients[0].addr}.data"):
        agenda.task(f"skipping: server = {lb.addr}, shardtype = {shardtype}, load = {ops_per_sec} ops/s")
        return True
    agenda.task(f"running: server = {lb.addr}, shardtype = {shardtype}, load = {ops_per_sec} ops/s")

    # load = (4 (client threads / proc) * 1 (procs/machine) * {len(machines) - 1} (machines))
    #        / {interarrival} (per client thread)
    num_client_threads = int(wrkname.split('-')[-1])
    interarrival_secs = num_client_threads * len(clients) / ops_per_sec
    interarrival_us = int(interarrival_secs * 1e6)

    redis_addr = start_redis(lb)
    time.sleep(5)
    server_addr = lb.addr
    agenda.task(f"starting: server = {server_addr}, shardtype = {shardtype}, load = {ops_per_sec}, ops/s -> interarrival_us = {interarrival_us}, num_clients = {len(clients)}")

    agenda.subtask("starting shards")
    for s in shards:
        start_shard(s, shard_prefix)
    time.sleep(5)

    agenda.subtask("starting lb")
    # The lb is given redis on loopback, reusing the port redis reported.
    redis_port = redis_addr.split(":")[-1]
    start_lb(lb, f"127.0.0.1:{redis_port}", [s.addr for s in shards], server_prefix)
    time.sleep(5)

    # prime the server with loads
    agenda.task("doing loads")
    run_loads(clients[0], server_addr, redis_addr, outf, wrkload)

    # every client machine runs the workload in its own thread
    agenda.task("starting clients")
    client_threads = [
        threading.Thread(
            target=run_client,
            args=(
                m,
                server_addr,
                redis_addr,
                interarrival_us,
                shardtype,
                outf,
                wrkload,
            ),
        ) for m in clients
    ]
    for t in client_threads:
        t.start()
    for t in client_threads:
        t.join()
    agenda.task("all clients returned")

    # kill the server
    lb.run("sudo pkill -9 burrito-lb")
    lb.run("sudo pkill -9 iokerneld")
    for s in shards:
        s.run("sudo pkill -9 single-shard")
        s.run("sudo pkill -9 iokerneld")

    lb.run("rm ~/burrito/*.config")
    for m in shards:
        m.run("rm ~/burrito/*.config")
    for m in clients:
        m.run("rm ~/burrito/*.config")

    agenda.task("get lb files")
    if not lb.local:
        lb.get(f"burrito/{server_prefix}.out", local=f"{server_prefix}.out", preserve_mode=False)
        lb.get(f"burrito/{server_prefix}.err", local=f"{server_prefix}.err", preserve_mode=False)
    agenda.task("get shard files")
    for s in shards:
        if not s.local:
            s.get(f"burrito/{shard_prefix}-{s.addr}.out", local=f"{shard_prefix}-{s.addr}.out", preserve_mode=False)
            s.get(f"burrito/{shard_prefix}-{s.addr}.err", local=f"{shard_prefix}-{s.addr}.err", preserve_mode=False)
            # NOTE: fetching the shard .trace file is currently disabled.

    def get_files(c, num):
        """Fetch the .err, .out and .data1 files of client ``c``."""
        fn = c.get
        if c.local:
            agenda.subtask(f"Use get_local: {c.host}")
            fn = get_local

        # NOTE: fetching the client .trace file is currently disabled.
        for ext in ("err", "out", "data1"):
            agenda.subtask(f"getting {outf}{num}-{c.addr}.{ext}")
            fn(
                f"burrito/{outf}{num}.{ext}",
                local=f"{outf}{num}-{c.addr}.{ext}",
                preserve_mode=False,
            )

    agenda.task("get client files")
    ok = True
    for c in clients:
        # Try every client so all missing files are reported, then give up.
        try:
            get_files(c, 0)
        except Exception as e:
            agenda.subfailure(f"At least one file missing for {c}: {e}")
            ok = False
    if not ok:
        return ok

    def awk_files(c, num):
        """Prepend ShardType/NumShards/Ops columns to client ``c``'s data."""
        subprocess.run(f"awk '{{if (!hdr) {{hdr=$1; print \"ShardType NumShards Ops \"$0;}} else {{print \"{shardtype} {num_shards} {ops_per_sec} \"$0}} }}' {outf}{num}-{c.addr}.data1 > {outf}{num}-{c.addr}.data", shell=True, check=True)

    for c in clients:
        agenda.subtask(f"adding experiment info for {c.addr}")
        try:
            awk_files(c, 0)
        except Exception as e:
            # Report the real cause (e.g. awk's exit status) and let Ctrl-C
            # through instead of swallowing it.
            agenda.subfailure(f"At least one file missing for {c.addr}: {e}")
            return False

    agenda.task("done")
    return True
Beispiel #12
0
def do_exp(iter_num,
           outdir=None,
           machines=None,
           num_shards=None,
           shardtype=None,
           ops_per_sec=None,
           client_batch=None,
           server_batch=None,
           poisson_arrivals=None,
           stack_frag=None,
           wrkload=None,
           overwrite=None):
    """Run one kvserver experiment iteration and collect its output files.

    ``machines[0]`` acts as the server (redis + kvserver); every other
    machine is a client, and ``machines[1]`` additionally runs the load
    phase.  The run is skipped when ``overwrite`` is false and the first
    client's ``.data`` file already exists locally.

    All keyword arguments are required even though they default to ``None``.

    Args:
        iter_num: iteration number, used in the output file names.
        outdir: directory (relative path) the result files are named under.
        machines: connections; server first, then the clients.
        num_shards: number of shards passed to ``start_server``.
        shardtype: label used in file names and passed to ``run_client``.
        ops_per_sec: total offered load, used to derive the inter-arrival
            time per client thread.
        client_batch: passed to ``run_client``.
        server_batch: passed to ``start_server``.
        poisson_arrivals: passed to ``run_client``.
        stack_frag: passed to ``start_server`` and ``run_client``.
        wrkload: path of the workload file; its base name must end in
            ``-<number of client threads>``.
        overwrite: when true, run even if results already exist.

    Returns:
        True, both when the experiment was skipped and when it ran.  Missing
        client files are reported but do not change the result.

    Raises:
        ValueError: if any required keyword argument is ``None``.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    required = {
        'outdir': outdir,
        'machines': machines,
        'num_shards': num_shards,
        'shardtype': shardtype,
        'ops_per_sec': ops_per_sec,
        'client_batch': client_batch,
        'server_batch': server_batch,
        'poisson_arrivals': poisson_arrivals,
        'stack_frag': stack_frag,
        'wrkload': wrkload,
        'overwrite': overwrite,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError(
            f"do_exp: missing required arguments: {', '.join(missing)}")

    wrkname = wrkload.split("/")[-1].split(".")[0]
    # NOTE(review): the server prefix spells "clientbatch=" while the client
    # prefix spells "client_batch="; kept as-is so file names do not change.
    server_prefix = f"{outdir}/{num_shards}-{shardtype}shard-{ops_per_sec}-poisson={poisson_arrivals}-clientbatch={client_batch}-server_batch={server_batch}-stackfrag={stack_frag}-{wrkname}-{iter_num}-kvserver"
    outf = f"{outdir}/{num_shards}-{shardtype}shard-{ops_per_sec}-poisson={poisson_arrivals}-client_batch={client_batch}-server_batch={server_batch}-stackfrag={stack_frag}-{wrkname}-{iter_num}-client"

    for m in machines:
        if m.local:
            # Never wipe the local output directory; just make sure it exists.
            m.run(f"mkdir -p {outdir}", wd="~/burrito")
            continue
        m.run(f"rm -rf {outdir}", wd="~/burrito")
        m.run(f"mkdir -p {outdir}", wd="~/burrito")

    if not overwrite and os.path.exists(f"{outf}0-{machines[1].addr}.data"):
        agenda.task(
            f"skipping: server = {machines[0].addr}, num_shards = {num_shards}, shardtype = {shardtype}, client_batch = {client_batch}, server_batch = {server_batch}, stack_fragmentation = {stack_frag}, load = {ops_per_sec} ops/s"
        )
        return True
    agenda.task(f"running: {outf}0-{machines[1].addr}.data")

    # load = (n (client threads / proc) * 1 (procs/machine) * {len(machines) - 1} (machines))
    #        / {interarrival} (per client thread)
    num_client_threads = int(wrkname.split('-')[-1])
    interarrival_secs = num_client_threads * len(machines[1:]) / ops_per_sec
    interarrival_us = int(interarrival_secs * 1e6)

    redis_addr = start_redis(machines[0])
    time.sleep(5)
    server_addr = machines[0].addr
    agenda.task(
        f"starting: server = {machines[0].addr}, num_shards = {num_shards}, shardtype = {shardtype}, client_batch = {client_batch}, server_batch = {server_batch}, load = {ops_per_sec} ops/s -> interarrival_us = {interarrival_us}, num_clients = {len(machines)-1}"
    )

    # first one is the server, start the server
    agenda.subtask("starting server")
    # The server is given redis on loopback, reusing the port redis reported.
    redis_port = redis_addr.split(":")[-1]
    start_server(machines[0],
                 f"127.0.0.1:{redis_port}",
                 server_prefix,
                 shards=num_shards,
                 ebpf=False,
                 server_batch=server_batch,
                 stack_frag=stack_frag)
    time.sleep(5)

    # prime the server with loads
    agenda.task("doing loads")
    run_loads(machines[1], server_addr, redis_addr, outf, wrkload)
    try:
        # NOTE(review): unlike the client fetches below, these remote paths
        # have no "burrito/" prefix - confirm where run_loads writes them.
        machines[1].get(f"{outf}-loads.out",
                        local=f"{outf}-loads.out",
                        preserve_mode=False)
        machines[1].get(f"{outf}-loads.err",
                        local=f"{outf}-loads.err",
                        preserve_mode=False)
    except Exception as e:
        agenda.subfailure(f"Could not get file from loads client: {e}")

    # others are clients
    agenda.task("starting clients")
    clients = [
        threading.Thread(
            target=run_client,
            args=(m, server_addr, redis_addr, interarrival_us,
                  poisson_arrivals, client_batch, shardtype, stack_frag, outf,
                  wrkload),
        ) for m in machines[1:]
    ]
    for t in clients:
        t.start()
    for t in clients:
        t.join()
    agenda.task("all clients returned")

    # kill the server
    machines[0].run("sudo pkill -9 kvserver-ebpf")
    machines[0].run("sudo pkill -9 kvserver-noebpf")
    machines[0].run("sudo pkill -INT iokerneld")

    for m in machines:
        m.run("rm ~/burrito/*.config")

    agenda.task("get server files")
    if not machines[0].local:
        machines[0].get(f"~/burrito/{server_prefix}.out",
                        local=f"{server_prefix}.out",
                        preserve_mode=False)
        machines[0].get(f"~/burrito/{server_prefix}.err",
                        local=f"{server_prefix}.err",
                        preserve_mode=False)

    def get_files(c, num):
        """Fetch the .err, .out, .data and .trace files of client ``c``."""
        fn = c.get
        if c.local:
            agenda.subtask(f"Use get_local: {c.host}")
            fn = get_local

        for ext in ("err", "out", "data", "trace"):
            agenda.subtask(f"getting {outf}{num}-{c.addr}.{ext}")
            fn(
                f"burrito/{outf}{num}.{ext}",
                local=f"{outf}{num}-{c.addr}.{ext}",
                preserve_mode=False,
            )

    agenda.task("get client files")
    for c in machines[1:]:
        # Best effort: a missing file is reported but does not fail the run.
        try:
            get_files(c, 0)
        except Exception as e:
            agenda.subfailure(f"At least one file missing for {c}: {e}")

    agenda.task("done")
    return True
Beispiel #13
0
            )
            sys.exit(1)

    for t in cfg['exp']['shardtype']:
        if t not in ['client', 'server',
                     'basicclient']:  # basicclient is a subset of server
            agenda.failure(f"Unknown shardtype {t}")
            sys.exit(1)

    agenda.task(f"Checking for connection vs experiment ip")
    ips = [cfg['machines']['server']] + cfg['machines']['clients']
    agenda.task(f"connecting to {ips}")
    machines, commits = zip(*[check_machine(ip) for ip in ips])
    # check all the commits are equal
    if not all(c == commits[0] for c in commits):
        agenda.subfailure(f"not all commits equal: {commits}")
        sys.exit(1)

    for m in machines:
        if m.host in ['127.0.0.1', '::1', 'localhost']:
            agenda.subtask(f"Local conn: {m.host}/{m.addr}")
            m.local = True
        else:
            m.local = False

    # build
    agenda.task("building burrito...")
    thread_ok = True
    setups = [
        threading.Thread(target=setup_machine, args=(m, outdir))
        for m in machines
Beispiel #14
0
def main(argv=None):
    """The main entry-point to salvo.

    Parses the command line, creates a VPC with one subnet and security
    group per cluster (an extra ``hq`` cluster exposing port 22 is always
    placed first), launches the instances, waits until they are running and
    pingable, runs the deployment and finally deletes every resource that
    was created.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        The value returned by ``Deployer.deploy()``, or 1 if anything failed
        before the deployment finished.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description='Provision a new salvo.')
    parser.add_argument('config',
                        type=argparse.FileType('r'),
                        help='salvo configuration file to run')
    parser.add_argument('--playbook',
                        '-p',
                        type=argparse.FileType('r'),
                        default='./deploy/playbook.yml',
                        help='directory where playbooks reside')
    parser.add_argument('--wait',
                        '-w',
                        default=False,
                        action='store_true',
                        help='wait for [Enter] before cleaning up')
    parser.add_argument('--deployment',
                        '-d',
                        type=str,
                        default='salvo',
                        help='deployment name for this salvo')
    parser.add_argument('--set',
                        '-s',
                        nargs='*',
                        type=str,
                        help='key:value pair to set for this salvo execution')
    parser.add_argument('--dry-run',
                        '-n',
                        action='store_true',
                        default=False,
                        help='only print what actions would be taken')
    args = parser.parse_args(argv)

    # Turn ["key:value", ...] into a dict, splitting on the first colon only.
    args.set = dict(item.split(":", maxsplit=1)
                    for item in args.set) if args.set is not None else {}
    topology = Topology.load_file(args.config, args.set)

    # The HQ cluster always comes first, so clusters[0][0] below is HQ.
    hq_cluster = Cluster('hq', {
        'expose': [22],
    }, {})
    topology.clusters = [hq_cluster] + topology.clusters

    agenda.section("Set up network")

    client = boto3.client('ec2')
    ec2 = boto3.resource('ec2')

    # Set up VPC
    agenda.task("Create VPC")
    vpc = client.create_vpc(DryRun=args.dry_run, CidrBlock='10.0.0.0/16')
    vpc = ec2.Vpc(vpc['Vpc']['VpcId'])

    agenda.task("Attach VPC internet gateway")
    gateway = client.create_internet_gateway(DryRun=args.dry_run)
    gateway = ec2.InternetGateway(
        gateway['InternetGateway']['InternetGatewayId'])
    gateway.attach_to_vpc(DryRun=args.dry_run, VpcId=vpc.id)

    agenda.task("Create internet-enabled route table")
    iroutable = vpc.create_route_table(DryRun=args.dry_run)
    iroutable.create_route(DryRun=args.dry_run,
                           DestinationCidrBlock='0.0.0.0/0',
                           GatewayId=gateway.id)

    subnets = []
    secs = []
    for i, c in enumerate(topology.clusters):
        agenda.task("Allocate subnet #{}".format(i + 1))
        subnet = vpc.create_subnet(DryRun=args.dry_run,
                                   CidrBlock='10.0.{}.0/24'.format(i))

        if c.internet:
            agenda.subtask("Hook in internet-enable route table")
            iroutable.associate_with_subnet(DryRun=args.dry_run,
                                            SubnetId=subnet.id)

        # set up security groups
        agenda.subtask("Create network security group")
        sec = vpc.create_security_group(
            DryRun=args.dry_run,
            GroupName='{}-cluster-{}'.format(args.deployment, i + 1),
            Description='Ingress rules for cluster {}-{}'.format(
                args.deployment, c.name))
        # allow all internal traffic
        sec.authorize_ingress(DryRun=args.dry_run,
                              IpProtocol='tcp',
                              FromPort=1,
                              ToPort=65535,
                              CidrIp='10.0.0.0/16')

        if c.expose is not False:
            for p in c.expose:
                agenda.subtask("Allow ingress traffic on port {}".format(p))
                sec.authorize_ingress(DryRun=args.dry_run,
                                      IpProtocol='tcp',
                                      FromPort=p,
                                      ToPort=p,
                                      CidrIp='0.0.0.0/0')

        secs.append(sec)
        subnets.append(subnet)

    # Tag all our VPC resources
    agenda.task("Tag all VPC resources")
    ec2.create_tags(DryRun=args.dry_run,
                    Resources=[
                        vpc.id,
                        gateway.id,
                        iroutable.id,
                    ] + [sn.id for sn in subnets] + [sg.id for sg in secs],
                    Tags=[{
                        'Key': 'salvo',
                        'Value': args.deployment,
                    }])

    # Create access keys
    agenda.task("Generate VPC key pair")
    try:
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)
    except botocore.exceptions.ClientError:
        # Key probably already exists. Delete and re-create.
        agenda.subfailure("Could not create key pair")
        agenda.subtask("Attempting to delete old key pair")
        client.delete_key_pair(DryRun=args.dry_run, KeyName=args.deployment)
        agenda.subtask("Attempting to generate new key pair")
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)

    keymat = keys['KeyMaterial']
    keys = ec2.KeyPair(keys['KeyName'])

    agenda.section("Launch instances")

    # Launch instances
    clusters = []
    for i, c in enumerate(topology.clusters):
        nics = [{
            "DeviceIndex": 0,
            "Groups": [secs[i].id],
            "SubnetId": subnets[i].id,
            "DeleteOnTermination": True,
            "AssociatePublicIpAddress": c.internet,
        }]

        agenda.task("Launching {} instances in cluster {}".format(
            c.attrs['count'], c.name))
        launched = client.run_instances(
            DryRun=args.dry_run,
            KeyName=keys.name,
            NetworkInterfaces=nics,
            ImageId=c.attrs['image'],
            MinCount=c.attrs['count'],
            MaxCount=c.attrs['count'],
            InstanceType=c.attrs['itype'],
            InstanceInitiatedShutdownBehavior='terminate')
        clusters.append([
            ec2.Instance(instance['InstanceId'])
            for instance in launched['Instances']
        ])

    # Stays 1 unless the deployment itself runs and reports its own status.
    exit_code = 1
    try:
        agenda.task("Wait for HQ to start running")

        hq = clusters[0][0]
        while hq.state['Name'] == 'pending':
            agenda.subtask("Still in 'pending' state")
            sleep(3)
            hq.load()

        if hq.state['Name'] != 'running':
            agenda.failure(hq.state_reason['Message'])
            raise ChildProcessError(hq.state_reason['Message'])

        def prepare(ci, instance):
            # `hq` is read from the enclosing scope: it is a local of main(),
            # not a module global, so it must not be declared `global`.
            # NOTE(review): if Pool is a process pool, a nested function
            # cannot be pickled for it - confirm which Pool is imported.
            print("instance {} in {} now available through {}".format(
                instance.private_ip_address, topology.clusters[ci].name,
                hq.public_ip_address))

        agenda.task("Wait for workers to reach 'running' state")

        done = []
        p = Pool(5)
        pending = True
        while pending:
            pending = False
            for i, cluster in enumerate(clusters):
                for ii, instance in enumerate(cluster):
                    if instance.state['Name'] == 'pending':
                        agenda.subtask(
                            "Instance {}.{} is still pending".format(
                                i + 1, ii + 1))

                        # Refresh this instance and restart the scan after
                        # the sleep below.
                        pending = True
                        instance.load()
                        break
                    elif instance.state['Name'] != 'running':
                        agenda.subfailure("Instance {}.{} failed: {}".format(
                            i + 1, ii + 1, instance.state_reason['Message']))
                        raise ChildProcessError(
                            instance.state_reason['Message'])
                    else:
                        # State is now 'running'
                        tag = (i, ii)
                        if tag not in done:
                            # State hasn't been 'running' before
                            done.append(tag)
                            p.apply_async(prepare, [i, instance])
                if pending:
                    break
            sleep(3)
        p.close()
        p.join()

        agenda.task("Wait for HQ to become pingable")

        # Wait for hq to be pingable
        deployment = Deployer(args.playbook.name, topology, keymat, clusters)
        while not deployment.test(hq.public_ip_address):
            sleep(1)

        agenda.task("Wait for workers to become pingable")

        # Wait for workers to be pingable
        for i, cluster in enumerate(clusters):
            for ii, instance in enumerate(cluster):
                while not deployment.test(instance.private_ip_address):
                    sleep(1)

        # Deploy!
        agenda.section("Deploy application")
        exit_code = deployment.deploy()
    except BaseException:
        # Deliberately catches everything (including Ctrl-C), as the original
        # bare `except:` did: the error is printed, the cleanup below runs
        # and the function returns the failure code instead of raising.
        import traceback
        traceback.print_exc()
    finally:
        agenda.section("Clean up VPC")

        if args.wait:
            agenda.prompt("Press [Enter] when you are ready to clean")
            input()

        # Terminate instances and delete VPC resources
        agenda.task("Terminate all instances")
        instances = list(vpc.instances.all())
        vpc.instances.terminate(DryRun=args.dry_run)
        still_running = True
        while still_running:
            still_running = False
            for i in instances:
                i.load()
                if i.state['Name'] != 'terminated':
                    agenda.subtask("At least one instance still shutting down")
                    still_running = True
                    sleep(3)
                    break

        agenda.task("Delete network resources")
        agenda.subtask("key pair")
        keys.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route associations")
        for r in iroutable.associations.all():
            r.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route table")
        iroutable.delete(DryRun=args.dry_run)
        agenda.subtask("internet gateway")
        gateway.detach_from_vpc(DryRun=args.dry_run, VpcId=vpc.id)
        gateway.delete(DryRun=args.dry_run)
        agenda.subtask("subnets")
        try:
            for sn in subnets:
                sn.delete(DryRun=args.dry_run)
        except Exception:
            # Best effort: report the failure and keep deleting the rest.
            agenda.subfailure("failed to delete subnet:")
            import traceback
            traceback.print_exc()
        agenda.subtask("security groups")
        for sg in secs:
            # Honour --dry-run like every other cleanup call.
            sg.delete(DryRun=args.dry_run)
        agenda.subtask("network interfaces")
        for i in vpc.network_interfaces.all():
            i.delete(DryRun=args.dry_run)

        agenda.task("Delete the VPC")
        vpc.delete(DryRun=args.dry_run)

    return exit_code