def fatal_warn(msg, exit=True):
    """Report ``msg`` as sub-failures, one per line, framed by blank lines.

    Args:
        msg: Text to report. It is split on newline characters and each
            piece is passed to ``agenda.subfailure`` on its own.
        exit: When truthy (the default), terminate the process with
            ``sys.exit(1)`` after the message has been reported.
    """
    print()
    for line in msg.split("\n"):
        agenda.subfailure(line)
    print()

    if not exit:
        return
    sys.exit(1)
def expect(res, msg):
    """Report a failed command result and hand the result back unchanged.

    Args:
        res: Command result exposing ``exited``, ``command``, ``stdout`` and
            ``stderr``. ``None`` (or any object without ``exited``) is
            passed through without a report.
        msg: Failure description handed to ``agenda.subfailure``.

    Returns:
        ``res`` itself, so the call can wrap a ``run(...)`` expression.
    """
    # Do not truth-test ``res``: result objects such as invoke's ``Result``
    # evaluate as False exactly when the command failed, which would skip the
    # report in the one case it is meant for. Look at the exit code instead.
    exit_code = getattr(res, "exited", None) if res is not None else None
    if exit_code:
        agenda.subfailure(msg)
        print("exit code: {}\ncommand: {}\nstdout: {}\nstderr: {}".format(
            res.exited, res.command, res.stdout, res.stderr
        ))
    return res
def check_file(self, grep, where):
    """Exit the program unless the string ``grep`` occurs in file ``where``.

    Runs ``grep "<grep>" <where>`` through ``self.run``. A zero exit status
    means the string was found and the method simply returns. Otherwise the
    failure is reported, the output of ``tail <where>`` is printed when that
    command succeeds, and the process exits with status 1.

    Args:
        grep: Search string handed to ``grep``.
        where: Path of the file to search.
    """
    search = self.run(f"grep \"{grep}\" {where}")
    if search.exited == 0:
        return

    agenda.subfailure(
        f"Unable to find search string (\"{grep}\") in process output file {where}"
    )
    # Show the end of the file to help diagnose why the string is missing.
    tail = self.run(f'tail {where}')
    if tail.exited == 0:
        print(tail.command)
        print(tail.stdout)
    sys.exit(1)
def check_proc(self, proc_name, proc_out):
    """Exit the program unless a process named ``proc_name`` is running.

    Runs ``pgrep <proc_name>`` through ``self.run``. A zero exit status means
    the process was found and the method returns. Otherwise the failure is
    reported (mentioning ``self.addr``), the tail of ``proc_out`` is printed
    (or the raw result when ``tail`` itself fails), and the process exits
    with status 1.

    Args:
        proc_name: Process name handed to ``pgrep``.
        proc_out: Path of the process's output file, shown on failure.
    """
    probe = self.run(f"pgrep {proc_name}")
    if probe.exited == 0:
        return

    agenda.subfailure(
        f'failed to find running process with name \"{proc_name}\" on {self.addr}'
    )
    tail = self.run(f'tail {proc_out}')
    if tail.exited != 0:
        print(tail)
    else:
        print(tail.command)
        print(tail.stdout)
    sys.exit(1)
def check_existing_experiment(driver):
    """Open the first experiment listed on the CloudLab dashboard, if any.

    Args:
        driver: Browser driver exposing ``get``, ``find_element_by_id`` and
            ``find_element_by_link_text``.

    Returns:
        Whatever ``get_machines_from_experiment(driver)`` returns for the
        first listed experiment, or ``None`` when no experiment is listed.
    """
    agenda.task("Check for existing experiment")
    driver.get("https://www.cloudlab.us/user-dashboard.php#experiments")

    try:
        table = driver.find_element_by_id("experiments_table")
    except Exception:
        # Best effort: a failed lookup means there is nothing to reuse.
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        agenda.subfailure("No existing experiment found")
        return None

    # First whitespace-separated word of every non-empty table body.
    names = []
    for body in table.find_elements_by_xpath("//table/tbody"):
        words = body.text.split()
        if words:
            names.append(words[0])

    if not names:
        # The table exists but lists nothing; indexing names[0] would raise.
        agenda.subfailure("No existing experiment found")
        return None

    agenda.subtask("Existing experiment found")
    driver.find_element_by_link_text(names[0]).click()
    time.sleep(6)  # give the experiment page time to load
    return get_machines_from_experiment(driver)
def run_loads(conn, server, redis_addr, outf, wrkfile):
    """Run the ycsb loads-only client on ``conn``, retrying until it succeeds.

    Each attempt starts ``iokerneld`` in the background, runs the ycsb client
    with a 30 second timeout, and then signals ``iokerneld`` to stop again,
    whether or not the client succeeded. The function returns only after an
    attempt exits with status 0.

    Args:
        conn: Connection whose ``run`` executes commands on the client.
        server: Server address; the client connects to port 4242 on it.
        redis_addr: Value passed to ``--redis-addr``.
        outf: Output prefix; stdout/stderr go to ``{outf}-loads.out`` and
            ``{outf}-loads.err``.
        wrkfile: Workload file passed to ``--accesses``.
    """
    conn.run("sudo pkill -INT iokerneld")
    write_shenango_config(conn)

    while True:
        conn.run("./iokerneld", wd="~/burrito/shenango-chunnel/caladan",
                 sudo=True, background=True)
        time.sleep(2)
        loads_start = time.time()
        agenda.subtask("loads client starting")

        ok = None
        try:
            ok = conn.run(
                f"RUST_LOG=info,ycsb=debug,bertha=debug ./target/release/ycsb \
                        --addr {server}:4242 \
                        --redis-addr={redis_addr} \
                        -i 1000 \
                        --accesses {wrkfile} \
                        -s host.config \
                        --logging --tracing --loads-only",
                wd="~/burrito",
                stdout=f"{outf}-loads.out",
                stderr=f"{outf}-loads.err",
                timeout=30,
            )
        except Exception as e:
            # Catch Exception rather than everything so that Ctrl-C can
            # still break out of the retry loop, and say why the run failed.
            agenda.subfailure(f"loads client raised: {e!r}")
        finally:
            conn.run("sudo pkill -INT iokerneld")

        if ok is not None and ok.exited == 0:
            agenda.subtask(f"loads client done: {time.time() - loads_start} s")
            return

        # Logged once per failed attempt, for exceptions and bad exit codes.
        agenda.subfailure(
            f"loads failed, retrying after {time.time() - loads_start} s")
def check(ok, msg, addr, allowed=None):
    """Abort unless the command result ``ok`` has an acceptable exit code.

    Exit code 0 is always accepted; ``allowed`` lists additional acceptable
    codes. On any other code the failure, stdout and stderr are reported,
    the module-level ``thread_ok`` flag is set to ``False``, and
    ``sys.exit(1)`` is called.

    Args:
        ok: Command result exposing ``exited``, ``stdout`` and ``stderr``.
        msg: Description of the step, used in the failure message.
        addr: Address of the machine, used in the failure message.
        allowed: Extra exit codes to accept. Defaults to none.
    """
    # Without this declaration the assignment below only binds a local that
    # is never read. ``sys.exit`` raised in a worker thread ends just that
    # thread, so the flag is presumably what lets the spawning code notice.
    global thread_ok

    # A fresh list per call instead of a shared mutable default.
    allowed = [] if allowed is None else allowed

    # exit code 0 is always ok, allowed is in addition
    if ok.exited == 0 or ok.exited in allowed:
        return

    agenda.subfailure(f"{msg} on {addr}: {ok.exited} not in {allowed}")
    agenda.subfailure("stdout")
    print(ok.stdout)
    agenda.subfailure("stderr")
    print(ok.stderr)
    thread_ok = False  # something went wrong.
    sys.exit(1)
def get_asn(ip):
    """Look up ``ip`` with ``ip2asn_bgp``, reporting instead of raising.

    Args:
        ip: Address handed to ``ip2asn_bgp``.

    Returns:
        The value returned by ``ip2asn_bgp(ip)``, or ``None`` when the lookup
        raised an ``Exception`` (which is reported as a sub-failure).
    """
    asn = None
    try:
        asn = ip2asn_bgp(ip)
    except Exception as err:
        agenda.subfailure(f"ip {ip}: error {err}")
    return asn
#!/usr/bin/env python3 import agenda agenda.section("Set up network") agenda.task("Create Virtual Private Cloud") agenda.task("Attach internet gateway") agenda.task("Allocate subnet #1") agenda.subtask("Hook in internet-enabled route table") agenda.task("Allocate subnet #2") agenda.task("Generate VPC key-pair") agenda.subfailure("Could not create key-pair") agenda.subtask("Attempting to delete old key-pair") agenda.subtask("Attempting to generate new key-pair") agenda.section("Launch instances") agenda.task("Launch instances in cluster #1") agenda.task("Launch instances in cluster #2") agenda.task("Wait for HQ to start running") agenda.subtask("Still in 'pending' state") agenda.subtask("Still in 'pending' state") agenda.task("Wait for workers to reach 'running' state") agenda.task("Wait for HQ to become pingable") print("54.84.179.156 | UNREACHABLE!") print("54.84.179.156 | UNREACHABLE!") print('54.84.179.156 | SUCCESS => {"changed": false, "ping": "pong"}') agenda.task("Wait for workers to become pingable") print('10.0.1.237 | SUCCESS => {"changed": false, "ping": "pong"}') agenda.section("Deploy application") print("""\ PLAY [ansible-playbook]
os.makedirs(args.outdir, exist_ok = True) for n in args.negotiate: for srv in args.server: is_remote = '127.0.0.1' not in srv for m in args.mode: if m not in modes: agenda.failure(f"unknown mode {m}") break if m != 'rel' and args.ghostunnel is None: agenda.failure("need ghostunnel arg for non-rel exp") break agenda.task(f"mode: {m}, negotiate {n}") if is_remote and m == 'rel-ux': agenda.subfailure("No remote for unix mode") continue start_localnamectl(srv, args.burrito_root if 'fp' in m else None) if m == 'rel-ux': start_server_unix(n) time.sleep(15) exp_unix(args, n) else: start_server( srv, args.server_port, args.ghostunnel if 'rel' not in m else None, args.burrito_root if 'fp' in m else None, n) agenda.task("waiting for server") time.sleep(15)
def do_exp(outdir, lb, shards, clients, shardtype, ops_per_sec, wrkload):
    """Run one load-balanced sharding experiment and collect its output.

    Starts redis and the load balancer on ``lb`` and one shard per entry of
    ``shards``, primes the store through the first client, runs every client
    concurrently, kills the server processes, copies the output files back,
    and prefixes every client data file with experiment columns.

    Args:
        outdir: Directory (relative path) that receives all output files.
        lb: Machine running redis and the load balancer.
        shards: Machines running one shard each.
        clients: Client machines; the first one also runs the loads phase.
        shardtype: Sharding mode, used in file names and passed to clients.
        ops_per_sec: Target aggregate load in operations per second.
        wrkload: Path of the workload file. The last ``-``-separated field
            of its base name must be the number of client threads.

    Returns:
        ``True`` when the experiment was skipped (its first client data file
        already exists) or finished completely; ``False`` when fetching or
        post-processing a client file failed.
    """
    wrkname = wrkload.split("/")[-1].split(".")[0]
    num_shards = len(shards)
    server_prefix = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-lb"
    shard_prefix = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-shard"
    outf = f"{outdir}/{shardtype}shard-{ops_per_sec}-{wrkname}-client"

    agenda.task(f"checking {outf}0-{clients[0].addr}.data")
    if os.path.exists(f"{outf}0-{clients[0].addr}.data"):
        agenda.task(f"skipping: server = {lb.addr}, shardtype = {shardtype}, load = {ops_per_sec} ops/s")
        return True
    agenda.task(f"running: server = {lb.addr}, shardtype = {shardtype}, load = {ops_per_sec} ops/s")

    # load = (client threads / proc) * 1 (procs/machine) * (client machines)
    #        / interarrival (per client thread)
    num_client_threads = int(wrkname.split('-')[-1])
    interarrival_secs = num_client_threads * len(clients) / ops_per_sec
    interarrival_us = int(interarrival_secs * 1e6)

    redis_addr = start_redis(lb)
    time.sleep(5)
    server_addr = lb.addr
    agenda.task(f"starting: server = {server_addr}, shardtype = {shardtype}, load = {ops_per_sec}, ops/s -> interarrival_us = {interarrival_us}, num_clients = {len(clients)}")

    agenda.subtask("starting shards")
    for s in shards:
        start_shard(s, shard_prefix)
    time.sleep(5)

    agenda.subtask("starting lb")
    redis_port = redis_addr.split(":")[-1]
    start_lb(lb, f"127.0.0.1:{redis_port}", [s.addr for s in shards], server_prefix)
    time.sleep(5)

    # prime the server with loads
    agenda.task("doing loads")
    run_loads(clients[0], server_addr, redis_addr, outf, wrkload)

    # every client machine (including the loads one) now runs the workload
    agenda.task("starting clients")
    client_threads = [
        threading.Thread(
            target=run_client,
            args=(m, server_addr, redis_addr, interarrival_us, shardtype, outf, wrkload),
        )
        for m in clients
    ]
    for t in client_threads:
        t.start()
    for t in client_threads:
        t.join()
    agenda.task("all clients returned")

    # kill the server
    lb.run("sudo pkill -9 burrito-lb")
    lb.run("sudo pkill -9 iokerneld")
    for s in shards:
        s.run("sudo pkill -9 single-shard")
        s.run("sudo pkill -9 iokerneld")

    lb.run("rm ~/burrito/*.config")
    for m in shards:
        m.run("rm ~/burrito/*.config")
    for m in clients:
        m.run("rm ~/burrito/*.config")

    agenda.task("get lb files")
    if not lb.local:
        lb.get(f"burrito/{server_prefix}.out", local=f"{server_prefix}.out", preserve_mode=False)
        lb.get(f"burrito/{server_prefix}.err", local=f"{server_prefix}.err", preserve_mode=False)

    agenda.task("get shard files")
    for s in shards:
        if not s.local:
            s.get(f"burrito/{shard_prefix}-{s.addr}.out", local=f"{shard_prefix}-{s.addr}.out", preserve_mode=False)
            s.get(f"burrito/{shard_prefix}-{s.addr}.err", local=f"{shard_prefix}-{s.addr}.err", preserve_mode=False)
            # shard .trace files are intentionally not fetched

    def get_files(client, num):
        # Copy one client's err/out/data1 files into the local output dir.
        # The client is an explicit argument rather than a captured loop
        # variable, so the helper does not depend on where it is called.
        fn = client.get
        if client.local:
            agenda.subtask(f"Use get_local: {client.host}")
            fn = get_local
        # client .trace files are intentionally not fetched
        for ext in ("err", "out", "data1"):
            agenda.subtask(f"getting {outf}{num}-{client.addr}.{ext}")
            fn(
                f"burrito/{outf}{num}.{ext}",
                local=f"{outf}{num}-{client.addr}.{ext}",
                preserve_mode=False,
            )

    def awk_files(client, num):
        # Prefix the header and every row of the raw .data1 file with the
        # ShardType / NumShards / Ops columns, writing the final .data file.
        subprocess.run(
            f"awk '{{if (!hdr) {{hdr=$1; print \"ShardType NumShards Ops \"$0;}} else {{print \"{shardtype} {num_shards} {ops_per_sec} \"$0}} }}' {outf}{num}-{client.addr}.data1 > {outf}{num}-{client.addr}.data",
            shell=True,
            check=True,
        )

    agenda.task("get client files")
    ok = True
    for c in clients:
        try:
            get_files(c, 0)
        except Exception as e:
            # keep going so every missing file is reported before giving up
            agenda.subfailure(f"At least one file missing for {c}: {e}")
            ok = False
    if not ok:
        return ok

    for c in clients:
        agenda.subtask(f"adding experiment info for {c.addr}")
        try:
            awk_files(c, 0)
        except Exception as e:
            # Report which client failed and why; Ctrl-C is not swallowed.
            agenda.subfailure(f"At least one file missing for {c}: {e}")
            return False

    agenda.task("done")
    return True
def do_exp(iter_num, outdir=None, machines=None, num_shards=None, shardtype=None, ops_per_sec=None, client_batch=None, server_batch=None, poisson_arrivals=None, stack_frag=None, wrkload=None, overwrite=None):
    """Run one kvserver experiment iteration and collect its output files.

    ``machines[0]`` is the server (it also hosts redis); ``machines[1:]`` are
    the clients, the first of which additionally runs the loads phase.

    Args:
        iter_num: Iteration number, used in output file names.
        outdir: Output directory, created on every machine. On non-local
            machines it is removed first.
        machines: Server followed by client machines.
        num_shards: Number of shards the server is started with.
        shardtype: Sharding mode, used in file names and passed to clients.
        ops_per_sec: Target aggregate load in operations per second.
        client_batch: Client batching setting passed to the clients.
        server_batch: Server batching setting passed to the server.
        poisson_arrivals: Arrival-process setting passed to the clients.
        stack_frag: Stack-fragmentation setting for server and clients.
        wrkload: Path of the workload file. The last ``-``-separated field
            of its base name must be the number of client threads.
        overwrite: When falsy, an existing first-client data file causes the
            iteration to be skipped.

    Returns:
        ``True``, both when the iteration was skipped and when it ran.

    Raises:
        AssertionError: If any keyword argument was left as ``None``.
    """
    # All keyword arguments are required; the defaults exist only so callers
    # can pass them by name. Raise AssertionError explicitly (the type the
    # former ``assert`` produced) so the check survives ``python -O`` and
    # names the missing arguments.
    required = {
        "outdir": outdir,
        "machines": machines,
        "num_shards": num_shards,
        "shardtype": shardtype,
        "ops_per_sec": ops_per_sec,
        "client_batch": client_batch,
        "server_batch": server_batch,
        "poisson_arrivals": poisson_arrivals,
        "stack_frag": stack_frag,
        "wrkload": wrkload,
        "overwrite": overwrite,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise AssertionError(f"do_exp is missing required arguments: {', '.join(missing)}")

    wrkname = wrkload.split("/")[-1].split(".")[0]
    server_prefix = f"{outdir}/{num_shards}-{shardtype}shard-{ops_per_sec}-poisson={poisson_arrivals}-clientbatch={client_batch}-server_batch={server_batch}-stackfrag={stack_frag}-{wrkname}-{iter_num}-kvserver"
    outf = f"{outdir}/{num_shards}-{shardtype}shard-{ops_per_sec}-poisson={poisson_arrivals}-client_batch={client_batch}-server_batch={server_batch}-stackfrag={stack_frag}-{wrkname}-{iter_num}-client"

    for m in machines:
        if m.local:
            # never wipe the local output directory: results accumulate here
            m.run(f"mkdir -p {outdir}", wd="~/burrito")
            continue
        m.run(f"rm -rf {outdir}", wd="~/burrito")
        m.run(f"mkdir -p {outdir}", wd="~/burrito")

    if not overwrite and os.path.exists(f"{outf}0-{machines[1].addr}.data"):
        agenda.task(
            f"skipping: server = {machines[0].addr}, num_shards = {num_shards}, shardtype = {shardtype}, client_batch = {client_batch}, server_batch = {server_batch}, stack_fragmentation = {stack_frag}, load = {ops_per_sec} ops/s"
        )
        return True
    agenda.task(f"running: {outf}0-{machines[1].addr}.data")

    # load = n (client threads / proc) * 1 (procs/machine) * (client machines)
    #        / interarrival (per client thread)
    num_client_threads = int(wrkname.split('-')[-1])
    interarrival_secs = num_client_threads * len(machines[1:]) / ops_per_sec
    interarrival_us = int(interarrival_secs * 1e6)

    redis_addr = start_redis(machines[0])
    time.sleep(5)
    server_addr = machines[0].addr
    agenda.task(
        f"starting: server = {machines[0].addr}, num_shards = {num_shards}, shardtype = {shardtype}, client_batch = {client_batch}, server_batch = {server_batch}, load = {ops_per_sec} ops/s -> interarrival_us = {interarrival_us}, num_clients = {len(machines)-1}"
    )

    # first one is the server, start the server
    agenda.subtask("starting server")
    redis_port = redis_addr.split(":")[-1]
    start_server(machines[0], f"127.0.0.1:{redis_port}", server_prefix, shards=num_shards, ebpf=False, server_batch=server_batch, stack_frag=stack_frag)
    time.sleep(5)

    # prime the server with loads
    agenda.task("doing loads")
    run_loads(machines[1], server_addr, redis_addr, outf, wrkload)
    try:
        machines[1].get(f"{outf}-loads.out", local=f"{outf}-loads.out", preserve_mode=False)
        machines[1].get(f"{outf}-loads.err", local=f"{outf}-loads.err", preserve_mode=False)
    except Exception as e:
        # the loads logs are only diagnostics; carry on without them
        agenda.subfailure(f"Could not get file from loads client: {e}")

    # others are clients
    agenda.task("starting clients")
    clients = [
        threading.Thread(
            target=run_client,
            args=(m, server_addr, redis_addr, interarrival_us, poisson_arrivals, client_batch, shardtype, stack_frag, outf, wrkload),
        )
        for m in machines[1:]
    ]
    for t in clients:
        t.start()
    for t in clients:
        t.join()
    agenda.task("all clients returned")

    # kill the server
    machines[0].run("sudo pkill -9 kvserver-ebpf")
    machines[0].run("sudo pkill -9 kvserver-noebpf")
    machines[0].run("sudo pkill -INT iokerneld")

    for m in machines:
        m.run("rm ~/burrito/*.config")

    agenda.task("get server files")
    if not machines[0].local:
        machines[0].get(f"~/burrito/{server_prefix}.out", local=f"{server_prefix}.out", preserve_mode=False)
        machines[0].get(f"~/burrito/{server_prefix}.err", local=f"{server_prefix}.err", preserve_mode=False)

    def get_files(client, num):
        # Copy one client's err/out/data/trace files into the local output
        # dir. The client is an explicit argument rather than a captured
        # loop variable.
        fn = client.get
        if client.local:
            agenda.subtask(f"Use get_local: {client.host}")
            fn = get_local
        for ext in ("err", "out", "data", "trace"):
            agenda.subtask(f"getting {outf}{num}-{client.addr}.{ext}")
            fn(
                f"burrito/{outf}{num}.{ext}",
                local=f"{outf}{num}-{client.addr}.{ext}",
                preserve_mode=False,
            )

    agenda.task("get client files")
    for c in machines[1:]:
        try:
            get_files(c, 0)
        except Exception as e:
            agenda.subfailure(f"At least one file missing for {c}: {e}")

    agenda.task("done")
    return True
) sys.exit(1) for t in cfg['exp']['shardtype']: if t not in ['client', 'server', 'basicclient']: # basicclient is a subset of server agenda.failure(f"Unknown shardtype {t}") sys.exit(1) agenda.task(f"Checking for connection vs experiment ip") ips = [cfg['machines']['server']] + cfg['machines']['clients'] agenda.task(f"connecting to {ips}") machines, commits = zip(*[check_machine(ip) for ip in ips]) # check all the commits are equal if not all(c == commits[0] for c in commits): agenda.subfailure(f"not all commits equal: {commits}") sys.exit(1) for m in machines: if m.host in ['127.0.0.1', '::1', 'localhost']: agenda.subtask(f"Local conn: {m.host}/{m.addr}") m.local = True else: m.local = False # build agenda.task("building burrito...") thread_ok = True setups = [ threading.Thread(target=setup_machine, args=(m, outdir)) for m in machines
def main(argv=None):
    """The main entry-point to salvo.

    Parses the command line, builds a VPC (gateway, route table, one subnet
    and security group per cluster, key pair), launches the instances of
    every cluster plus a single ``hq`` cluster, waits for them to become
    reachable, runs the deployment, and finally tears every created
    resource down again.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        The value returned by ``deployment.deploy()``, or ``1`` when
        anything between launching and deploying raised.
    """
    import traceback

    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description='Provision a new salvo.')
    parser.add_argument('config',
                        type=argparse.FileType('r'),
                        help='salvo configuration file to run')
    parser.add_argument('--playbook',
                        '-p',
                        type=argparse.FileType('r'),
                        default='./deploy/playbook.yml',
                        help='directory where playbooks reside')
    parser.add_argument('--wait',
                        '-w',
                        default=False,
                        action='store_true',
                        help='wait for [Enter] before cleaning up')
    parser.add_argument('--deployment',
                        '-d',
                        type=str,
                        default='salvo',
                        help='deployment name for this salvo')
    parser.add_argument('--set',
                        '-s',
                        nargs='*',
                        type=str,
                        help='key:value pair to set for this salvo execution')
    parser.add_argument('--dry-run',
                        '-n',
                        action='store_true',
                        default=False,
                        help='only print what actions would be taken')
    args = parser.parse_args(argv)

    # "key:value" items become a dict; only the first colon separates.
    args.set = dict(item.split(":", maxsplit=1)
                    for item in args.set) if args.set is not None else {}

    topology = Topology.load_file(args.config, args.set)
    # The HQ cluster always comes first and exposes only SSH.
    hq_cluster = Cluster('hq', {
        'expose': [22],
    }, {})
    topology.clusters = [hq_cluster] + topology.clusters

    agenda.section("Set up network")

    client = boto3.client('ec2')
    ec2 = boto3.resource('ec2')

    # Set up VPC
    agenda.task("Create VPC")
    vpc = client.create_vpc(DryRun=args.dry_run, CidrBlock='10.0.0.0/16')
    vpc = ec2.Vpc(vpc['Vpc']['VpcId'])

    agenda.task("Attach VPC internet gateway")
    gateway = client.create_internet_gateway(DryRun=args.dry_run)
    gateway = ec2.InternetGateway(
        gateway['InternetGateway']['InternetGatewayId'])
    gateway.attach_to_vpc(DryRun=args.dry_run, VpcId=vpc.id)

    agenda.task("Create internet-enabled route table")
    iroutable = vpc.create_route_table(DryRun=args.dry_run)
    iroutable.create_route(DryRun=args.dry_run,
                           DestinationCidrBlock='0.0.0.0/0',
                           GatewayId=gateway.id)

    subnets = []
    secs = []
    for i, c in enumerate(topology.clusters):
        agenda.task("Allocate subnet #{}".format(i + 1))
        subnet = vpc.create_subnet(DryRun=args.dry_run,
                                   CidrBlock='10.0.{}.0/24'.format(i))

        if c.internet:
            agenda.subtask("Hook in internet-enable route table")
            iroutable.associate_with_subnet(DryRun=args.dry_run,
                                            SubnetId=subnet.id)

        # set up security groups
        agenda.subtask("Create network security group")
        sec = vpc.create_security_group(
            DryRun=args.dry_run,
            GroupName='{}-cluster-{}'.format(args.deployment, i + 1),
            Description='Ingress rules for cluster {}-{}'.format(
                args.deployment, c.name))
        # allow all internal traffic
        sec.authorize_ingress(DryRun=args.dry_run,
                              IpProtocol='tcp',
                              FromPort=1,
                              ToPort=65535,
                              CidrIp='10.0.0.0/16')

        if c.expose is not False:
            for p in c.expose:
                agenda.subtask("Allow ingress traffic on port {}".format(p))
                sec.authorize_ingress(DryRun=args.dry_run,
                                      IpProtocol='tcp',
                                      FromPort=p,
                                      ToPort=p,
                                      CidrIp='0.0.0.0/0')

        secs.append(sec)
        subnets.append(subnet)

    # Tag all our VPC resources
    agenda.task("Tag all VPC resources")
    ec2.create_tags(DryRun=args.dry_run,
                    Resources=[
                        vpc.id,
                        gateway.id,
                        iroutable.id,
                    ] + [sn.id for sn in subnets] + [sg.id for sg in secs],
                    Tags=[{
                        'Key': 'salvo',
                        'Value': args.deployment,
                    }])

    # Create access keys
    agenda.task("Generate VPC key pair")
    try:
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)
    except botocore.exceptions.ClientError:
        # Key probably already exists. Delete and re-create.
        agenda.subfailure("Could not create key pair")
        agenda.subtask("Attempting to delete old key pair")
        client.delete_key_pair(DryRun=args.dry_run, KeyName=args.deployment)
        agenda.subtask("Attempting to generate new key pair")
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)

    keymat = keys['KeyMaterial']
    keys = ec2.KeyPair(keys['KeyName'])

    agenda.section("Launch instances")

    # Launch instances
    clusters = []
    for i, c in enumerate(topology.clusters):
        nics = [{
            "DeviceIndex": 0,
            "Groups": [secs[i].id],
            "SubnetId": subnets[i].id,
            "DeleteOnTermination": True,
            "AssociatePublicIpAddress": c.internet,
        }]

        agenda.task("Launching {} instances in cluster {}".format(
            c.attrs['count'], c.name))
        launched = client.run_instances(
            DryRun=args.dry_run,
            KeyName=keys.name,
            NetworkInterfaces=nics,
            ImageId=c.attrs['image'],
            MinCount=c.attrs['count'],
            MaxCount=c.attrs['count'],
            InstanceType=c.attrs['itype'],
            InstanceInitiatedShutdownBehavior='terminate')['Instances']
        clusters.append(
            [ec2.Instance(instance['InstanceId']) for instance in launched])

    # Stays 1 unless the deployment itself runs and reports its own status.
    exit_code = 1
    try:
        agenda.task("Wait for HQ to start running")

        hq = clusters[0][0]
        while hq.state['Name'] == 'pending':
            agenda.subtask("Still in 'pending' state")
            sleep(3)
            hq.load()

        if hq.state['Name'] != 'running':
            agenda.failure(hq.state_reason['Message'])
            raise ChildProcessError(hq.state_reason['Message'])

        def prepare(ci, instance):
            # ``hq`` is read from the enclosing scope. It is a local of
            # main(), so declaring it ``global`` here raised NameError
            # inside the pool worker.
            print("instance {} in {} now available through {}".format(
                instance.private_ip_address, topology.clusters[ci].name,
                hq.public_ip_address))

        agenda.task("Wait for workers to reach 'running' state")

        done = []
        # NOTE(review): a nested function cannot be pickled, so this only
        # works if Pool is thread-based; confirm against the file's imports.
        # Results are not collected, so errors in ``prepare`` stay silent.
        p = Pool(5)
        pending = True
        while pending:
            pending = False
            for i, cluster in enumerate(clusters):
                for ii, instance in enumerate(cluster):
                    if instance.state['Name'] == 'pending':
                        agenda.subtask(
                            "Instance {}.{} is still pending".format(
                                i + 1, ii + 1))

                        pending = True
                        instance.load()
                        break
                    elif instance.state['Name'] != 'running':
                        agenda.subfailure("Instance {}.{} failed: {}".format(
                            i + 1, ii + 1, instance.state_reason['Message']))
                        raise ChildProcessError(
                            instance.state_reason['Message'])
                    else:
                        # State is now 'running'
                        tag = (i, ii)
                        if tag not in done:
                            # State hasn't been 'running' before
                            done.append(tag)
                            p.apply_async(prepare, [i, instance])

                if pending:
                    break
            sleep(3)
        p.close()
        p.join()

        agenda.task("Wait for HQ to become pingable")

        # Wait for hq to be pingable
        deployment = Deployer(args.playbook.name, topology, keymat, clusters)
        while not deployment.test(hq.public_ip_address):
            sleep(1)

        agenda.task("Wait for workers to become pingable")

        # Wait for workers to be pingable
        for cluster in clusters:
            for instance in cluster:
                while not deployment.test(instance.private_ip_address):
                    sleep(1)

        # Deploy!
        agenda.section("Deploy application")
        exit_code = deployment.deploy()
    except BaseException:
        # Deliberately catches everything, Ctrl-C included: whatever went
        # wrong, report it and fall through to the cleanup below.
        traceback.print_exc()
    finally:
        agenda.section("Clean up VPC")

        if args.wait:
            agenda.prompt("Press [Enter] when you are ready to clean")
            input()

        # Terminate instances and delete VPC resources
        agenda.task("Terminate all instances")
        instances = list(vpc.instances.all())
        vpc.instances.terminate(DryRun=args.dry_run)
        still_running = True
        while still_running:
            still_running = False
            for inst in instances:
                inst.load()
                if inst.state['Name'] != 'terminated':
                    agenda.subtask("At least one instance still shutting down")
                    still_running = True
                    sleep(3)
                    break

        agenda.task("Delete network resources")
        agenda.subtask("key pair")
        keys.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route associations")
        for r in iroutable.associations.all():
            r.delete(DryRun=args.dry_run)
        agenda.subtask("internet-enabled route table")
        iroutable.delete(DryRun=args.dry_run)
        agenda.subtask("internet gateway")
        gateway.detach_from_vpc(DryRun=args.dry_run, VpcId=vpc.id)
        gateway.delete(DryRun=args.dry_run)
        agenda.subtask("subnets")
        for sn in subnets:
            # Handle each subnet on its own so that one failure does not
            # leave the remaining subnets behind.
            try:
                sn.delete(DryRun=args.dry_run)
            except Exception:
                agenda.subfailure("failed to delete subnet:")
                traceback.print_exc()
        agenda.subtask("security groups")
        for sg in secs:
            # honour --dry-run like every other cleanup call
            sg.delete(DryRun=args.dry_run)
        agenda.subtask("network interfaces")
        for nic in vpc.network_interfaces.all():
            nic.delete(DryRun=args.dry_run)

        agenda.task("Delete the VPC")
        vpc.delete(DryRun=args.dry_run)

    return exit_code