def main(argv):
    """Work out which regions to operate on from the command line.

    argv[0] is the program name. The remaining arguments are either the
    single word "all" or an explicit list of region names. When no region
    argument is given, the usage text is printed and the process exits
    with status 1.
    """
    if len(argv) == 1:
        print("Usage: %s [region]+" % argv[0])
        print(" region: all or some of %s" % " ".join(Ec2Region.All()))
        sys.exit(1)

    # Note: Not sure if I want to parameterize the cluster name too. It can be
    # generated dynamically.

    if argv[1] == "all":
        regions = Ec2Region.All()
    else:
        # Everything after the program name is a region name.
        regions = list(argv[1:])

    # EC2 instance types
    #
    # 4 vCPUs, 7.5 Gib RAM, EBS only, $0.209 per Hour
    # "The specified instance type can only be used in a VPC. A subnet ID or
    # network interface ID is required to carry out the request."
    # ec2_type = "c4.xlarge"
    #
    # 4 vCPUs, 7.5 Gib RAM, 2 x 40 SSD, $0.21 per Hour
    # ec2_type = "c3.xlarge"
    #
    # For fast development
    ec2_type = "c3.4xlarge"
def MeasureMetadataXdcTraffic(q):
    """Enqueue two requests on q: partial replication, then full replication.

    Both requests share one attribute dict. It is passed to _EnqReq, then
    three keys are overwritten in place and the same dict is passed again.
    """
    Cons.P("regions: %s" % ",".join(Ec2Region.All()))

    req_attrs = {
        "regions": Ec2Region.All(),
        # Partial replication metadata is exchanged
        "acorn-youtube.replication_type": "partial",
        # Objects are fully replicated
        "acorn_options.full_replication": "true",
        "acorn-youtube.fn_youtube_reqs": "tweets-010",
        "acorn-youtube.youtube_extra_data_size": "10240",
        # Request all
        "acorn-youtube.max_requests": "-1",
        "acorn-youtube.simulation_time_dur_in_ms": "1800000",
    }
    _EnqReq(q, req_attrs)

    # Full replication, of course without any acorn metadata exchange
    # NOTE(review): the "******" value looks like a scrubbed literal -- confirm
    # the intended value.
    req_attrs.update({
        "acorn-youtube.replication_type": "full",
        "acorn_options.use_attr_user": "******",
        "acorn_options.use_attr_topic": "false",
    })
    _EnqReq(q, req_attrs)
def CanLaunchAnotherCluster():
    """Return True, and count one more node per region, if there is room.

    There is room when every region has a recorded node count and that
    count is below 12. On success each region's recorded count is
    incremented by one. Otherwise False is returned and nothing is changed.
    The check and the increment both happen while holding
    _num_nodes_per_region_lock.
    """
    with _num_nodes_per_region_lock:
        counts = [_num_nodes_per_region.get(region)
                  for region in Ec2Region.All()]
        # A missing count (None) is treated the same as a full region.
        if any(c is None or c >= 12 for c in counts):
            return False
        #JobControllerLog.P("%s %s" % (Util.FileLine(), pprint.pformat(_num_nodes_per_region)))

        # You can launch another cluster now
        for region in Ec2Region.All():
            _num_nodes_per_region[region] += 1
        return True
def ByRepModels(q):
    """Enqueue one short partial-replication request on q.

    The request caps the run at 100000 requests and 10000 ms of simulation
    time instead of the defaults noted in the comments below.
    """
    # UT
    req_attrs = {
        "init_script": "acorn-server",
        "regions": Ec2Region.All(),
        # Partial replication metadata is exchanged
        "acorn-youtube.replication_type": "partial",
        "acorn-youtube.fn_youtube_reqs": "tweets-010",
        # Default is 10240
        #"acorn-youtube.youtube_extra_data_size": "10240",
        # Default is -1 (request all)
        #"acorn-youtube.max_requests": "-1",
        "acorn-youtube.max_requests": "100000",
        # Default is 1800000
        #"acorn-youtube.simulation_time_dur_in_ms": "1800000",
        "acorn-youtube.simulation_time_dur_in_ms": "10000",
        # Default is true, true
        # NOTE(review): the "******" value looks like a scrubbed literal --
        # confirm the intended value.
        "acorn_options.use_attr_user": "******",
        "acorn_options.use_attr_topic": "true",
    }
    _EnqReq(q, req_attrs)
def ByJobIdTermSelfLast():
    """Run one _TermInst per region for this job's job_id and print results.

    The job_id comes from Ec2Util.JobId(). _TermInst is initialized with
    term_by_job_id_self_last=True, one worker thread per region runs
    _TermInst.Run, and after all of them finish a header and each region's
    result are printed.
    """
    job_id = Ec2Util.JobId()
    Cons.P("Terminating running instances of job_id %s" % job_id)

    _TermInst.Init(term_by_job_id_self_last=True)

    tags = {"job_id": job_id}
    terminators = [_TermInst(region, tags) for region in Ec2Region.All()]

    workers = []
    for term in terminators:
        worker = threading.Thread(target=term.Run)
        worker.daemon = True
        workers.append(worker)
        worker.start()

    # Wait for every region before printing anything.
    for worker in workers:
        worker.join()
    print("")

    Cons.P(_TermInst.Header())
    for term in terminators:
        term.PrintResult()
def RunTermInst(tags):
    """Run one TermInst per region for the given tags and print a table.

    A daemon thread per region runs TermInst.Run. Once all threads have
    been joined, a header built from _fmt is printed followed by each
    region's result.
    """
    sys.stdout.write("Terminating running instances:")
    sys.stdout.flush()

    terminators = [TermInst(region, tags) for region in Ec2Region.All()]

    workers = []
    for term in terminators:
        worker = threading.Thread(target=term.Run)
        worker.daemon = True
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()
    print("")

    header = Util.BuildHeader(
        _fmt,
        "Region"
        " InstanceId"
        " PrevState"
        " CurrState")
    Cons.P(header)

    for term in terminators:
        term.PrintResult()
def main(argv):
    """Copy an AMI from us-east-1 to every other region and print the ID map.

    Args:
        argv: [program name, AMI ID in us-east-1, name for the copies].

    Raises:
        RuntimeError: when the argument count is wrong, or when the output
            of "aws ec2 copy-image" does not contain a parsable ImageId.

    On success a JSON-like {region: ami-id} map, sorted by region, is
    printed to stdout.
    """
    # Function-scope import so this block does not depend on the file's
    # import list.
    import re

    if len(argv) != 3:
        raise RuntimeError("Usage: %s ami-id-in-us-east-1 name\n" \
            " E.g.: %s ami-01d12c17 mutant-server-170129-1600" \
            % (argv[0], argv[0]))

    region_ami = {"us-east-1": argv[1]}

    # Build a filtered copy instead of calling remove() on the list returned
    # by Ec2Region.All(), so that list is never modified here.
    regions = [r for r in Ec2Region.All() if r != "us-east-1"]

    # Capture the whole ID up to the closing quote. AMI IDs are not a fixed
    # width (both "ami-a46623c4" and 17-character IDs exist), so a
    # fixed-length slice would truncate the longer form.
    ami_id_pattern = re.compile(r'"ImageId": "(ami-[0-9a-f]+)"')

    for r in regions:
        cmd = "aws ec2 copy-image" \
            " --source-image-id %s" \
            " --source-region us-east-1" \
            " --region %s" \
            " --name %s" \
            % (argv[1], r, argv[2])
        out = Util.RunSubp(cmd)

        ami_id = None
        for line in out.split("\n"):
            if "\"ImageId\": \"ami-" not in line:
                continue
            m = ami_id_pattern.search(line)
            if m is None:
                raise RuntimeError("Unexpected line=[%s]" % line)
            ami_id = m.group(1)
            break
        if ami_id is None:
            raise RuntimeError("Unexpected output=[%s]" % out)
        region_ami[r] = ami_id

    print("{\n%s\n}" % ("\n, ".join(
        ["\"%s\": \"%s\"" % (k, v) for (k, v) in sorted(region_ami.items())])))
def main(argv):
    """List old AMIs and snapshots per region and delete them on confirmation.

    One ImageSnapshotCleaner is created per region. Each phase (GetImages,
    GetSnapshots, DeleteOldAmisSnapshots) is invoked on every cleaner before
    any cleaner is joined. If there is nothing to delete, or the user does
    not answer "y" (case-insensitive), nothing is deleted.
    """
    cleaners = [ImageSnapshotCleaner(region=region)
                for region in Ec2Region.All()]

    for cleaner in cleaners:
        cleaner.GetImages()
    for cleaner in cleaners:
        cleaner.Join()

    for cleaner in cleaners:
        cleaner.GetSnapshots()
    for cleaner in cleaners:
        cleaner.Join()

    for cleaner in cleaners:
        cleaner.PrintWhatToKeepAndDelete()

    num_AMIs_to_delete = sum(
        len(cleaner.imgs_myproj_to_delete) for cleaner in cleaners)
    if num_AMIs_to_delete == 0:
        Cons.P("Nothing to delete")
        return

    answer = raw_input("Would you like to proceed (Y/N)? ")
    if answer.lower() != "y":
        return

    Cons.P("")
    Cons.P("Deregistering Amis and deleting snapshots ...")
    for cleaner in cleaners:
        cleaner.DeleteOldAmisSnapshots()
    for cleaner in cleaners:
        cleaner.Join()
def __init__(self, az_or_region):
    """Store the AZ (if given), the region name, and the region's latest AMI.

    An argument ending in a lowercase letter is taken to be an availability
    zone, and the region name is that string without its last character.
    Anything else is taken to be a region name and self.az is set to None.
    """
    ends_with_letter = re.match(r".*[a-z]$", az_or_region)
    if not ends_with_letter:
        self.az = None
        self.region_name = az_or_region
    else:
        self.az = az_or_region
        # Drop the trailing zone letter to get the region name.
        self.region_name = self.az[:-1]

    self.ami_id = Ec2Region.GetLatestAmiId(self.region_name)
def Run(tags=None):
    """Describe instances in every region and print them as a sorted table.

    One DescInstPerRegion per region is run on its own daemon thread. When
    the regions report zero instances in total, "No instances found." is
    printed and the function returns. Otherwise a header built from _fmt is
    printed, followed by all result rows in sorted order.
    """
    sys.stdout.write("desc_instances:")
    sys.stdout.flush()

    per_region = [DescInstPerRegion(region, tags)
                  for region in Ec2Region.All()]

    workers = []
    for desc in per_region:
        worker = threading.Thread(target=desc.Run)
        workers.append(worker)
        worker.daemon = True
        worker.start()

    for worker in workers:
        worker.join()
    print("")

    num_insts = sum(desc.NumInsts() for desc in per_region)
    if num_insts == 0:
        Cons.P("No instances found.")
        return

    print("")
    header = Util.BuildHeader(
        _fmt,
        "job_id"
        " Placement:AvailabilityZone"
        " InstanceId"
        #" InstanceType"
        #" LaunchTime"
        #" PrivateIpAddress"
        " PublicIpAddress"
        " State:Name"
        #" Tag:Name"
    )
    Cons.P(header)

    rows = []
    for desc in per_region:
        rows.extend(desc.GetResults())
    for row in sorted(rows):
        Cons.P(row)
def main(argv):
    """Run one Check per region in parallel and print region / max_inst pairs.

    Each Check runs on its own thread. After all threads are joined, one
    line per region is printed with the check's region and max_inst.
    """
    # NOTE(review): the original indentation was lost, so the extent of this
    # "with" block is an assumption -- confirm which statements belong in it.
    with Cons.MTnnl("Checking:"):
        checks = [Check(region) for region in Ec2Region.All()]

        workers = []
        for check in checks:
            worker = threading.Thread(target=check.Run)
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
        print("")

        for check in checks:
            Cons.P("%-14s %2d" % (check.region, check.max_inst))
def GetServerPubIpsByJobId(job_id):
    """Return the IPs reported by every region for instances tagged job_id.

    One _DescInst per region is run on its own thread. After all threads
    finish, each region's GetIPs() result is concatenated in region order.
    """
    per_region = [_DescInst(region, {"job_id": job_id})
                  for region in Ec2Region.All()]

    workers = []
    for desc in per_region:
        worker = threading.Thread(target=desc.Run)
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    all_ips = []
    for desc in per_region:
        all_ips.extend(desc.GetIPs())
    return all_ips
def GetByTags(tags):
    """Return the IP reported by each region for instances matching tags.

    One DescInst per region is run on its own thread. After all threads
    finish, each region's GetIp() result is collected in region order.
    Regions whose GetIp() returns None are left out.

    Args:
        tags: passed unchanged to each DescInst.

    Returns:
        A list of the non-None GetIp() values.
    """
    dis = [DescInst(r, tags) for r in Ec2Region.All()]

    threads = []
    for di in dis:
        t = threading.Thread(target=di.Run)
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    ips = []
    for di in dis:
        ip = di.GetIp()
        # Identity test: None is a singleton, and "==" can be overridden.
        if ip is None:
            continue
        ips.append(ip)
    return ips
def GetInstDescs(tags=None):
    """Return the instance descriptions gathered from every region.

    One DescInstPerRegion per region is run on its own thread. After all
    threads finish, each region's GetInstDesc() result is concatenated in
    region order.
    """
    sys.stdout.write("desc_instances:")
    sys.stdout.flush()

    per_region = [DescInstPerRegion(region, tags)
                  for region in Ec2Region.All()]

    workers = []
    for desc in per_region:
        worker = threading.Thread(target=desc.Run)
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()
    print("")

    inst_descs = []
    for desc in per_region:
        inst_descs += desc.GetInstDesc()
    return inst_descs
def ByTags(tags, job_id_none_requested):
    """Run one _TermInst per region for the given tags and print the results.

    job_id_none_requested is forwarded to _TermInst.Init. A daemon thread
    per region runs _TermInst.Run. After all threads are joined, the header
    and each region's result are printed.
    """
    Cons.Pnnl("Terminating running instances:")
    _TermInst.Init(job_id_none_requested)

    terminators = [_TermInst(region, tags) for region in Ec2Region.All()]

    workers = []
    for term in terminators:
        worker = threading.Thread(target=term.Run)
        worker.daemon = True
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()
    print("")

    Cons.P(_TermInst.Header())
    for term in terminators:
        term.PrintResult()
def main(argv):
    """List old AMIs and snapshots per region, then delete them unprompted.

    One ImageSnapshotCleaner is created per region. Each phase (GetImages,
    GetSnapshots, DeleteOldAmisSnapshots) is invoked on every cleaner before
    any cleaner is joined. There is no confirmation step before deletion.
    """
    cleaners = [ImageSnapshotCleaner(region=region)
                for region in Ec2Region.All()]

    for cleaner in cleaners:
        cleaner.GetImages()
    for cleaner in cleaners:
        cleaner.Join()

    for cleaner in cleaners:
        cleaner.GetSnapshots()
    for cleaner in cleaners:
        cleaner.Join()

    for cleaner in cleaners:
        cleaner.PrintWhatToKeepAndDelete()

    Cons.P("")
    Cons.P("Deregistering Amis and deleting snapshots ...")
    for cleaner in cleaners:
        cleaner.DeleteOldAmisSnapshots()
    for cleaner in cleaners:
        cleaner.Join()
def _DescInst(self):
    """Describe instances in all regions, update node counts, and report.

    Steps, in order:
      1. Run one DescInstPerRegion per region on daemon threads, polling
         self.stop_requested every 0.1 s so a stop request returns early.
      2. Under _num_nodes_per_region_lock, update the per-region node count.
      3. Write a report to self.dio, grouped by job_id, with instances that
         have no job_id listed last.
      4. Pass the job_id grouping to ClusterCleaner.Clean.

    NOTE(review): the original indentation was lost. The nesting below
    (in particular what sits inside the final "else") is a reconstruction
    and should be confirmed.
    """
    if self.mode == "run_until_stopped":
        self.dio.P("\n")
    self.dio.P("Describing instances:")

    DescInstPerRegion.Reset()

    dis = []
    for r in Ec2Region.All():
        dis.append(DescInstPerRegion(r, self.dio))

    # Kept on self rather than in a local.
    self.per_region_threads = []
    for di in dis:
        t = threading.Thread(target=di.Run)
        self.per_region_threads.append(t)
        t.daemon = True
        t.start()

    # Exit immediately when requested
    for t in self.per_region_threads:
        while t.isAlive():
            if self.stop_requested:
                return
            # Short timeout so stop_requested is re-checked regularly.
            t.join(0.1)

    self.dio.P("\n")

    num_insts = 0
    with _num_nodes_per_region_lock:
        for di in dis:
            num_insts += len(di.Instances())
            # Decrease slowly: when the reported count is below the tracked
            # value, the tracked value drops by only 0.2 per call. You don't
            # want a sudden increase in the capacity. Increases (and the
            # first observation) take the reported count as is.
            n = _num_nodes_per_region.get(di.region)
            if n is None:
                n = len(di.Instances())
            else:
                if len(di.Instances()) < n:
                    n -= 0.2
                else:
                    n = len(di.Instances())
            _num_nodes_per_region[di.region] = n

    if num_insts == 0:
        self.dio.P("No instances found.\n")
    else:
        self.dio.P("#"
                   " job_id"
                   " (Placement:AvailabilityZone"
                   " InstanceId"
                   " PublicIpAddress"
                   " State:Name) ...\n")

        # Group by job_id. Only for those with job_ids
        # { job_id: {region: Inst} }
        # Only one instance is kept per (job_id, region); a later one
        # replaces an earlier one.
        jobid_inst = {}
        # Instances without any job_id
        # { region: [Inst] }
        nojobid_inst = {}
        num_nojobid_inst = 0
        for di in dis:
            for i in di.Instances():
                if i.job_id is not None:
                    if i.job_id not in jobid_inst:
                        jobid_inst[i.job_id] = {}
                    jobid_inst[i.job_id][i.region] = i
                else:
                    if i.region not in nojobid_inst:
                        nojobid_inst[i.region] = []
                    nojobid_inst[i.region].append(i)
                    num_nojobid_inst += 1

        ClusterCleaner.Clean(jobid_inst)

        # One output line per job: job_id, region count, then one
        # parenthesized entry per region, wrapped at DIO.max_column_width.
        for job_id, v in sorted(jobid_inst.iteritems()):
            self.dio.P("%s %d" % (job_id, len(v)))
            for k1, i in sorted(v.iteritems()):
                #msg = " (%s %s %s %s)" % (i.az, i.inst_id, i.public_ip, i.state)
                msg = " (%s %s %s)" % (i.az, i.public_ip, i.state)
                if self.dio.LastLineWidth() + len(msg) > DIO.max_column_width:
                    self.dio.P("\n ")
                self.dio.P(msg)
            self.dio.P("\n")

        # Instances without a job_id are listed together on one line.
        if len(nojobid_inst) > 0:
            self.dio.P("%-13s %d" % ("no-job-id", num_nojobid_inst))
            for region, insts in sorted(nojobid_inst.iteritems()):
                for i in insts:
                    msg = " (%s %s %s)" % (i.az, i.public_ip, i.state)
                    if self.dio.LastLineWidth() + len(msg) > DIO.max_column_width:
                        self.dio.P("\n ")
                    self.dio.P(msg)
            self.dio.P("\n")

    if self.mode == "run_until_stopped":
        self.dio.P("Time since the last msg: %s" % (datetime.datetime.now() - self.desc_inst_start_time))
    self.dio.Flush()
def _ReqSpotInst(self):
    """Request one one-time spot instance and store its request ID.

    The launch specification uses the latest AMI for self.region and
    self.ami_name, self.inst_type, self.az and a boot script passed as
    base64-encoded user data. The request is retried every 5 seconds while
    the API answers RequestLimitExceeded. Any other ClientError is
    re-raised. On success self.spot_req_id is set.

    Raises:
        RuntimeError: when the response does not contain exactly one spot
            instance request.
    """
    # This is run as root
    #
    # http://unix.stackexchange.com/questions/4342/how-do-i-get-sudo-u-user-to-use-the-users-env
    user_data_template = \
"""#!/bin/bash
cd /home/ubuntu/work
rm -rf /home/ubuntu/work/acorn-tools
sudo -i -u ubuntu bash -c 'git clone https://github.com/hobinyoon/acorn-tools.git /home/ubuntu/work/acorn-tools'
sudo -i -u ubuntu /home/ubuntu/work/acorn-tools/ec2/ec2-init.py {0} {1} {2} {3}
"""
    user_data = user_data_template.format(
        self.tags["init_script"],
        self.jr_sqs_url,
        self.jr_sqs_msg_receipt_handle,
        self.num_regions)

    ls = {
        'ImageId': Ec2Region.GetLatestAmiId(region=self.region,
                                            name=self.ami_name),
        #'KeyName': 'string',
        'SecurityGroups': ["cass-server"],
        'UserData': base64.b64encode(user_data),
        #'AddressingType': 'string',
        'InstanceType': self.inst_type,
        'EbsOptimized': True,
        'Placement': {'AvailabilityZone': self.az},
    }

    while True:
        try:
            resp = BotoClient.Get(self.region).request_spot_instances(
                SpotPrice=str(self.max_price),
                #ClientToken='string',
                InstanceCount=1,
                Type='one-time',
                #ValidFrom=datetime(2015, 1, 1),
                #ValidUntil=datetime(2015, 1, 1),
                #LaunchGroup='string',
                #AvailabilityZoneGroup='string',

                # https://aws.amazon.com/blogs/aws/new-ec2-spot-blocks-for-defined-duration-workloads/
                #BlockDurationMinutes=123,
                LaunchSpecification=ls,
            )
            self.log.P("SpotInstReqResp: %s" % pprint.pformat(resp))

            spot_reqs = resp["SpotInstanceRequests"]
            if len(spot_reqs) != 1:
                raise RuntimeError("len(r[\"SpotInstanceRequests\"])=%d"
                                   % len(resp["SpotInstanceRequests"]))
            self.spot_req_id = spot_reqs[0]["SpotInstanceRequestId"]
            self.log.P("region=%s spot_req_id=%s"
                       % (self.region, self.spot_req_id))
            break
        except botocore.exceptions.ClientError as e:
            # Only throttling is retried; everything else propagates.
            if e.response["Error"]["Code"] != "RequestLimitExceeded":
                raise e
            self.log.P("region=%s error=%s" % (self.region, e))
            time.sleep(5)
def _DescInst(dio):
    """Describe instances in all regions and write a table of them to dio.

    One DescInstPerRegion per region is run on a daemon thread. The
    module-level _stop_requested flag is checked before starting the
    threads, every 0.1 s while waiting for them, and again before the final
    output; when it is set the function returns early. In "run_once" mode
    the process exits with status 0 after the table is written.

    NOTE(review): the original indentation was lost; the nesting below is a
    reconstruction and should be confirmed. As reconstructed, the
    "run_once" exit happens before dio.Flush() -- confirm that dio.P does
    not rely on Flush() to emit output.
    """
    if _mode == "run_until_stopped":
        dio.P("\n")
    dio.P("# Describing instances:")

    DescInstPerRegion.Reset()

    region_desc_inst = {}
    for r in Ec2Region.All():
        region_desc_inst[r] = DescInstPerRegion(r, dio)

    if _stop_requested:
        return

    threads = []
    for r, di in region_desc_inst.iteritems():
        t = threading.Thread(target=di.Run)
        threads.append(t)
        t.daemon = True
        t.start()

    # Exit immediately when requested
    for t in threads:
        while t.isAlive():
            if _stop_requested:
                return
            # Short timeout so _stop_requested is re-checked regularly.
            t.join(0.1)
    dio.P("\n#\n")

    num_insts = 0
    for r, di in region_desc_inst.iteritems():
        num_insts += len(di.Instances())
    if num_insts == 0:
        dio.P("No instances found.\n")
    else:
        # Header
        # The same format string is used for the header and for each row.
        fmt = "%-15s %13s %-10s %6.4f %2s %19s %15s %13s"
        dio.P(Util.BuildHeader(fmt,
                               "az"
                               " job_id"
                               " inst_type"
                               " cur_spot_price"
                               " name"
                               " InstanceId"
                               " PublicIpAddress"
                               " State:Name") + "\n")

        # Rows are emitted in sorted region order.
        for r, di in sorted(region_desc_inst.iteritems()):
            for i in di.Instances():
                # Note: could be grouped by job_id later
                # "server" / "client" in the name are shortened to "s" / "c".
                dio.P((fmt + "\n") % (i.az, i.job_id, i.inst_type, SpotPrice.GetCur(i.az, i.inst_type), i.name.replace("server", "s").replace("client", "c"), i.inst_id, i.public_ip, i.state))
    if _mode == "run_once":
        sys.exit(0)

    # Note: JobCleaner could use this node info

    if _stop_requested:
        return

    if _mode == "run_until_stopped":
        # Since the last JobContConsole output
        dio.P("# Time since the last msg: %s" % (datetime.datetime.now() - _desc_inst_start_time))
    dio.Flush()
import time import traceback sys.path.insert(0, "%s/../../util/python" % os.path.dirname(__file__)) import Cons import Util sys.path.insert(0, "%s/.." % os.path.dirname(__file__)) import BotoClient import Ec2Region import ClusterCleaner import JobControllerLog # Initialize all values to None _num_nodes_per_region = dict.fromkeys(Ec2Region.All()) _num_nodes_per_region_lock = threading.Lock() def CanLaunchAnotherCluster(): # Returns True when all regions have less than 12 instances. with _num_nodes_per_region_lock: for r in Ec2Region.All(): v = _num_nodes_per_region.get(r) if v is None: return False if v >= 12: return False #JobControllerLog.P("%s %s" % (Util.FileLine(), pprint.pformat(_num_nodes_per_region))) # You can launch another cluster now