def main():
    """Entry point: collect instances and volumes from the requested
    regions and generate a usage report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    # BUG FIX: help text said "Supress logging messages" (copied from a
    # --quiet flag); -v actually enables verbose (DEBUG) logging.
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Increase logging verbosity")
    parser.add_argument("--events-dir", dest="events_dir",
                        help="cloudtrail logs event directory")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.ERROR)
    if not args.regions:
        args.regions = DEFAULT_REGIONS
    all_instances = []
    all_volumes = []
    for region in args.regions:
        conn = get_aws_connection(region)
        all_instances.extend(get_all_instances(conn))
        all_volumes.extend(conn.get_all_volumes())
    # NOTE(review): only the connection for the *last* region is passed on;
    # confirm generate_report does not use it for region-specific lookups.
    generate_report(connection=conn,
                    regions=args.regions,
                    instances=all_instances,
                    volumes=all_volumes,
                    events_dir=args.events_dir)
def sanity_check(regions):
    """Cancel stale or broken spot requests in the given regions.

    Open/failed requests whose status code is in CANCEL_STATUS_CODES are
    tagged with the reason and cancelled; unknown status codes are logged
    as errors.  Active requests older than 30 minutes that have no running
    instance are cancelled as well.
    """
    spot_requests = []
    for r in regions:
        conn = get_aws_connection(r)
        region_spot_requests = conn.get_all_spot_instance_requests()
        if region_spot_requests:
            spot_requests.extend(region_spot_requests)
    all_spot_instances = aws_get_all_instances(regions)
    instance_ids = [i.id for i in all_spot_instances]
    for req in spot_requests:
        if req.state in ["open", "failed"]:
            if req.status.code in CANCEL_STATUS_CODES:
                log.info("Cancelling request %s", req)
                retry_aws_request(req.add_tag, "moz-cancel-reason",
                                  req.status.code)
                req.cancel()
            elif req.status.code not in IGNORABLE_STATUS_CODES:
                # BUG FIX: message said "Uknown"
                log.error("Unknown status for request %s: %s", req,
                          req.status.code)
        # Cancel all active requests older than 30 mins without running
        # instances
        elif req.state == "active" and \
                parse_aws_time(req.create_time) + 30 * 60 < time.time() and \
                req.instance_id not in instance_ids:
            log.info("Cancelling request %s: %s is not running", req,
                     req.instance_id)
            retry_aws_request(req.add_tag, "moz-cancel-reason",
                              "no-running-instances")
            req.cancel()
def verify(hosts, config, region, ignore_subnet_check=False):
    """ Check DNS entries and IP availability for hosts

    For each host, verify that the name is still free, that it has an A
    record, that the PTR record matches the FQDN, that the reserved IP
    is not already in use and (unless ignore_subnet_check) that the IP
    belongs to one of the configured subnets.

    Raises RuntimeError if any check fails.
    """
    passed = True
    conn = get_aws_connection(region)
    for host in hosts:
        fqdn = "%s.%s" % (host, config["domain"])
        log.info("Checking name conflicts for %s", host)
        if not name_available(conn, host):
            log.error("%s has been already taken", host)
            passed = False
            continue
        log.debug("Getting IP for %s", fqdn)
        ip = get_ip(fqdn)
        if not ip:
            log.error("%s has no DNS entry", fqdn)
            passed = False
        else:
            log.debug("Getting PTR for %s", fqdn)
            ptr = get_ptr(ip)
            if ptr != fqdn:
                log.error("Bad PTR for %s", host)
                passed = False
            # BUG FIX: message said "availablility"
            log.debug("Checking %s availability", ip)
            if not ip_available(region, ip):
                log.error("IP %s reserved for %s, but not available", ip,
                          host)
                passed = False
            if not ignore_subnet_check:
                vpc = get_vpc(region)
                s_id = get_subnet_id(vpc, ip)
                if s_id not in config['subnet_ids']:
                    log.error("IP %s does not belong to assigned subnets",
                              ip)
                    passed = False
    if not passed:
        raise RuntimeError("Sanity check failed")
def main():
    """Update stored AMI status from all self-owned, available images."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    # BUG FIX: "Supress" -> "Suppress" in the help text
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Suppress logging messages")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if not args.quiet:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.ERROR)
    if not args.regions:
        args.regions = DEFAULT_REGIONS
    images = []
    for region in args.regions:
        conn = get_aws_connection(region)
        images.extend(
            conn.get_all_images(owners=["self"],
                                filters={"state": "available"}))
    update_ami_status(amis_to_dict(images))
def get_ami(region, moz_instance_type):
    """Return the newest self-owned AMI in *region* tagged with the given
    moz-type; "newest" means the greatest "moz-created" tag value."""
    connection = get_aws_connection(region)
    candidates = connection.get_all_images(
        owners=["self"], filters={"tag:moz-type": moz_instance_type})
    by_creation = sorted(candidates,
                         key=lambda image: image.tags.get("moz-created"))
    return by_creation[-1]
def main():
    """Apply a lifecycle action (stop/start/restart/enable/disable/
    terminate/status) to the named instances across the given regions."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    parser.add_argument("action", choices=["stop", "start", "restart",
                                           "enable", "disable", "terminate",
                                           "status"],
                        help="action to be performed")
    parser.add_argument("-m", "--comments", help="reason to disable")
    parser.add_argument("-n", "--dry-run", action="store_true",
                        help="Dry run mode")
    # BUG FIX: "Supress" -> "Suppress" in the help text
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Suppress logging messages")
    parser.add_argument("hosts", metavar="host", nargs="+",
                        help="hosts to be processed")
    parser.add_argument("-f", "--force", action="store_true",
                        help="Force action without prompting")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if not args.quiet:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.ERROR)
    if not args.regions:
        args.regions = DEFAULT_REGIONS
    for region in args.regions:
        conn = get_aws_connection(region)
        instances = conn.get_only_instances()
        for i in instances:
            name = i.tags.get('Name', '')
            instance_id = i.id
            if not i.private_ip_address:
                # Terminated instances have no IP address assigned
                log.debug("Skipping (terminated?) %s (%s)..." %
                          (name, instance_id))
                continue
            if name in args.hosts or instance_id in args.hosts:
                log.info("Found %s (%s)..." % (name, instance_id))
                if args.action == "start":
                    start(i, args.dry_run)
                elif args.action == "stop":
                    stop(i, args.dry_run)
                elif args.action == "restart":
                    restart(i, args.dry_run)
                elif args.action == "enable":
                    enable(i, args.dry_run)
                elif args.action == "disable":
                    disable(i, args.dry_run, args.comments)
                elif args.action == "terminate":
                    terminate(i, args.dry_run, args.force)
                elif args.action == "status":
                    status(i)
def main():
    """Terminate all running instances created from the given AMI IDs,
    after two interactive confirmations and a 60 second grace period."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", action="append", dest="regions")
    parser.add_argument("amis", metavar="AMI", nargs="+", help="AMI IDs")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose logging")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    log.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    regions = args.regions or DEFAULT_REGIONS
    doomed = []
    for region in regions:
        log.debug("working in %s", region)
        conn = get_aws_connection(region)
        found = conn.get_only_instances(filters={
            "image-id": args.amis,
            "instance-state-name": "running",
        })
        log.debug("got %s instances:\n%s", len(found), found)
        if found:
            doomed.extend(found)
    if not doomed:
        return
    log.info("Preparing to terminate the following %s instances:",
             len(doomed))
    for instance in doomed:
        log.info("%s (%s)", instance.id, instance.tags.get("Name"))
    # Two explicit confirmations before doing anything destructive.
    if raw_input("Are you sure you want to kill these? ^ y/N >") != "y":
        log.info("Exiting without any changes!")
        return
    if raw_input("ARE YOU SURE YOU WANT TO KILL THESE? ^"
                 " LAST WARNING!!! y/N >") != "y":
        log.info("Exiting without any changes!")
        return
    log.warn("The instances mentioned above are about to be terminated")
    log.warn("Waiting extra 60 seconds to make sure...")
    time.sleep(60)
    log.warn("Starting...")
    for instance in doomed:
        log.warn("Terminating %s...", instance)
        instance.terminate()
    log.warn("Done.")
def main():
    """Interactively terminate every running instance launched from the
    given AMI IDs, with a double confirmation and a 60 s safety delay."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", action="append", dest="regions")
    parser.add_argument("amis", metavar="AMI", nargs="+", help="AMI IDs")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose logging")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.INFO)
    regions = args.regions if args.regions else DEFAULT_REGIONS
    victims = []
    for region_name in regions:
        log.debug("working in %s", region_name)
        connection = get_aws_connection(region_name)
        matching = connection.get_only_instances(
            filters={"image-id": args.amis,
                     "instance-state-name": "running"})
        log.debug("got %s instances:\n%s", len(matching), matching)
        if matching:
            victims.extend(matching)
    if victims:
        log.info("Preparing to terminate the following %s instances:",
                 len(victims))
        for victim in victims:
            log.info("%s (%s)", victim.id, victim.tags.get("Name"))
        answer = raw_input("Are you sure you want to kill these? ^ y/N >")
        if answer != "y":
            log.info("Exiting without any changes!")
            return
        answer = raw_input("ARE YOU SURE YOU WANT TO KILL THESE? ^"
                           " LAST WARNING!!! y/N >")
        if answer != "y":
            log.info("Exiting without any changes!")
            return
        log.warn("The instances mentioned above are about to be terminated")
        log.warn("Waiting extra 60 seconds to make sure...")
        time.sleep(60)
        log.warn("Starting...")
        for victim in victims:
            log.warn("Terminating %s...", victim)
            victim.terminate()
        log.warn("Done.")
def main():
    """Build an AMI: load the per-region config, start a host instance,
    and create the image from it."""
    args = docopt(__doc__)
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(message)s")
    try:
        # BUG FIX: the config file was opened without ever being closed;
        # use a context manager to release the handle.
        config_path = "%s/%s.json" % (AMI_CONFIGS_DIR, args['--config'])
        with open(config_path) as config_file:
            config = json.load(config_file)[args['--region']]
    except KeyError:
        # Either the config file lacks this region or the name is wrong.
        log.error("unknown configuration")
        exit(1)
    connection = get_aws_connection(args['--region'])
    host_instance = create_instance(connection, args['INSTANCE_NAME'],
                                    config, args['--key-name'])
    create_ami(host_instance, args['--config'], config)
def main():
    """Verify A, PTR and CNAME DNS records for every instance in a region,
    fanning the checks out over a multiprocessing pool."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="region", required=True,
                        help="optional list of regions")
    # BUG FIX: help text said "Supress logging messages" (copied from a
    # --quiet flag); -v actually enables DEBUG logging.
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose logging")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    conn = get_aws_connection(args.region)
    pool = Pool()
    res = conn.get_all_instances()
    # Flatten the reservations into a single instance list
    instances = reduce(lambda a, b: a + b, [r.instances for r in res])
    a_checks = []
    ptr_checks = []
    cname_checks = []
    for i in instances:
        # TODO: ignore EB
        name = i.tags.get("Name")
        if not name:
            log.warning("%s has no Name tag, skipping...", i)
            continue
        fqdn = i.tags.get("FQDN")
        if not fqdn:
            log.warning("%s has no FQDN tag, skipping...", i)
            continue
        ip = i.private_ip_address
        if not ip:
            log.warning("%s no ip assigned, skipping...", i)
            continue
        cname = "%s.build.mozilla.org" % name
        a_checks.append([fqdn, ip])
        ptr_checks.append([fqdn, ip])
        cname_checks.append([fqdn, cname])
    pool.map(check_A, a_checks)
    pool.map(check_PTR, ptr_checks)
    pool.map(check_CNAME, cname_checks)
    pool.close()
    pool.join()
def do_request_ondemand_instance(region, price, ami_id, instance_type,
                                 ssh_key, user_data, bdm, nc, profile,
                                 moz_instance_type, name, fqdn):
    """Launch a single on-demand instance and tag it.

    NOTE: *price* is accepted for signature parity with the spot-request
    variant but is not used for on-demand launches.
    """
    connection = get_aws_connection(region)
    reservation = connection.run_instances(
        image_id=ami_id,
        key_name=ssh_key,
        instance_type=instance_type,
        user_data=user_data,
        block_device_map=bdm,
        network_interfaces=nc,
        instance_profile_name=profile,
        # terminate the instances on shutdown
        instance_initiated_shutdown_behavior="terminate",
    )
    launched = reservation.instances[0]
    return tag_ondemand_instance(launched, name, fqdn, moz_instance_type)
def do_request_spot_instance(region, price, ami_id, instance_type, ssh_key,
                             user_data, bdm, nc, profile, moz_instance_type,
                             name, fqdn):
    # Submit a single spot instance request and tag it with moz-type, Name
    # and FQDN.  Returns True once tagging succeeds; propagates unexpected
    # AWS errors.
    conn = get_aws_connection(region)
    sir = conn.request_spot_instances(
        price=str(price),
        image_id=ami_id,
        count=1,
        instance_type=instance_type,
        key_name=ssh_key,
        user_data=user_data,
        block_device_map=bdm,
        network_interfaces=nc,
        instance_profile_name=profile,
    )
    # Sleep for a little bit to prevent us hitting
    # InvalidSpotInstanceRequestID.NotFound right away
    time.sleep(0.5)
    # Retry tagging with capped exponential backoff: the request object is
    # created asynchronously, so the first few attempts may 404.
    max_tries = 10
    sleep_time = 5
    for i in range(max_tries):
        try:
            sir[0].add_tag("moz-type", moz_instance_type)
            # Name will be used to determine available slave names
            sir[0].add_tag("Name", name)
            sir[0].add_tag("FQDN", fqdn)
            return True
        except EC2ResponseError, e:
            if e.code == "InvalidSpotInstanceRequestID.NotFound":
                if i < max_tries - 1:
                    # Try again
                    log.debug("waiting for spot request")
                    time.sleep(sleep_time)
                    sleep_time = min(30, sleep_time * 1.5)
                    continue
        except BotoServerError, e:
            if e.code == "RequestLimitExceeded":
                if i < max_tries - 1:
                    # Try again
                    log.debug(
                        "request limit exceeded; sleeping and trying again")
                    time.sleep(sleep_time)
                    sleep_time = min(30, sleep_time * 1.5)
                    continue
            # Any other server error (or retries exhausted): re-raise.
            raise
def aws_get_reservations(regions):
    """ Return a mapping of (availability zone, ec2 instance type) -> count """
    log.debug("getting reservations for %s", regions)
    counts = {}
    for region in regions:
        connection = get_aws_connection(region)
        active = connection.get_all_reserved_instances(filters={
            'state': 'active',
        })
        for reservation in active:
            key = (reservation.availability_zone, reservation.instance_type)
            counts[key] = counts.get(key, 0) + reservation.instance_count
    return counts
def do_request_spot_instance(region, price, ami_id, instance_type, ssh_key,
                             user_data, bdm, nc, profile, moz_instance_type,
                             name, fqdn):
    # Submit a single spot instance request and tag it with moz-type, Name
    # and FQDN.  Returns True once tagging succeeds; propagates unexpected
    # AWS errors.
    conn = get_aws_connection(region)
    sir = conn.request_spot_instances(
        price=str(price),
        image_id=ami_id,
        count=1,
        instance_type=instance_type,
        key_name=ssh_key,
        user_data=user_data,
        block_device_map=bdm,
        network_interfaces=nc,
        instance_profile_name=profile,
    )
    # Sleep for a little bit to prevent us hitting
    # InvalidSpotInstanceRequestID.NotFound right away
    time.sleep(0.5)
    # Retry tagging with capped exponential backoff: the request object is
    # created asynchronously, so the first few attempts may 404.
    max_tries = 10
    sleep_time = 5
    for i in range(max_tries):
        try:
            sir[0].add_tag("moz-type", moz_instance_type)
            # Name will be used to determine available slave names
            sir[0].add_tag("Name", name)
            sir[0].add_tag("FQDN", fqdn)
            return True
        except EC2ResponseError, e:
            if e.code == "InvalidSpotInstanceRequestID.NotFound":
                if i < max_tries - 1:
                    # Try again
                    log.debug("waiting for spot request")
                    time.sleep(sleep_time)
                    sleep_time = min(30, sleep_time * 1.5)
                    continue
        except BotoServerError, e:
            if e.code == "RequestLimitExceeded":
                if i < max_tries - 1:
                    # Try again
                    log.debug("request limit exceeded; sleeping and trying again")
                    time.sleep(sleep_time)
                    sleep_time = min(30, sleep_time * 1.5)
                    continue
            # Any other server error (or retries exhausted): re-raise.
            raise
def aws_get_all_instances(regions):
    """ Returns a list of all instances in the given regions

    Results are memoized per region in _aws_instances_cache to avoid
    repeated API calls within a run.
    """
    log.debug("fetching all instances for %s", regions)
    retval = []
    for region in regions:
        if region in _aws_instances_cache:
            log.debug("aws_get_all_instances - cache hit for %s", region)
            retval.extend(_aws_instances_cache[region])
        else:
            conn = get_aws_connection(region)
            reservations = conn.get_all_instances()
            region_instances = []
            for r in reservations:
                region_instances.extend(r.instances)
            # BUG FIX: message referred to "aws_get_running_instances";
            # this function is aws_get_all_instances.
            log.debug("aws_get_all_instances - caching %s", region)
            _aws_instances_cache[region] = region_instances
            retval.extend(region_instances)
    return retval
def main(): from optparse import OptionParser parser = OptionParser() parser.add_option("-r", "--region", dest="region", help="region to use", default="us-east-1") options, args = parser.parse_args() if not args: parser.error("at least one instance name is required") hosts_re = [re.compile(x) for x in args] conn = get_aws_connection(options.region) res = conn.get_all_instances() if res: instances = reduce(lambda a, b: a + b, [r.instances for r in res]) for i in instances: for mask in hosts_re: hostname = i.tags.get('FQDN', i.tags.get('Name', '')) if mask.search(hostname) and i.private_ip_address: print i.private_ip_address, hostname
def main():
    """Refresh stored AMI status from all self-owned, available images."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    # BUG FIX: "Supress" -> "Suppress" in the help text
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Suppress logging messages")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if not args.quiet:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.ERROR)
    if not args.regions:
        args.regions = DEFAULT_REGIONS
    images = []
    for region in args.regions:
        conn = get_aws_connection(region)
        images.extend(conn.get_all_images(owners=["self"],
                                          filters={"state": "available"}))
    update_ami_status(amis_to_dict(images))
def do_request_spot_instance(region, secrets, moz_instance_type, price, ami,
                             instance_config, cached_cert_dir, instance_type,
                             availability_zone, slaveset, active_network_ids,
                             dryrun):
    # Request one spot instance bound to a pre-provisioned network
    # interface.  Builds user-data (puppet certs + optional lvm-init
    # config) and a block device mapping from instance_config, then
    # submits the request and tags it.  Returns True on success / dry run,
    # False when no interface or FQDN is available.
    conn = get_aws_connection(region)
    interface = get_available_interface(
        conn=conn, moz_instance_type=moz_instance_type,
        availability_zone=availability_zone,
        slaveset=slaveset,
        active_network_ids=active_network_ids)
    if not interface:
        log.warn("No free network interfaces left in %s" % region)
        return False

    # TODO: check DNS
    fqdn = interface.tags.get("FQDN")
    if not fqdn:
        log.warn("interface %s has no FQDN", interface)
        return False

    log.debug("Spot request for %s (%s)", fqdn, price)

    if dryrun:
        log.info("Dry run. skipping")
        return True

    spec = NetworkInterfaceSpecification(
        network_interface_id=interface.id)
    nc = NetworkInterfaceCollection(spec)
    ip = interface.private_ip_address
    certs = get_puppet_certs(ip, secrets, cached_cert_dir)
    # Shell fragment run at boot: installs the puppet certs for this FQDN.
    user_data = """
FQDN="%(fqdn)s"
cd /var/lib/puppet/ssl || exit 1
%(certs)s
cd -
""" % dict(fqdn=fqdn, certs=certs)
    if instance_config[region].get("lvm"):
        # Drop the region's instance config where /sbin/lvm-init expects it.
        user_data += """
mkdir -p /etc/lvm-init/
cat <<EOF > /etc/lvm-init/lvm-init.json
%s
EOF
/sbin/lvm-init
""" % json.dumps(instance_config[region])

    bdm = BlockDeviceMapping()
    for device, device_info in instance_config[region]['device_map'].items():
        bd = BlockDeviceType()
        if device_info.get('size'):
            bd.size = device_info['size']
        if ami.root_device_name == device:
            ami_size = ami.block_device_mapping[device].size
            if ami.virtualization_type == "hvm":
                # Overwrite root device size for HVM instances, since they
                # cannot be resized online
                bd.size = ami_size
            elif device_info.get('size'):
                # make sure that size is enough for this AMI
                assert ami_size <= device_info['size'], \
                    "Instance root device size cannot be smaller than AMI " \
                    "root device"
        if device_info.get("delete_on_termination") is not False:
            bd.delete_on_termination = True
        if device_info.get("ephemeral_name"):
            bd.ephemeral_name = device_info["ephemeral_name"]

        bdm[device] = bd

    sir = conn.request_spot_instances(
        price=str(price),
        image_id=ami.id,
        count=1,
        instance_type=instance_type,
        key_name=instance_config[region]["ssh_key"],
        user_data=user_data,
        block_device_map=bdm,
        network_interfaces=nc,
        instance_profile_name=instance_config[region].get("instance_profile_name"),
    )
    sir[0].add_tag("moz-type", moz_instance_type)
    return True
def main():
    """Apply a lifecycle action (stop/start/restart/enable/disable/
    terminate/status) to the named instances across the given regions."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="regions", action="append",
                        help="optional list of regions")
    parser.add_argument("action", choices=[
        "stop", "start", "restart", "enable", "disable", "terminate",
        "status"
    ], help="action to be performed")
    parser.add_argument("-m", "--comments", help="reason to disable")
    parser.add_argument("-n", "--dry-run", action="store_true",
                        help="Dry run mode")
    # BUG FIX: "Supress" -> "Suppress" in the help text
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Suppress logging messages")
    parser.add_argument("hosts", metavar="host", nargs="+",
                        help="hosts to be processed")
    parser.add_argument("-f", "--force", action="store_true",
                        help="Force action without prompting")
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if not args.quiet:
        log.setLevel(logging.INFO)
    else:
        log.setLevel(logging.ERROR)
    if not args.regions:
        args.regions = DEFAULT_REGIONS
    for region in args.regions:
        conn = get_aws_connection(region)
        instances = conn.get_only_instances()
        for i in instances:
            name = i.tags.get('Name', '')
            instance_id = i.id
            if not i.private_ip_address:
                # Terminated instances have no IP address assigned
                log.debug("Skipping (terminated?) %s (%s)..." %
                          (name, instance_id))
                continue
            if name in args.hosts or instance_id in args.hosts:
                log.info("Found %s (%s)..." % (name, instance_id))
                if args.action == "start":
                    start(i, args.dry_run)
                elif args.action == "stop":
                    stop(i, args.dry_run)
                elif args.action == "restart":
                    restart(i, args.dry_run)
                elif args.action == "enable":
                    enable(i, args.dry_run)
                elif args.action == "disable":
                    disable(i, args.dry_run, args.comments)
                elif args.action == "terminate":
                    terminate(i, args.dry_run, args.force)
                elif args.action == "status":
                    status(i)
session.commit() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-r", "--region", dest="regions", action="append", help="optional list of regions") parser.add_argument("-q", "--quiet", action="store_true", help="Supress logging messages") parser.add_argument("-d", "--db", default="spots.db") args = parser.parse_args() logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") if not args.quiet: log.setLevel(logging.DEBUG) else: log.setLevel(logging.WARNING) engine = create_engine('sqlite:///%s' % args.db) Base.metadata.create_all(bind=engine) Session = sessionmaker(bind=engine) session = Session() if not args.regions: args.regions = DEFAULT_REGIONS for region in args.regions: conn = get_aws_connection(region) update_spot_stats(conn, session) cancel_low_price(conn)
def request_spot_instances(all_instances, moz_instance_type, start_count,
                           regions, region_priorities, spot_config, dryrun,
                           slaveset, latest_ami_percentage):
    # Request up to start_count spot instances of moz_instance_type across
    # regions, respecting per-region limits and splitting launches between
    # the latest and previous AMI according to latest_ami_percentage.
    # Returns the number of instances actually started.
    started = 0
    spot_rules = spot_config.get("rules", {}).get(moz_instance_type)
    if not spot_rules:
        log.warn("No spot rules found for %s", moz_instance_type)
        return 0

    instance_config = load_instance_config(moz_instance_type)
    connections = [get_aws_connection(r) for r in regions]
    spot_choices = get_spot_choices(connections, spot_rules,
                                    "Linux/UNIX (Amazon VPC)")
    if not spot_choices:
        log.warn("No spot choices for %s", moz_instance_type)
        return 0

    to_start = defaultdict(list)
    active_instance_ids = set(i.id for i in all_instances)

    # count the number of instances for each image id
    ami_distribution = defaultdict(int)
    for instance in all_instances:
        ami_distribution[instance.image_id] += 1

    for region in regions:
        # Check if spots are enabled in this region for this type
        region_limit = spot_config.get("limits", {}).get(region, {}).get(
            moz_instance_type)
        if not region_limit:
            log.debug("No spot limits defined for %s in %s, skipping...",
                      moz_instance_type, region)
            continue

        # check the limits
        active_requests = get_spot_requests_for_moztype(
            region=region, moz_instance_type=moz_instance_type)
        log.debug("%i active spot requests for %s %s", len(active_requests),
                  region, moz_instance_type)
        # Filter out requests for instances that don't exist
        active_requests = [
            r for r in active_requests
            if r.instance_id is not None and
            r.instance_id in active_instance_ids
        ]
        log.debug("%i real active spot requests for %s %s",
                  len(active_requests), region, moz_instance_type)
        active_count = len(active_requests)
        can_be_started = region_limit - active_count
        if can_be_started < 1:
            log.debug("Not starting. Active spot request count in %s region "
                      "hit limit of %s. Active count: %s", region,
                      region_limit, active_count)
            continue

        to_be_started_latest = min(can_be_started, start_count - started)
        spot_amis = get_spot_amis(region=region,
                                  tags={"moz-type": moz_instance_type})
        ami_latest = spot_amis[-1]
        if len(spot_amis) > 1 and latest_ami_percentage < 100:
            # get the total number of running instances with both the latest
            # and previous ami types, so that we can decide how many of each
            # type to launch.
            ami_prev = spot_amis[-2]
            prev_ami_count = ami_distribution[ami_prev.id]
            latest_ami_count = ami_distribution[ami_latest.id]
            ami_prev_to_start, ami_latest_to_start = \
                find_prev_latest_amis_needed(latest_ami_percentage,
                                             prev_ami_count,
                                             latest_ami_count,
                                             to_be_started_latest)
            to_start[region].append({
                "ami": ami_prev,
                "instances": ami_prev_to_start
            })
            to_start[region].append({
                "ami": ami_latest,
                "instances": ami_latest_to_start
            })
        else:
            to_start[region].append({
                "ami": ami_latest,
                "instances": to_be_started_latest
            })

    if not to_start:
        log.debug("Nothing to start for %s", moz_instance_type)
        return 0

    # Walk the (price-ordered) spot choices and launch what each region
    # still needs until start_count is satisfied.
    for choice in spot_choices:
        region = choice.region
        if region not in to_start:
            log.debug("Skipping %s for %s", choice, region)
            continue
        if not usable_spot_choice(choice):
            log.debug("Skipping %s for %s - unusable", choice, region)
            continue
        for to_start_entry in to_start[region]:
            need = min(to_start_entry["instances"], start_count - started)
            if need > 0:
                log.debug("Need %s of %s in %s", need, moz_instance_type,
                          choice.availability_zone)
                log.debug("Using %s", choice)
                launched = do_request_spot_instances(
                    amount=need,
                    region=region,
                    moz_instance_type=moz_instance_type,
                    ami=to_start_entry["ami"],
                    instance_config=instance_config,
                    dryrun=dryrun,
                    spot_choice=choice,
                    slaveset=slaveset,
                    all_instances=all_instances,
                )
                started += launched

        if started >= start_count:
            break

    return started
def get_all_spot_requests(region):
    """Fetch every spot instance request present in *region*."""
    log.info("getting all spot requests for %s", region)
    connection = get_aws_connection(region)
    return connection.get_all_spot_instance_requests()
def request_spot_instances(all_instances, moz_instance_type, start_count,
                           regions, region_priorities, spot_config, dryrun,
                           latest_ami_percentage):
    # Request up to start_count spot instances of moz_instance_type across
    # regions, respecting per-region limits and splitting launches between
    # the latest and previous AMI according to latest_ami_percentage.
    # Returns the number of instances actually started.
    started = 0
    spot_rules = spot_config.get("rules", {}).get(moz_instance_type)
    if not spot_rules:
        log.warn("No spot rules found for %s", moz_instance_type)
        return 0

    instance_config = load_instance_config(moz_instance_type)
    connections = [get_aws_connection(r) for r in regions]
    product_description = get_product_description(moz_instance_type)
    spot_choices = get_spot_choices(connections, spot_rules,
                                    product_description)
    if not spot_choices:
        log.warn("No spot choices for %s", moz_instance_type)
        log.warn("%s: market price too expensive in all available regions; "
                 "spot instances needed: %i", moz_instance_type, start_count)
        return 0

    to_start = defaultdict(list)
    active_instance_ids = set(i.id for i in all_instances)

    # count the number of instances for each image id
    ami_distribution = defaultdict(int)
    for instance in all_instances:
        ami_distribution[instance.image_id] += 1

    for region in regions:
        # Check if spots are enabled in this region for this type
        region_limit = spot_config.get("limits", {}).get(region, {}).get(
            moz_instance_type)
        if not region_limit:
            log.debug("No spot limits defined for %s in %s, skipping...",
                      moz_instance_type, region)
            continue

        # check the limits
        active_requests = get_spot_requests_for_moztype(
            region=region, moz_instance_type=moz_instance_type)
        log.debug("%i active spot requests for %s %s", len(active_requests),
                  region, moz_instance_type)
        # Filter out requests for instances that don't exist
        active_requests = [r for r in active_requests
                           if r.instance_id is not None and
                           r.instance_id in active_instance_ids]
        log.debug("%i real active spot requests for %s %s",
                  len(active_requests), region, moz_instance_type)
        active_count = len(active_requests)
        can_be_started = region_limit - active_count
        if can_be_started < 1:
            log.debug("Not starting. Active spot request count in %s region "
                      "hit limit of %s. Active count: %s", region,
                      region_limit, active_count)
            continue

        to_be_started_latest = min(can_be_started, start_count - started)
        spot_amis = get_spot_amis(region=region,
                                  tags={"moz-type": moz_instance_type})
        ami_latest = spot_amis[-1]
        if len(spot_amis) > 1 and latest_ami_percentage < 100:
            # get the total number of running instances with both the latest
            # and previous ami types, so that we can decide how many of each
            # type to launch.
            ami_prev = spot_amis[-2]
            prev_ami_count = ami_distribution[ami_prev.id]
            latest_ami_count = ami_distribution[ami_latest.id]
            ami_prev_to_start, ami_latest_to_start = \
                find_prev_latest_amis_needed(latest_ami_percentage,
                                             prev_ami_count,
                                             latest_ami_count,
                                             to_be_started_latest)
            to_start[region].append({"ami": ami_prev,
                                     "instances": ami_prev_to_start})
            to_start[region].append({"ami": ami_latest,
                                     "instances": ami_latest_to_start})
        else:
            to_start[region].append({"ami": ami_latest,
                                     "instances": to_be_started_latest})

    if not to_start:
        log.debug("Nothing to start for %s", moz_instance_type)
        return 0

    # Walk the (price-ordered) spot choices and launch what each region
    # still needs until start_count is satisfied.
    for choice in spot_choices:
        region = choice.region
        if region not in to_start:
            log.debug("Skipping %s for %s", choice, region)
            continue
        if not usable_spot_choice(choice):
            log.debug("Skipping %s for %s - unusable", choice, region)
            continue
        for to_start_entry in to_start[region]:
            need = min(to_start_entry["instances"], start_count - started)
            if need > 0:
                log.debug("Need %s of %s in %s", need, moz_instance_type,
                          choice.availability_zone)
                log.debug("Using %s", choice)
                launched = do_request_spot_instances(
                    amount=need,
                    region=region,
                    moz_instance_type=moz_instance_type,
                    ami=to_start_entry["ami"],
                    instance_config=instance_config,
                    dryrun=dryrun,
                    spot_choice=choice,
                    all_instances=all_instances,
                )
                started += launched

        if started >= start_count:
            break

    return started
def main():
    """Clone an existing tagged instance's root volume into a new spot AMI.

    Flow: pick the newest non-loaned instance matching the moz-type tags
    (preferring stopped over running), snapshot its root volume, attach a
    volume built from that snapshot to a temporary helper instance, scrub
    secrets/state from the filesystem over SSH (fabric), snapshot the
    cleaned volume, register it as an AMI, tag it, and clean up.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--region", dest="region", required=True,
                        help="Region")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Supress logging messages")
    parser.add_argument("-c", "--ami-config", required=True, help="AMI config")
    parser.add_argument("-i", "--instance-config", required=True,
                        help="Instance config")
    parser.add_argument("--ssh-key", required=True, help="SSH key name")
    parser.add_argument("--user", help="Login name")
    parser.add_argument("--public", action="store_true", default=False,
                        help="Generate a public AMI (no secrets)")
    args = parser.parse_args()

    # Both config files are keyed by region; a missing region surfaces as
    # KeyError and is reported as an unknown configuration.
    try:
        ami_config = json.load(
            open("%s/%s.json" % (AMI_CONFIGS_DIR, args.ami_config))
        )[args.region]
        moz_type_config = json.load(
            open("%s/%s" % (INSTANCE_CONFIGS_DIR, args.instance_config))
        )[args.region]
    except KeyError:
        parser.error("unknown configuration")

    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    if not args.quiet:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.ERROR)

    conn = get_aws_connection(args.region)
    # Timestamped name used for the snapshot, AMI name and Name tags.
    dated_target_name = "spot-%s-%s" % (
        args.ami_config, time.strftime("%Y-%m-%d-%H-%M", time.gmtime()))
    filters = {
        "tag:moz-state": "ready",
        "instance-state-name": "stopped"
    }
    for tag, value in moz_type_config["tags"].iteritems():
        filters["tag:%s" % tag] = value

    # Prefer a stopped instance as the clone source; fall back to a running
    # one (in which case buildbot state is scrubbed more aggressively below).
    using_stopped_instance = True
    res = conn.get_all_instances(filters=filters)
    if not res:
        filters["instance-state-name"] = "running"
        res = conn.get_all_instances(filters=filters)
        using_stopped_instance = False
    instances = reduce(lambda a, b: a + b, [r.instances for r in res])
    # skip loaned instances
    instances = [i for i in instances if not i.tags.get("moz-loaned-to")]
    # Most recently launched candidate wins.
    i = sorted(instances, key=lambda i: i.launch_time)[-1]
    log.debug("Selected instance to clone: %s", i)
    v_id = i.block_device_mapping[i.root_device_name].volume_id
    v = conn.get_all_volumes(volume_ids=[v_id])[0]
    snap1 = v.create_snapshot("temporary snapshot of %s" % v_id)
    wait_for_status(snap1, "status", "completed", "update")

    # Temporary helper instance that will mount and scrub the cloned volume.
    host_instance = run_instance(
        connection=conn, instance_name="tmp", config=ami_config,
        key_name=args.ssh_key, user=args.user,
        subnet_id=random.choice(moz_type_config["subnet_ids"]))

    # fabric global state for the run()/put() calls below.
    env.host_string = host_instance.private_ip_address
    env.user = '******'
    env.abort_on_prompts = True
    env.disable_known_hosts = True
    int_dev_name = ami_config['target']['int_dev_name']
    mount_dev = int_dev_name
    mount_point = ami_config['target']['mount_point']
    virtualization_type = ami_config.get("virtualization_type")
    if virtualization_type == "hvm":
        # HVM volumes are partitioned; mount the first partition.
        mount_dev = "%s1" % mount_dev
    tmp_v = conn.create_volume(size=snap1.volume_size,
                               zone=host_instance.placement,
                               snapshot=snap1)
    wait_for_status(tmp_v, "status", "available", "update")
    # Attach can fail transiently right after the instance comes up; retry
    # forever. NOTE(review): bare except also swallows KeyboardInterrupt.
    while True:
        try:
            tmp_v.attach(host_instance.id,
                         ami_config['target']['aws_dev_name'])
            break
        except:
            log.debug('hit error waiting for volume to be attached')
            time.sleep(10)

    # Wait until the kernel on the helper instance actually sees the device.
    while True:
        try:
            tmp_v.update()
            if tmp_v.status == 'in-use':
                if run('ls %s' % int_dev_name).succeeded:
                    break
        except:
            log.debug('hit error waiting for volume to be attached')
            time.sleep(10)

    run('mkdir -p %s' % mount_point)
    run('mount {dev} {mount_point}'.format(dev=mount_dev,
                                           mount_point=mount_point))
    # Scrub one-shot setup artifacts, puppet certs and (optionally) secrets
    # from the mounted filesystem before snapshotting.
    with cd(mount_point):
        run("rm -f root/*.sh")
        run("rm -f root/*.log")
        run("rm -f root/userdata")
        run("rm -f root/*.done")
        run("rm -f etc/spot_setup.done")
        run("rm -f var/lib/puppet/ssl/private_keys/*")
        run("rm -f var/lib/puppet/ssl/certs/*")
        if not using_stopped_instance or args.public:
            run("rm -rf builds/slave")
        else:
            run("rm -f builds/slave/buildbot.tac")
        run("echo localhost > etc/hostname")
        run("sed -i -e 's/127.0.0.1.*/127.0.0.1 localhost/g' etc/hosts")
        if args.public:
            # put rc.local
            put("%s/%s/etc/rc.local" % (AMI_CONFIGS_DIR, args.ami_config),
                "etc/rc.local", mirror_local_mode=True)
            run("rm -rf home/cltbld/.ssh")
            run("rm -rf root/.ssh/*")
            run("rm -rf builds/gapi.data")
            run("rm -rf builds/mock_mozilla/*/root/home/mock_mozilla")
        else:
            put("%s/spot_setup.sh" % AMI_CONFIGS_DIR,
                "etc/spot_setup.sh", mirror_local_mode=True)
            # replace puppet init with our script
            if ami_config["distro"] == "ubuntu":
                put("%s/spot_setup.conf" % AMI_CONFIGS_DIR,
                    "etc/init/puppet.conf", mirror_local_mode=True)
            else:
                run("echo '/etc/spot_setup.sh' > etc/init.d/puppet")

    # create snapshot2
    # NOTE(review): the volume is never unmounted before the helper instance
    # is terminated; termination force-detaches it — confirm this is intended.
    log.info('Terminating %s', host_instance)
    host_instance.terminate()
    wait_for_status(tmp_v, "status", "available", "update")
    log.info('Creating a snapshot')
    snap2 = tmp_v.create_snapshot(dated_target_name)
    wait_for_status(snap2, "status", "completed", "update")
    snap2.add_tag("Name", dated_target_name)

    bdm = BlockDeviceMapping()
    bdm[i.root_device_name] = BlockDeviceType(snapshot_id=snap2.id)

    log.info('Creating AMI')
    if virtualization_type == "hvm":
        # HVM images must not specify a kernel (AKI).
        kernel_id = None
    else:
        kernel_id = i.kernel
    ami_id = conn.register_image(
        dated_target_name,
        dated_target_name,
        architecture=ami_config["arch"],
        kernel_id=kernel_id,
        root_device_name=i.root_device_name,
        block_device_map=bdm,
        virtualization_type=virtualization_type,
    )
    log.info('Waiting...')
    # Freshly registered AMIs are not immediately describable/taggable;
    # retry until tagging succeeds.
    while True:
        try:
            ami = conn.get_image(ami_id)
            ami.add_tag('Name', dated_target_name)
            ami.add_tag('moz-created', int(time.mktime(time.gmtime())))
            for tag, value in moz_type_config["tags"].iteritems():
                ami.add_tag(tag, value)
            log.info('AMI created')
            log.info('ID: {id}, name: {name}'.format(id=ami.id,
                                                     name=ami.name))
            break
        except:
            log.info('Wating for AMI')
            time.sleep(10)
    # Step 7: Cleanup
    log.info('Cleanup...')
    tmp_v.delete()
    snap1.delete()
parser = argparse.ArgumentParser() parser.add_argument("-c", "--config", required=True, type=argparse.FileType('r'), help="instance configuration to use") parser.add_argument("-r", "--region", help="region to use", default="us-east-1") parser.add_argument("-n", "--number", type=int, required=True, help="How many IPs you need") args = parser.parse_args() try: config = json.load(args.config)[args.region] except KeyError: parser.error("unknown configuration") conn = get_aws_connection(args.region) vpc = get_vpc(args.region) interfaces = vpc.get_all_network_interfaces() used_ips = [i.private_ip_address for i in interfaces] subnets = vpc.get_all_subnets(subnet_ids=config["subnet_ids"]) blocks = [s.cidr_block for s in subnets] available_ips = [] for b in blocks: # skip first 5 IPs (they are sometimes "reserved") and the last one # (broadcast) for ip in list(IP(b))[4:-1]: if str(ip) not in used_ips: available_ips.append(ip)
parser.add_option("-s", "--key-name", dest="key_name", help="SSH key name") parser.add_option('--keep-volume', dest='keep_volume', action='store_true', help="Don't delete target volume") parser.add_option('--keep-host-instance', dest='keep_host_instance', action='store_true', help="Don't delete host instance") parser.add_option('--user', dest='user', default='root') options, args = parser.parse_args() logging.basicConfig(level=logging.INFO) if not args: parser.error("at least one instance name is required") if not options.config: parser.error("config name is required") if not options.key_name: parser.error("SSH key name name is required") try: config = json.load(open("%s/%s.json" % (AMI_CONFIGS_DIR, options.config)))[options.region] except KeyError: parser.error("unknown configuration") connection = get_aws_connection(options.region) host_instance = run_instance(connection, args[0], config, options.key_name, options.user) create_ami(host_instance, options, config)
def create_instance(name, config, region, key_name, ssh_key, instance_data,
                    deploypass, loaned_to, loan_bug, create_ami,
                    ignore_subnet_check, max_attempts):
    """Creates an AMI instance with the given name and config. The config must
    specify things like ami id.

    Launches the instance (retrying up to *max_attempts* times on boto
    server errors), tags it, assimilates it (puppetize/configure over SSH,
    again retrying), marks it moz-state=ready, and — when *create_ami* is
    true — stops it, builds an AMI from its root volume and terminates it.
    """
    conn = get_aws_connection(region)
    # Make sure we don't request the same things twice
    token = str(uuid.uuid4())[:16]

    instance_data = instance_data.copy()
    instance_data['name'] = name
    instance_data['domain'] = config['domain']
    instance_data['hostname'] = '{name}.{domain}'.format(
        name=name, domain=config['domain'])

    ami = conn.get_all_images(image_ids=[config["ami"]])[0]
    bdm = None
    if 'device_map' in config:
        # Translate the config's device map into a boto BlockDeviceMapping.
        bdm = BlockDeviceMapping()
        for device, device_info in config['device_map'].items():
            bd = BlockDeviceType()
            if device_info.get('size'):
                bd.size = device_info['size']
            # Overwrite root device size for HVM instances, since they cannot
            # be resized online
            if ami.virtualization_type == "hvm" and \
                    ami.root_device_name == device:
                bd.size = ami.block_device_mapping[ami.root_device_name].size
            # Default delete_on_termination to True unless explicitly False.
            if device_info.get("delete_on_termination") is not False:
                bd.delete_on_termination = True
            if device_info.get("ephemeral_name"):
                bd.ephemeral_name = device_info["ephemeral_name"]
            if device_info.get("volume_type"):
                bd.volume_type = device_info["volume_type"]
                # iops only applies to provisioned-IOPS (io1) volumes.
                if device_info["volume_type"] == "io1" \
                        and device_info.get("iops"):
                    bd.iops = device_info["iops"]
            bdm[device] = bd

    interfaces = make_instance_interfaces(
        region, instance_data['hostname'], ignore_subnet_check,
        config.get('subnet_ids'), config.get('security_group_ids', []),
        config.get("use_public_ip"))

    # Launch retry loop; max_attempts of 0/None means retry forever.
    keep_going, attempt = True, 1
    while keep_going:
        try:
            puppet_master = pick_puppet_master(
                instance_data.get('puppet_masters'))
            user_data = user_data_from_template(config['type'], {
                "puppet_server": puppet_master,
                "fqdn": instance_data['hostname'],
                "hostname": instance_data['name'],
                "domain": instance_data['domain'],
                "dns_search_domain": config.get('dns_search_domain'),
                "password": deploypass,
                "moz_instance_type": config['type'],
                "region_dns_atom": get_region_dns_atom(region)})

            reservation = conn.run_instances(
                image_id=config['ami'],
                key_name=key_name,
                instance_type=config['instance_type'],
                block_device_map=bdm,
                client_token=token,
                disable_api_termination=config.get('disable_api_termination'),
                user_data=user_data,
                instance_profile_name=config.get('instance_profile_name'),
                network_interfaces=interfaces,
            )
            break
        except boto.exception.BotoServerError:
            log.exception("Cannot start an instance")
            time.sleep(10)
        if max_attempts:
            attempt += 1
            keep_going = max_attempts >= attempt

    # NOTE(review): if every attempt fails, `reservation` is unbound here and
    # this raises NameError — confirm whether callers rely on that.
    instance = reservation.instances[0]
    log.info("instance %s created, waiting to come up", instance)
    # Wait for the instance to come up
    wait_for_status(instance, "state", "running", "update")
    instance.add_tag('Name', name)
    instance.add_tag('FQDN', instance_data['hostname'])
    instance.add_tag('created', time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                              time.gmtime()))
    instance.add_tag('moz-type', config['type'])
    if loaned_to:
        instance.add_tag("moz-loaned-to", loaned_to)
    if loan_bug:
        instance.add_tag("moz-bug", loan_bug)

    log.info("assimilating %s", instance)
    instance.add_tag('moz-state', 'pending')

    # Assimilation retry loop, same attempt accounting as the launch loop.
    keep_going, attempt = True, 1
    while keep_going:
        try:
            # Don't reboot if need to create ami
            reboot = not create_ami
            assimilate_instance(instance=instance, config=config,
                                ssh_key=ssh_key, instance_data=instance_data,
                                deploypass=deploypass, reboot=reboot)
            break
        except NetworkError as e:
            # it takes a while for the machine to start/reboot so the
            # NetworkError exception is quite common, just log the error,
            # without the full stack trace
            log.warn("cannot connect; instance may still be starting "
                     "%s (%s, %s) - %s,"
                     "retrying in %d sec ...", instance_data['hostname'],
                     instance.id, instance.private_ip_address, e,
                     FAILURE_TIMEOUT)
            time.sleep(FAILURE_TIMEOUT)
        except:
            # any other exception
            log.warn("problem assimilating %s (%s, %s), retrying in "
                     "%d sec ...", instance_data['hostname'], instance.id,
                     instance.private_ip_address, FAILURE_TIMEOUT,
                     exc_info=True)
            time.sleep(FAILURE_TIMEOUT)
        if max_attempts:
            attempt += 1
            keep_going = max_attempts >= attempt

    instance.add_tag('moz-state', 'ready')

    if create_ami:
        ami_name = "spot-%s-%s" % (
            config['type'], time.strftime("%Y-%m-%d-%H-%M", time.gmtime()))
        log.info("Generating AMI %s", ami_name)
        ami_cleanup(mount_point="/", distro=config["distro"])
        root_bd = instance.block_device_mapping[instance.root_device_name]
        volume = instance.connection.get_all_volumes(
            volume_ids=[root_bd.volume_id])[0]
        # The instance has to be stopped to flush EBS caches
        instance.stop()
        wait_for_status(instance, 'state', 'stopped', 'update')
        ami = volume_to_ami(volume=volume, ami_name=ami_name,
                            arch=instance.architecture,
                            virtualization_type=instance.virtualization_type,
                            kernel_id=instance.kernel,
                            root_device_name=instance.root_device_name,
                            tags=config["tags"])
        log.info("AMI %s (%s) is ready", ami_name, ami.id)
        log.warn("Terminating %s", instance)
        instance.terminate()
def create_instance(name, config, region, key_name, ssh_key, instance_data,
                    deploypass, loaned_to, loan_bug, create_ami,
                    ignore_subnet_check, max_attempts):
    """Creates an AMI instance with the given name and config. The config must
    specify things like ami id.

    Variant that builds user data from config['user_data_file'] (or the
    per-type template) via str.format. Launches with retries, tags,
    assimilates with retries, marks moz-state=ready, and optionally turns
    the stopped instance's root volume into an AMI before terminating it.
    """
    conn = get_aws_connection(region)
    # Make sure we don't request the same things twice
    token = str(uuid.uuid4())[:16]

    instance_data = instance_data.copy()
    instance_data['name'] = name
    instance_data['domain'] = config['domain']
    instance_data['hostname'] = '{name}.{domain}'.format(
        name=name, domain=config['domain'])

    ami = conn.get_all_images(image_ids=[config["ami"]])[0]
    bdm = None
    if 'device_map' in config:
        # Translate the config's device map into a boto BlockDeviceMapping.
        bdm = BlockDeviceMapping()
        for device, device_info in config['device_map'].items():
            bd = BlockDeviceType()
            if device_info.get('size'):
                bd.size = device_info['size']
            # Overwrite root device size for HVM instances, since they cannot
            # be resized online
            if ami.virtualization_type == "hvm" and \
                    ami.root_device_name == device:
                bd.size = ami.block_device_mapping[ami.root_device_name].size
            # Default delete_on_termination to True unless explicitly False.
            if device_info.get("delete_on_termination") is not False:
                bd.delete_on_termination = True
            if device_info.get("ephemeral_name"):
                bd.ephemeral_name = device_info["ephemeral_name"]
            bdm[device] = bd

    interfaces = make_instance_interfaces(region, instance_data['hostname'],
                                          ignore_subnet_check,
                                          config.get('subnet_ids'),
                                          config.get('security_group_ids',
                                                     []),
                                          config.get("use_public_ip"))

    # Launch retry loop; max_attempts of 0/None means retry forever.
    keep_going, attempt = True, 1
    while keep_going:
        try:
            if 'user_data_file' in config:
                user_data = open(config['user_data_file']).read()
            else:
                user_data = get_user_data_tmpl(config['type'])
            if user_data:
                user_data = user_data.format(
                    puppet_server=instance_data.get('default_puppet_server'),
                    fqdn=instance_data['hostname'],
                    hostname=instance_data['name'],
                    domain=instance_data['domain'],
                    dns_search_domain=config.get('dns_search_domain'),
                    password=deploypass,
                    moz_instance_type=config['type'],
                    region_dns_atom=get_region_dns_atom(region),
                )

            reservation = conn.run_instances(
                image_id=config['ami'],
                key_name=key_name,
                instance_type=config['instance_type'],
                block_device_map=bdm,
                client_token=token,
                disable_api_termination=config.get('disable_api_termination'),
                user_data=user_data,
                instance_profile_name=config.get('instance_profile_name'),
                network_interfaces=interfaces,
            )
            break
        except boto.exception.BotoServerError:
            log.exception("Cannot start an instance")
            time.sleep(10)
        if max_attempts:
            attempt += 1
            keep_going = max_attempts >= attempt

    # NOTE(review): if every attempt fails, `reservation` is unbound here and
    # this raises NameError — confirm whether callers rely on that.
    instance = reservation.instances[0]
    log.info("instance %s created, waiting to come up", instance)
    # Wait for the instance to come up
    wait_for_status(instance, "state", "running", "update")
    instance.add_tag('Name', name)
    instance.add_tag('FQDN', instance_data['hostname'])
    instance.add_tag('created', time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                              time.gmtime()))
    instance.add_tag('moz-type', config['type'])
    if loaned_to:
        instance.add_tag("moz-loaned-to", loaned_to)
    if loan_bug:
        instance.add_tag("moz-bug", loan_bug)

    log.info("assimilating %s", instance)
    instance.add_tag('moz-state', 'pending')

    # Assimilation retry loop, same attempt accounting as the launch loop.
    keep_going, attempt = True, 1
    while keep_going:
        try:
            # Don't reboot if need to create ami
            reboot = not create_ami
            assimilate_instance(instance=instance, config=config,
                                ssh_key=ssh_key, instance_data=instance_data,
                                deploypass=deploypass, reboot=reboot)
            break
        except NetworkError as e:
            # it takes a while for the machine to start/reboot so the
            # NetworkError exception is quite common, just log the error,
            # without the full stack trace
            log.warn(
                "cannot connect; instance may still be starting %s (%s, %s)"
                " - %s,"
                "retrying in %d sec ...",
                instance_data['hostname'], instance.id,
                instance.private_ip_address, e, FAILURE_TIMEOUT)
            time.sleep(FAILURE_TIMEOUT)
        except:
            # any other exception
            log.warn(
                "problem assimilating %s (%s, %s), retrying in "
                "%d sec ...", instance_data['hostname'], instance.id,
                instance.private_ip_address, FAILURE_TIMEOUT, exc_info=True)
            time.sleep(FAILURE_TIMEOUT)
        if max_attempts:
            attempt += 1
            keep_going = max_attempts >= attempt

    instance.add_tag('moz-state', 'ready')

    if create_ami:
        ami_name = "spot-%s-%s" % (
            config['type'], time.strftime("%Y-%m-%d-%H-%M", time.gmtime()))
        log.info("Generating AMI %s", ami_name)
        ami_cleanup(mount_point="/", distro=config["distro"])
        root_bd = instance.block_device_mapping[instance.root_device_name]
        volume = instance.connection.get_all_volumes(
            volume_ids=[root_bd.volume_id])[0]
        # The instance has to be stopped to flush EBS caches
        instance.stop()
        wait_for_status(instance, 'state', 'stopped', 'update')
        ami = volume_to_ami(volume=volume, ami_name=ami_name,
                            arch=instance.architecture,
                            virtualization_type=instance.virtualization_type,
                            kernel_id=instance.kernel,
                            root_device_name=instance.root_device_name,
                            tags=config["tags"])
        log.info("AMI %s (%s) is ready", ami_name, ami.id)
        log.warn("Terminating %s", instance)
        instance.terminate()
def aws_get_spot_requests(region, moz_instance_type): """retruns a list of all open and active spot requests""" conn = get_aws_connection(region) filters = {"tag:moz-type": moz_instance_type} req = conn.get_all_spot_instance_requests(filters=filters) return [r for r in req if r.state in ("open", "active")]
def create_instance(name, config, region, key_name, instance_data,
                    deploypass, loaned_to, loan_bug):
    """Creates an AMI instance with the given name and config. The config must
    specify things like ami id.

    Oldest variant: resolves the instance's IP from DNS and reuses it when
    it falls in a configured subnet and is free, otherwise launches on a
    random configured subnet with an auto-assigned IP. Retries forever on
    launch and assimilation failures.
    """
    conn = get_aws_connection(region)
    vpc = get_vpc(region)
    # Make sure we don't request the same things twice
    token = str(uuid.uuid4())[:16]

    instance_data = instance_data.copy()
    instance_data['name'] = name
    instance_data['hostname'] = '{name}.{domain}'.format(
        name=name, domain=config['domain'])

    ami = conn.get_all_images(image_ids=[config["ami"]])[0]
    bdm = None
    if 'device_map' in config:
        # Translate the config's device map into a boto BlockDeviceMapping.
        bdm = BlockDeviceMapping()
        for device, device_info in config['device_map'].items():
            bd = BlockDeviceType()
            if device_info.get('size'):
                bd.size = device_info['size']
            # Overwrite root device size for HVM instances, since they cannot
            # be resized online
            if ami.virtualization_type == "hvm" and \
                    ami.root_device_name == device:
                bd.size = ami.block_device_mapping[ami.root_device_name].size
            # Default delete_on_termination to True unless explicitly False.
            if device_info.get("delete_on_termination") is not False:
                bd.delete_on_termination = True
            if device_info.get("ephemeral_name"):
                bd.ephemeral_name = device_info["ephemeral_name"]
            bdm[device] = bd

    # Try to honor the DNS-assigned IP: only use it if it belongs to one of
    # the configured subnets and is not already taken.
    ip_address = get_ip(instance_data['hostname'])
    subnet_id = None

    if ip_address:
        s_id = get_subnet_id(vpc, ip_address)
        if s_id in config['subnet_ids']:
            if ip_available(conn, ip_address):
                subnet_id = s_id
            else:
                log.warning("%s already assigned" % ip_address)

    if not ip_address or not subnet_id:
        # Fall back to an auto-assigned IP on a random configured subnet.
        ip_address = None
        subnet_id = choice(config.get('subnet_ids'))
    interface = NetworkInterfaceSpecification(
        subnet_id=subnet_id, private_ip_address=ip_address,
        delete_on_termination=True,
        groups=config.get('security_group_ids', []),
        associate_public_ip_address=config.get("use_public_ip")
    )
    interfaces = NetworkInterfaceCollection(interface)

    # Retry forever on boto server errors; the client token keeps the
    # request idempotent across retries.
    while True:
        try:
            reservation = conn.run_instances(
                image_id=config['ami'],
                key_name=key_name,
                instance_type=config['instance_type'],
                block_device_map=bdm,
                client_token=token,
                disable_api_termination=bool(
                    config.get('disable_api_termination')),
                network_interfaces=interfaces,
                instance_profile_name=config.get("instance_profile_name"),
            )
            break
        except boto.exception.BotoServerError:
            log.exception("Cannot start an instance")
            time.sleep(10)

    instance = reservation.instances[0]
    log.info("instance %s created, waiting to come up", instance)
    # Wait for the instance to come up
    wait_for_status(instance, "state", "running", "update")
    instance.add_tag('Name', name)
    instance.add_tag('FQDN', instance_data['hostname'])
    instance.add_tag('created', time.strftime("%Y-%m-%d %H:%M:%S %Z",
                                              time.gmtime()))
    instance.add_tag('moz-type', config['type'])
    if loaned_to:
        instance.add_tag("moz-loaned-to", loaned_to)
    if loan_bug:
        instance.add_tag("moz-bug", loan_bug)

    log.info("assimilating %s", instance)
    instance.add_tag('moz-state', 'pending')
    # Assimilation (configuration over the network) also retries forever.
    while True:
        try:
            assimilate(instance.private_ip_address, config, instance_data,
                       deploypass)
            break
        except:
            log.warn("problem assimilating %s (%s), retrying in 10 sec ...",
                     instance_data['hostname'], instance.id)
            time.sleep(10)
    instance.add_tag('moz-state', 'ready')
def request_spot_instances(moz_instance_type, start_count, regions, secrets,
                           region_priorities, spot_config, dryrun,
                           cached_cert_dir, slaveset):
    """Request up to *start_count* spot instances of *moz_instance_type*
    across *regions*, honoring per-region limits from spot_config["limits"].

    Per-region usage is measured as the number of unique network interfaces
    referenced by open/active spot requests (duplicate requests for the
    same interface count once). Returns the number launched.

    NOTE(review): *region_priorities* is accepted but never read here —
    confirm whether callers still need to pass it.
    """
    started = 0
    spot_rules = spot_config.get("rules", {}).get(moz_instance_type)
    if not spot_rules:
        log.warn("No spot rules found for %s", moz_instance_type)
        return 0

    instance_config = json.load(open(os.path.join(INSTANCE_CONFIGS_DIR,
                                                  moz_instance_type)))
    connections = []
    for region in regions:
        conn = get_aws_connection(region)
        connections.append(conn)
    spot_choices = get_spot_choices(connections, spot_rules,
                                    "Linux/UNIX (Amazon VPC)")
    if not spot_choices:
        log.warn("No spot choices for %s", moz_instance_type)
        return 0

    # region -> {"ami": ..., "instances": count} still to be launched
    to_start = {}
    # region -> set of network interface ids already tied to spot requests
    active_network_ids = {}
    for region in regions:
        # Check if spots are enabled in this region for this type
        region_limit = spot_config.get("limits", {}).get(region, {}).get(
            moz_instance_type)
        if not region_limit:
            log.debug("No spot limits defined for %s in %s, skipping...",
                      moz_instance_type, region)
            continue

        # check the limits
        # Count how many unique network interfaces are active
        # Sometimes we have multiple requests for the same interface
        active_requests = aws_get_spot_requests(
            region=region, moz_instance_type=moz_instance_type)
        active_network_ids[region] = set(
            r.launch_specification.networkInterfaceId
            for r in active_requests)
        active_count = len(active_network_ids[region])
        log.debug("%s: %i running spot instances in %s", moz_instance_type,
                  active_count, region)
        can_be_started = region_limit - active_count
        if can_be_started < 1:
            log.debug("Not starting. Active spot request count in %s region "
                      "hit limit of %s. Active count: %s", region,
                      region_limit, active_count)
            continue

        to_be_started = min(can_be_started, start_count - started)
        ami = get_ami(region=region, moz_instance_type=moz_instance_type)
        to_start[region] = {"ami": ami, "instances": to_be_started}

    if not to_start:
        log.debug("Nothing to start for %s", moz_instance_type)
        return 0

    # Work through spot choices in the order returned by get_spot_choices,
    # skipping regions without capacity and unusable price points.
    for choice in spot_choices:
        region = choice.region
        if region not in to_start:
            log.debug("Skipping %s for %s", choice, region)
            continue
        if not usable_choice(choice):
            log.debug("Skipping %s for %s - unusable", choice, region)
            continue
        # Never request more than the remaining global budget.
        need = min(to_start[region]["instances"], start_count - started)
        log.debug("Need %s of %s in %s", need, moz_instance_type,
                  choice.availability_zone)
        log.debug("Using %s", choice)
        launched = do_request_spot_instances(
            amount=need, region=region, secrets=secrets,
            moz_instance_type=moz_instance_type,
            ami=to_start[region]["ami"],
            instance_config=instance_config, dryrun=dryrun,
            cached_cert_dir=cached_cert_dir,
            spot_choice=choice,
            slaveset=slaveset,
            active_network_ids=active_network_ids[region],
        )
        started += launched
        if started >= start_count:
            break

    return started
def aws_stop_idle(credentials, regions, masters_json, moz_types,
                  dryrun=False, concurrency=8):
    """Stop idle buildbot instances of the given moz-types.

    Collects candidate instances from *regions* (all regions when empty),
    shuffles them, then uses *concurrency* worker threads to call
    aws_safe_stop_instance() on each. Finally logs a per-moz-type count of
    the instances that were stopped.
    """
    if not regions:
        # Look at all regions
        log.debug("loading all regions")
        regions = [r.name for r in boto.ec2.regions()]

    # Minimum number of instances per moz-type to keep running.
    # NOTE(review): hard-coded to 0, so the keep-running filter below is
    # currently a no-op — confirm if intentional.
    min_running_by_type = 0

    all_instances = []
    impaired_ids = []

    for r in regions:
        log.debug("looking at region %s", r)
        conn = get_aws_connection(r)

        instances = get_buildbot_instances(conn, moz_types)
        # Impaired instances are passed down so the stop logic can treat
        # them specially.
        impaired = conn.get_all_instance_status(
            filters={'instance-status.status': 'impaired'})
        impaired_ids.extend(i.id for i in impaired)
        instances_by_type = {}
        for i in instances:
            # TODO: Check if launch_time is too old, and terminate the
            # instance if it is
            # NB can't turn this on until aws_create_instance is working
            # properly (with ssh keys)
            instances_by_type.setdefault(i.tags['moz-type'], []).append(i)

        # Make sure min_running_by_type are kept running
        for t in instances_by_type:
            to_remove = instances_by_type[t][:min_running_by_type]
            for i in to_remove:
                log.debug("%s - keep running (min %s instances of type %s)",
                          i.tags['Name'], min_running_by_type,
                          i.tags['moz-type'])
                instances.remove(i)

        all_instances.extend(instances)

    # Randomize so repeated runs don't always hammer the same instances
    # first.
    random.shuffle(all_instances)

    q = Queue()
    to_stop = Queue()

    def worker():
        # Drain the work queue; exit when it stays empty for 0.1s.
        while True:
            try:
                i = q.get(timeout=0.1)
            except Empty:
                return
            try:
                if aws_safe_stop_instance(i, impaired_ids, credentials,
                                          masters_json, dryrun=dryrun):
                    to_stop.put(i)
            except Exception:
                log.warning("%s - unable to stop" % i.tags.get('Name'),
                            exc_info=True)

    for i in all_instances:
        q.put(i)

    # Workaround for http://bugs.python.org/issue11108
    time.strptime("19000102030405", "%Y%m%d%H%M%S")
    threads = []
    for i in range(concurrency):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)

    # Join with a timeout so a Ctrl-C is noticed promptly.
    while threads:
        for t in threads[:]:
            try:
                if t.is_alive():
                    t.join(timeout=0.5)
                else:
                    t.join()
                    threads.remove(t)
            except KeyboardInterrupt:
                raise SystemExit(1)

    # Tally stopped instances per moz-type for the summary log.
    total_stopped = {}
    while not to_stop.empty():
        i = to_stop.get()
        if not dryrun:
            # Refresh tags/state from AWS before reporting.
            i.update()
        if 'moz-type' not in i.tags:
            log.info("%s - has no moz-type! (%s)" % (i.tags.get('Name'),
                                                     i.id))
        t = i.tags.get('moz-type', 'notype')
        if t not in total_stopped:
            total_stopped[t] = 0
        total_stopped[t] += 1

    for t, c in sorted(total_stopped.items()):
        log.info("%s - stopped %s", t, c)