def test_run_instances_block_device_mapping(self):
    """Verify run_instances serializes a BlockDeviceMapping into the
    indexed ``BlockDeviceMapping.N.*`` EC2 request parameters.
    """
    # Same as the test in ``unit/ec2/autoscale/test_group.py:TestLaunchConfiguration``,
    # but with modified request parameters (due to a mismatch between EC2 &
    # Autoscaling).
    self.set_http_response(status_code=200)
    dev_sdf = BlockDeviceType(snapshot_id='snap-12345')
    dev_sdg = BlockDeviceType(snapshot_id='snap-12346',
                              delete_on_termination=True,
                              encrypted=True)
    bdm = BlockDeviceMapping()
    # OrderedDict keeps insertion order deterministic so the indexed
    # request parameters (.1 / .2) are stable for the assertion below.
    bdm.update(OrderedDict((('/dev/sdf', dev_sdf), ('/dev/sdg', dev_sdg))))
    response = self.service_connection.run_instances(
        image_id='123456',
        instance_type='m1.large',
        security_groups=['group1', 'group2'],
        block_device_map=bdm
    )
    self.assert_request_parameters({
        'Action': 'RunInstances',
        'BlockDeviceMapping.1.DeviceName': '/dev/sdf',
        # DeleteOnTermination defaults to false when not set on the device.
        'BlockDeviceMapping.1.Ebs.DeleteOnTermination': 'false',
        'BlockDeviceMapping.1.Ebs.SnapshotId': 'snap-12345',
        'BlockDeviceMapping.2.DeviceName': '/dev/sdg',
        'BlockDeviceMapping.2.Ebs.DeleteOnTermination': 'true',
        'BlockDeviceMapping.2.Ebs.SnapshotId': 'snap-12346',
        'BlockDeviceMapping.2.Ebs.Encrypted': 'true',
        'ImageId': '123456',
        'InstanceType': 'm1.large',
        'MaxCount': 1,
        'MinCount': 1,
        'SecurityGroup.1': 'group1',
        'SecurityGroup.2': 'group2',
    }, ignore_params_values=[
        'Version', 'AWSAccessKeyId', 'SignatureMethod', 'SignatureVersion',
        'Timestamp'
    ])
class BlockDeviceMappingTests(unittest.TestCase):
    """Unit tests for BlockDeviceMapping's SAX parsing hooks
    (``startElement`` / ``endElement``).
    """

    def setUp(self):
        self.block_device_mapping = BlockDeviceMapping()

    def block_device_type_eq(self, b1, b2):
        """Field-by-field equality check for two BlockDeviceType objects.

        Returns False when either argument is not a BlockDeviceType.
        (Previously this path fell through and implicitly returned None;
        the explicit False keeps the return type consistently bool.)
        """
        if isinstance(b1, BlockDeviceType) and isinstance(b2, BlockDeviceType):
            return all([b1.connection == b2.connection,
                        b1.ephemeral_name == b2.ephemeral_name,
                        b1.no_device == b2.no_device,
                        b1.volume_id == b2.volume_id,
                        b1.snapshot_id == b2.snapshot_id,
                        b1.status == b2.status,
                        b1.attach_time == b2.attach_time,
                        b1.delete_on_termination == b2.delete_on_termination,
                        b1.size == b2.size,
                        b1.encrypted == b2.encrypted])
        return False

    def test_startElement_with_name_ebs_sets_and_returns_current_value(self):
        retval = self.block_device_mapping.startElement("ebs", None, None)
        assert self.block_device_type_eq(
            retval, BlockDeviceType(self.block_device_mapping))

    def test_startElement_with_name_virtualName_sets_and_returns_current_value(self):
        retval = self.block_device_mapping.startElement("virtualName", None, None)
        assert self.block_device_type_eq(
            retval, BlockDeviceType(self.block_device_mapping))

    def test_endElement_with_name_device_sets_current_name_dev_null(self):
        self.block_device_mapping.endElement("device", "/dev/null", None)
        self.assertEqual(self.block_device_mapping.current_name, "/dev/null")

    def test_endElement_with_name_device_sets_current_name(self):
        self.block_device_mapping.endElement("deviceName", "some device name", None)
        self.assertEqual(self.block_device_mapping.current_name, "some device name")

    def test_endElement_with_name_item_sets_current_name_key_to_current_value(self):
        # "item" closes one mapping entry: current_name -> current_value.
        self.block_device_mapping.current_name = "some name"
        self.block_device_mapping.current_value = "some value"
        self.block_device_mapping.endElement("item", "some item", None)
        self.assertEqual(self.block_device_mapping["some name"], "some value")
def launch_cluster(conn, opts, cluster_name):
    """Create security groups and launch (or resume) the Spark master and
    slave instances for *cluster_name*.

    Returns a tuple ``(master_nodes, slave_nodes)`` of boto instance lists.
    Exits the process on fatal errors (missing key pair/profile, AMI not
    found, pre-existing instances).

    Fixes applied in this revision:
      * In the master spot-cancellation path, the result of
        ``get_existing_cluster`` was unpacked as
        ``(master_nodes, master_nodes)`` and the running count summed
        ``len(master_nodes)`` twice — the slave list was discarded and
        masters double-counted.  Now unpacked as
        ``(master_nodes, slave_nodes)``.
      * The "Launched master" message printed ``zone`` (a leftover loop
        variable from the slave launch) instead of ``opts.zone``, the
        placement actually requested for the master.
    """
    # Remove known hosts to avoid "Offending key for IP ..." errors.
    known_hosts = os.environ['HOME'] + "/.ssh/known_hosts"
    if os.path.isfile(known_hosts):
        os.remove(known_hosts)
    if opts.key_pair is None:
        opts.key_pair = keypair()
        if opts.key_pair is None:
            print("ERROR: Must provide a key pair name (-k) to use on instances.",
                  file=sys.stderr)
            sys.exit(1)
    if opts.profile is None:
        opts.profile = profile()
        if opts.profile is None:
            print("ERROR: No profile found in current host. It be provided with -p option.",
                  file=sys.stderr)
            sys.exit(1)

    # Inject the local public key into root and ec2-user authorized_keys
    # via cloud-init user data.
    public_key = pub_key()
    user_data = Template("""#!/bin/bash
set -e -x
echo '$public_key' >> ~root/.ssh/authorized_keys
echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute(public_key=public_key)

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    # The security group of the host we are running on (instance metadata).
    security_group = os.popen("curl -s http://169.254.169.254/latest/meta-data/security-groups").read()
    sparknotebook_group = get_or_make_group(conn, security_group)
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=sparknotebook_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=sparknotebook_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Allow the notebook host to reach the cluster (idempotent: only add
    # the rules if no grant for the group exists yet).
    if not any(r for r in sparknotebook_group.rules for g in r.grants
               if master_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1",
                                      to_port="65535", src_group=master_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1",
                                      to_port="-1", src_group=master_group)

    if not any(r for r in sparknotebook_group.rules for g in r.grants
               if slave_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1",
                                      to_port="65535", src_group=slave_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1",
                                      to_port="-1", src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print(("ERROR: There are already instances running in " +
               "group %s or %s" % (master_group.name, slave_group.name)),
              file=sys.stderr)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print("Launching instances...")
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=sys.stderr)
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            best_price = find_best_price(conn, opts.instance_type, zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print(("Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" %
                   (opts.slaves, best_price, opts.slaves * best_price)),
                  file=sys.stderr)
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[slave_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
            slave_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted", file=sys.stderr)
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves, file=sys.stderr)
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            # Bare except is deliberate: also cancels on KeyboardInterrupt.
            print("Canceling spot instance requests", file=sys.stderr)
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=sys.stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id],
                    instance_type=opts.instance_type,
                    subnet_id=subnetId(),
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    user_data=user_data,
                    instance_profile_arn=opts.profile)
                slave_nodes += slave_res.instances
                print("Launched %d slaves in %s, regid = %s" %
                      (num_slaves_this_zone, zone, slave_res.id),
                      file=sys.stderr)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price is not None:
            best_price = find_best_price(conn, master_type, opts.zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print(("Requesting master as spot instances with price $%.3f/hour" % (best_price)),
                  file=sys.stderr)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[master_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
            master_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=opts.zone,
                count=1,
                key_name=opts.key_pair,
                instance_type=master_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids = [r.id for r in master_reqs]
            print("Waiting for spot instance to be granted", file=sys.stderr)
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids)
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    active_instance_ids = []
                    for i in my_req_ids:
                        if i in id_to_req and id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                    if len(active_instance_ids) == 1:
                        print("Master granted", file=sys.stderr)
                        reservations = conn.get_all_instances(active_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print("%d of %d masters granted, waiting longer" % (
                            len(active_instance_ids), 1))
            except:
                # Bare except is deliberate: also cancels on KeyboardInterrupt.
                print("Canceling spot instance requests", file=sys.stderr)
                conn.cancel_spot_instance_requests(my_req_ids)
                # Log a warning if any of these requests actually launched
                # instances.  (Bug fix: this used to unpack into
                # (master_nodes, master_nodes), losing the slave list and
                # double-counting masters in `running`.)
                (master_nodes, slave_nodes) = get_existing_cluster(
                    conn, opts, cluster_name, die_on_error=False)
                running = len(master_nodes) + len(slave_nodes)
                if running:
                    print(("WARNING: %d instances are still running" % running),
                          file=sys.stderr)
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id],
                instance_type=master_type,
                subnet_id=subnetId(),
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile)
            master_nodes = master_res.instances
            # Bug fix: report the master's actual placement (opts.zone), not
            # the leftover `zone` variable from the slave launch loop.
            print("Launched master in %s, regid = %s" % (opts.zone, master_res.id),
                  file=sys.stderr)

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Spark cluster (master, slaves and an empty zoo list).

    Sets up the three security groups, optionally resolves the latest AMI,
    launches slaves (spot or on-demand) and a single master, and returns
    ``(master_nodes, slave_nodes, zoo_nodes)``.  Exits the process on fatal
    errors.  Python 2 code (print statements, urllib2).
    """
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        # ZooKeeper client and quorum ports.
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    active_nodes = get_existing_cluster(conn, opts, cluster_name,
                                        die_on_error=False)
    if any(active_nodes):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s, %s or %s" % (master_group.name,
                                                  slave_group.name,
                                                  zoo_group.name))
        sys.exit(1)

    # Figure out the latest AMI from our static URL
    if opts.ami == "latest":
        try:
            opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
            print "Latest Spark AMI: " + opts.ami
        except:
            print >> stderr, "Could not read " + LATEST_AMI_URL
            sys.exit(1)

    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            # Spread the requested slave count evenly over the zones.
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price = opts.spot_price,
                image_id = opts.ami,
                launch_group = "launch-group-%s" % cluster_name,
                placement = zone,
                count = num_slaves_this_zone,
                key_name = opts.key_pair,
                security_groups = [slave_group],
                instance_type = opts.instance_type,
                block_device_map = block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1
        print "Waiting for spot instances to be granted..."
        try:
            # Poll every 10s until all spot requests are active.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            # Bare except also catches KeyboardInterrupt, so Ctrl-C
            # cancels the outstanding spot requests before exiting.
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name = opts.key_pair,
                                      security_groups = [slave_group],
                                      instance_type = opts.instance_type,
                                      placement = zone,
                                      min_count = num_slaves_this_zone,
                                      max_count = num_slaves_this_zone,
                                      block_device_map = block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    if opts.zone == 'all':
        opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = 1,
                           max_count = 1,
                           block_device_map = block_map)
    master_nodes = master_res.instances
    # NOTE(review): 'zone' here is the last slave zone left over from the
    # loops above, not opts.zone where the master was actually placed —
    # the message can be misleading; verify before relying on it.
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def create_image(module, ec2):
    """
    Creates new AMI

    module : AnsibleModule object
    ec2: authenticated ec2 connection object

    Builds the AMI either from a running instance (CreateImage) or by
    registering a new image (RegisterImage), waits for it to become
    available, then applies tags and launch permissions.  All failures are
    reported via module.fail_json.

    Fixes applied in this revision:
      * Device dicts from ``module.params['device_mapping']`` are no longer
        mutated in place (previously ``del device['device_name']`` altered
        the caller's data); a copy is used instead.
      * The wait loop no longer crashes with NameError/AttributeError when
        ``wait_timeout`` is 0 or ``ec2.get_image`` returns None — it falls
        through to the explanatory fail_json instead.
    """
    instance_id = module.params.get('instance_id')
    name = module.params.get('name')
    wait = module.params.get('wait')
    wait_timeout = int(module.params.get('wait_timeout'))
    description = module.params.get('description')
    architecture = module.params.get('architecture')
    kernel_id = module.params.get('kernel_id')
    root_device_name = module.params.get('root_device_name')
    virtualization_type = module.params.get('virtualization_type')
    no_reboot = module.params.get('no_reboot')
    device_mapping = module.params.get('device_mapping')
    tags = module.params.get('tags')
    launch_permissions = module.params.get('launch_permissions')

    try:
        params = {'name': name, 'description': description}

        images = ec2.get_all_images(filters={'name': name})
        if images and images[0]:
            # ensure that launch_permissions are up to date
            update_image(module, ec2, images[0].id)

        bdm = None
        if device_mapping:
            bdm = BlockDeviceMapping()
            for device in device_mapping:
                if 'device_name' not in device:
                    module.fail_json(msg='Device name must be set for volume')
                # Work on a copy so we do not mutate the caller's
                # module.params['device_mapping'] entries.
                device = dict(device)
                device_name = device.pop('device_name')
                bd = BlockDeviceType(**device)
                bdm[device_name] = bd

        if instance_id:
            # Image from a running/stopped instance.
            params['instance_id'] = instance_id
            params['no_reboot'] = no_reboot
            if bdm:
                params['block_device_mapping'] = bdm
            image_id = ec2.create_image(**params)
        else:
            # Register a new image from scratch.
            params['architecture'] = architecture
            params['virtualization_type'] = virtualization_type
            if kernel_id:
                params['kernel_id'] = kernel_id
            if root_device_name:
                params['root_device_name'] = root_device_name
            if bdm:
                params['block_device_map'] = bdm
            image_id = ec2.register_image(**params)
    except boto.exception.BotoServerError as e:
        module.fail_json(msg="%s: %s" % (e.error_code, e.error_message))

    # Wait until the image is recognized. EC2 API has eventual consistency,
    # such that a successful CreateImage API call doesn't guarantee the success
    # of subsequent DescribeImages API call using the new image id returned.
    img = None  # guard: stays None if wait_timeout == 0 or lookups keep failing
    for i in range(wait_timeout):
        try:
            img = ec2.get_image(image_id)
            if img is not None and img.state == 'available':
                break
            elif img is not None and img.state == 'failed':
                module.fail_json(
                    msg=
                    "AMI creation failed, please see the AWS console for more details"
                )
        except boto.exception.EC2ResponseError as e:
            # NotFound/Unavailable are expected while the image propagates;
            # anything else is fatal once we are out of patience.
            if ('InvalidAMIID.NotFound' not in e.error_code and
                    'InvalidAMIID.Unavailable' not in e.error_code) and wait and i == wait_timeout - 1:
                module.fail_json(
                    msg=
                    "Error while trying to find the new image. Using wait=yes and/or a longer wait_timeout may help. %s: %s"
                    % (e.error_code, e.error_message))
        finally:
            time.sleep(1)

    if img is None or img.state != 'available':
        module.fail_json(
            msg=
            "Error while trying to find the new image. Using wait=yes and/or a longer wait_timeout may help."
        )

    if tags:
        try:
            ec2.create_tags(image_id, tags)
        except boto.exception.EC2ResponseError as e:
            module.fail_json(msg="Image tagging failed => %s: %s" %
                             (e.error_code, e.error_message))
    if launch_permissions:
        try:
            img = ec2.get_image(image_id)
            img.set_launch_permissions(**launch_permissions)
        except boto.exception.BotoServerError as e:
            module.fail_json(msg="%s: %s" % (e.error_code, e.error_message),
                             image_id=image_id)

    module.exit_json(msg="AMI creation operation complete", changed=True,
                     **get_ami_info(img))
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=vpc_subnet_id, groups=group_id, associate_public_ip_address=assign_public_ip) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection( interface) params['network_interfaces'] = interfaces else: params['subnet_id'] = vpc_subnet_id if vpc_subnet_id: params['security_group_ids'] = group_id else: params['security_groups'] = group_name if volumes: bdm = BlockDeviceMapping() for volume in volumes: if 'device_name' not in volume: module.fail_json( msg='Device name must be set for volume') # Minimum volume size is 1GB. We'll use volume size explicitly set to 0 # to be a signal not to create this volume if 'volume_size' not in volume or int( volume['volume_size']) > 0: bdm[volume['device_name']] = create_block_device( module, ec2, volume) params['block_device_map'] = bdm # check to see if we're using spot pricing first before starting instances if not spot_price:
def launch_cluster(conn, opts, cluster_name):
    """Launch (or resume) a Spark cluster and return its nodes.

    Creates/updates the master and slave security groups (optionally under
    opts.security_group_prefix), launches slaves as spot or on-demand
    instances, launches or restarts the master, tags every instance with a
    descriptive name, and returns ``(master_nodes, slave_nodes)``.
    Exits the process on fatal errors.  Python 2 code (print statements).
    """
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    # Optional user-data script passed verbatim to the instances.
    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    else:
        master_group = get_or_make_group(conn, opts.security_group_prefix + "-master")
        slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running with the cluster name
    existing_masters, existing_slaves = get_existing_cluster(conn, opts,
                                                             cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances for name: %s " %
                          cluster_name)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    # Extra security groups to attach to every instance (matched by name or id).
    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [sg
                             for sg in conn.get_all_security_groups()
                             if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type=opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            # Spread the requested slave count evenly over the zones.
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            # Poll every 10s until all spot requests are active.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                outstanding_request_ids = []
                for i in my_req_ids:
                    if i in id_to_req:
                        if id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                        else:
                            outstanding_request_ids.append(i)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer for request ids including %s" % (
                        len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10])
        except:
            # Bare except also catches KeyboardInterrupt, so Ctrl-C
            # cancels the outstanding spot requests before exiting.
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] + additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] + additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        # NOTE(review): 'zone' here is left over from the slave launch loop,
        # not opts.zone where the master was actually placed — the message
        # can be misleading; verify before relying on it.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
        tag_instance(master, name)
    for slave in slave_nodes:
        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
        tag_instance(slave, name)

    # Return all the instances
    return (master_nodes, slave_nodes)
def create_ami(host_instance, args, config, instance_config, ssh_key,
               key_filename, instance_data, deploypass, cert, pkey,
               ami_name_prefix):
    """Build an AMI by constructing a target filesystem on a helper instance.

    Runs (via fabric) a sequence of shell commands on ``host_instance`` to:
    partition/format a target device, bootstrap a base OS (debootstrap for
    debian/ubuntu, yum groupinstall otherwise), overlay config files, set up
    grub and sshd, optionally bundle/upload an instance-store image, then
    snapshot the volume (EBS case) and register the AMI.

    Parameters (as used below):
      host_instance   -- boto instance the work is performed on; its
                         ``connection`` is reused for registration/tagging.
      args            -- parsed CLI args; ``args.config``, ``keep_volume``
                         and ``keep_host_instance`` are read.
      config          -- target description dict (``target``, ``distro``,
                         ``virtualization_type``, ``root_device_type``, ...).
      instance_config, ssh_key, instance_data, deploypass
                      -- passed through to ``assimilate_instance`` for the
                         instance-store/centos path.
      cert, pkey      -- files uploaded for ``ec2-bundle-image``.
      ami_name_prefix -- optional name prefix; falls back to ``args.config``.

    Returns the registered boto image object.
    """
    connection = host_instance.connection
    setup_fabric_env(instance=host_instance, abort_on_prompts=True,
                     disable_known_hosts=True, key_filename=key_filename)
    target_name = args.config
    virtualization_type = config.get("virtualization_type")
    config_dir = "%s/%s" % (AMI_CONFIGS_DIR, target_name)
    if ami_name_prefix:
        prefix = ami_name_prefix
    else:
        prefix = args.config
    # AMI/snapshot name: "<prefix>-YYYY-MM-DD-HH-MM" in UTC.
    dated_target_name = "{}-{}".format(
        prefix, time.strftime("%Y-%m-%d-%H-%M", time.gmtime()))
    if config.get('distro') in ('debian', 'ubuntu'):
        ubuntu_release = config.get("release", "precise")
    int_dev_name = config['target']['int_dev_name']
    mount_dev = int_dev_name
    grub_dev = int_dev_name
    mount_point = config['target']['mount_point']
    boot_mount_dev = None
    host_packages_file = os.path.join(config_dir, "host_packages")
    packages_file = os.path.join(config_dir, "packages")
    if os.path.exists(host_packages_file):
        install_packages(host_packages_file, config.get('distro'))
    v = attach_and_wait(host_instance, config['target']['size'],
                        config['target']['aws_dev_name'], int_dev_name)

    # Step 0: install required packages
    if config.get('distro') == "centos":
        run('which MAKEDEV >/dev/null || yum -d 1 install -y MAKEDEV')

    # Step 1: prepare target FS
    run('mkdir -p %s' % mount_point)
    if config.get("root_device_type") == "instance-store":
        # Use file image
        mount_dev = "/dev/cloud_root/lv_root"
        grub_dev = "/dev/loop0"
        boot_mount_dev = "/dev/mapper/loop0p1"
        img_file = dated_target_name
        partition_image(mount_dev=mount_dev, int_dev_name=int_dev_name,
                        img_file=img_file)
    elif virtualization_type == "hvm":
        # use EBS volume
        mount_dev = "/dev/cloud_root/lv_root"
        boot_mount_dev = "%s1" % int_dev_name
        partition_ebs_volume(int_dev_name=int_dev_name)

    run('/sbin/mkfs.{fs_type} {args} {dev}'.format(
        fs_type=config['target']['fs_type'],
        args=config['target'].get("mkfs_args", ""),
        dev=mount_dev))
    run('/sbin/e2label {dev} {label}'.format(
        dev=mount_dev, label=config['target']['e2_label']))
    run('mount {dev} {mount_point}'.format(dev=mount_dev,
                                           mount_point=mount_point))
    run('mkdir {0}/dev {0}/proc {0}/etc {0}/boot {0}/sys'.format(mount_point))
    run('mount -t sysfs sys %s/sys' % mount_point)
    if config.get('distro') not in ('debian', 'ubuntu'):
        run('mount -t proc proc %s/proc' % mount_point)
        run('for i in console null zero random urandom; '
            'do /sbin/MAKEDEV -d %s/dev -x $i ; done' % mount_point)
    if boot_mount_dev:
        run('mount {} {}/boot'.format(boot_mount_dev, mount_point))

    # Step 2: install base system
    if config.get('distro') in ('debian', 'ubuntu'):
        run("debootstrap %s %s "
            "http://puppet/repos/apt/ubuntu/" % (ubuntu_release, mount_point))
        run('chroot %s mount -t proc none /proc' % mount_point)
        run('mount -o bind /dev %s/dev' % mount_point)
        put('%s/releng-public-%s.list' % (AMI_CONFIGS_DIR, ubuntu_release),
            '%s/etc/apt/sources.list' % mount_point)
        with lcd(config_dir):
            put('usr/sbin/policy-rc.d', '%s/usr/sbin/' % mount_point,
                mirror_local_mode=True)
        install_packages(packages_file, config.get('distro'),
                         chroot=mount_point)
    else:
        with lcd(config_dir):
            put('etc/yum-local.cfg', '%s/etc/yum-local.cfg' % mount_point)
        yum = 'yum -d 1 -c {0}/etc/yum-local.cfg -y --installroot={0} '.format(
            mount_point)
        # this groupinstall emulates the %packages section of the kickstart
        # config, which defaults to Core and Base.
        run('%s groupinstall Core Base' % yum)
        run('%s clean packages' % yum)
        # Rebuild RPM DB for cases when versions mismatch
        run('chroot %s rpmdb --rebuilddb || :' % mount_point)

    # Step 3: upload custom configuration files
    run('chroot %s mkdir -p /boot/grub' % mount_point)
    for directory in ('boot', 'etc', 'usr'):
        local_directory = os.path.join(config_dir, directory)
        remote_directory = os.path.join(mount_point, directory)
        # NOTE(review): this existence check is a no-op (pass, then sync
        # unconditionally); confirm whether a `continue` was intended.
        if not os.path.exists(local_directory):
            pass
        sync(local_directory, remote_directory)

    # Step 4: tune configs
    run('sed -i -e s/@ROOT_DEV_LABEL@/{label}/g -e s/@FS_TYPE@/{fs}/g '
        '{mnt}/etc/fstab'.format(label=config['target']['e2_label'],
                                 fs=config['target']['fs_type'],
                                 mnt=mount_point))
    if config.get('distro') in ('debian', 'ubuntu'):
        if virtualization_type == "hvm":
            run("chroot {mnt} grub-install {int_dev_name}".format(
                mnt=mount_point, int_dev_name=int_dev_name))
            run("chroot {mnt} update-grub".format(mnt=mount_point))
        else:
            run("chroot {mnt} update-grub -y".format(mnt=mount_point))
            run("sed -i 's/^# groot.*/# groot=(hd0)/g' "
                "{mnt}/boot/grub/menu.lst".format(mnt=mount_point))
            run("chroot {mnt} update-grub".format(mnt=mount_point))
    else:
        run('ln -s grub.conf %s/boot/grub/menu.lst' % mount_point)
        run('ln -s ../boot/grub/grub.conf %s/etc/grub.conf' % mount_point)
        # Substitute the installed kernel version into grub.conf; PAE
        # kernels carry a ".PAE" suffix in their version string.
        if config.get('kernel_package') == 'kernel-PAE':
            run('sed -i s/@VERSION@/`chroot %s rpm -q '
                '--queryformat "%%{version}-%%{release}.%%{arch}.PAE" '
                '%s | tail -n1`/g %s/boot/grub/grub.conf' %
                (mount_point, config.get('kernel_package', 'kernel'),
                 mount_point))
        else:
            run('sed -i s/@VERSION@/`chroot %s rpm -q '
                '--queryformat "%%{version}-%%{release}.%%{arch}" '
                '%s | tail -n1`/g %s/boot/grub/grub.conf' %
                (mount_point, config.get('kernel_package', 'kernel'),
                 mount_point))
        if config.get("root_device_type") == "instance-store":
            # files normally copied by grub-install
            run("cp -va /usr/share/grub/x86_64-redhat/* /mnt/boot/grub/")
            put(os.path.join(config_dir, "grub.cmd"), "/tmp/grub.cmd")
            run("sed -i s/@IMG@/{}/g /tmp/grub.cmd".format(img_file))
            run("cat /tmp/grub.cmd | grub --device-map=/dev/null")
        elif virtualization_type == "hvm":
            # See https://bugs.archlinux.org/task/30241 for the details,
            # grub-nstall doesn't handle /dev/xvd* devices properly
            grub_install_patch = os.path.join(config_dir, "grub-install.diff")
            if os.path.exists(grub_install_patch):
                put(grub_install_patch, "/tmp/grub-install.diff")
                run('which patch >/dev/null || yum -d 1 install -y patch')
                run('patch -p0 -i /tmp/grub-install.diff /sbin/grub-install')
            run("grub-install --root-directory=%s --no-floppy %s" %
                (mount_point, grub_dev))
    # Allow key-only root login and disable reverse DNS in sshd.
    run("sed -i -e '/PermitRootLogin/d' -e '/UseDNS/d' "
        "-e '$ a PermitRootLogin without-password' "
        "-e '$ a UseDNS no' "
        "%s/etc/ssh/sshd_config" % mount_point)
    if config.get('distro') in ('debian', 'ubuntu'):
        pass
    else:
        manage_service("network", mount_point, "on")
        manage_service("rc.local", mount_point, "on")

    if config.get("root_device_type") == "instance-store" and \
            config.get("distro") == "centos":
        # Puppetize the chroot in place before bundling the image.
        instance_data = instance_data.copy()
        instance_data['name'] = host_instance.tags.get("Name")
        instance_data['hostname'] = host_instance.tags.get("FQDN")
        run("cp /etc/resolv.conf {}/etc/resolv.conf".format(mount_point))
        # make puppet happy
        # disable ipv6
        run("/sbin/service ip6tables stop")
        # mount /dev to let sshd start
        run('mount -o bind /dev %s/dev' % mount_point)
        assimilate_instance(host_instance, instance_config, ssh_key,
                            instance_data, deploypass, chroot=mount_point,
                            reboot=False)
        ami_cleanup(mount_point=mount_point, distro=config["distro"])
        # kill chroot processes
        put('%s/kill_chroot.sh' % AMI_CONFIGS_DIR, '/tmp/kill_chroot.sh')
        run('bash /tmp/kill_chroot.sh {}'.format(mount_point))
    run('swapoff -a')
    run('umount %s/dev || :' % mount_point)
    if config.get("distro") == "ubuntu":
        run('rm -f %s/usr/sbin/policy-rc.d' % mount_point)
        run('chroot %s ln -s /sbin/MAKEDEV /dev/' % mount_point)
        for dev in ('zero', 'null', 'console', 'generic'):
            run('chroot %s sh -c "cd /dev && '
                './MAKEDEV %s"' % (mount_point, dev))
    # Tear everything down; "|| :" tolerates mounts that were never made.
    run('umount %s/sys || :' % mount_point)
    run('umount %s/proc || :' % mount_point)
    run('umount %s/dev || :' % mount_point)
    run('umount %s/boot || :' % mount_point)
    run('umount %s' % mount_point)
    if config.get("root_device_type") == "instance-store" \
            and config.get("distro") == "centos":
        # create bundle
        run("yum -d 1 install -y ruby "
            "http://s3.amazonaws.com/ec2-downloads/ec2-ami-tools.noarch.rpm")
        bundle_location = "{b}/{d}/{t}/{n}".format(
            b=config["bucket"], d=config["bucket_dir"],
            t=config["target"]["tags"]["moz-type"], n=dated_target_name)
        manifest_location = "{}/{}.manifest.xml".format(
            bundle_location, dated_target_name)
        run("mkdir -p /mnt-tmp/out")
        put(cert, "/mnt-tmp/cert.pem")
        put(pkey, "/mnt-tmp/pk.pem")
        run("ec2-bundle-image -c /mnt-tmp/cert.pem -k /mnt-tmp/pk.pem "
            "-u {uid} -i /mnt-tmp/{img_file} -d /mnt-tmp/out -r x86_64".format(
                img_file=img_file, uid=config["aws_user_id"]))
        with hide('running', 'stdout', 'stderr'):
            log.info("uploading bundle")
            run("ec2-upload-bundle -b {bundle_location}"
                " --access-key {access_key} --secret-key {secret_key}"
                " --region {region}"
                " -m /mnt-tmp/out/{img_file}.manifest.xml --retry".format(
                    bundle_location=bundle_location,
                    access_key=boto.config.get("Credentials",
                                               "aws_access_key_id"),
                    secret_key=boto.config.get("Credentials",
                                               "aws_secret_access_key"),
                    region=connection.region.name,
                    img_file=img_file))
    v.detach(force=True)
    wait_for_status(v, "status", "available", "update")
    if not config.get("root_device_type") == "instance-store":
        # Step 5: Create a snapshot
        log.info('Creating a snapshot')
        snapshot = v.create_snapshot(dated_target_name)
        wait_for_status(snapshot, "status", "completed", "update")
        snapshot.add_tag('Name', dated_target_name)
        snapshot.add_tag('moz-created', str(int(time.mktime(time.gmtime()))))

    # Step 6: Create an AMI
    log.info('Creating AMI')
    if config.get("root_device_type") == "instance-store":
        ami_id = connection.register_image(
            dated_target_name,
            '%s AMI' % dated_target_name,
            architecture=config['arch'],
            virtualization_type=virtualization_type,
            image_location=manifest_location,
        )
    else:
        host_img = connection.get_image(config['ami'])
        block_map = BlockDeviceMapping()
        block_map[host_img.root_device_name] = BlockDeviceType(
            snapshot_id=snapshot.id)
        root_device_name = host_img.root_device_name
        # HVM images must not carry a kernel/ramdisk id.
        if virtualization_type == "hvm":
            kernel_id = None
            ramdisk_id = None
        else:
            kernel_id = host_img.kernel_id
            ramdisk_id = host_img.ramdisk_id
        ami_id = connection.register_image(
            dated_target_name,
            '%s AMI' % dated_target_name,
            architecture=config['arch'],
            kernel_id=kernel_id,
            ramdisk_id=ramdisk_id,
            root_device_name=root_device_name,
            block_device_map=block_map,
            virtualization_type=virtualization_type,
        )
    # The freshly registered AMI may not be visible immediately; retry
    # tagging until get_image/add_tag stop raising.
    while True:
        try:
            ami = connection.get_image(ami_id)
            ami.add_tag('Name', dated_target_name)
            ami.add_tag('moz-created', str(int(time.mktime(time.gmtime()))))
            if config["target"].get("tags"):
                for tag, value in config["target"]["tags"].items():
                    log.info("Tagging %s: %s", tag, value)
                    ami.add_tag(tag, value)
            log.info('AMI created')
            log.info('ID: {id}, name: {name}'.format(id=ami.id,
                                                     name=ami.name))
            break
        except:
            log.info('Wating for AMI')
            time.sleep(10)

    # Step 7: Cleanup
    if not args.keep_volume:
        log.info('Deleting volume')
        v.delete()
    if not args.keep_host_instance:
        log.info('Terminating host instance')
        host_instance.terminate()
    return ami
def vmcreate(self, obj_attr_list): ''' TBD ''' try: _status = 100 _fmsg = "An error has occurred, but no error message was captured" _instance = False _reservation = False self.determine_instance_name(obj_attr_list) self.determine_key_name(obj_attr_list) obj_attr_list[ "last_known_state"] = "about to connect to " + self.get_description( ) + " manager" self.take_action_if_requested("VM", obj_attr_list, "provision_originated") if not self.ec2conn: self.connect(obj_attr_list["access"], obj_attr_list["credentials"], \ obj_attr_list["vmc_name"]) if self.is_vm_running(obj_attr_list): _msg = "An instance named \"" + obj_attr_list["cloud_vm_name"] _msg += " is already running. It needs to be destroyed first." _status = 187 cberr(_msg) raise CldOpsException(_msg, _status) # "Security groups" must be a list _security_groups = [] _security_groups.append(obj_attr_list["security_groups"]) _time_mark_prs = int(time()) obj_attr_list[ "mgt_002_provisioning_request_sent"] = _time_mark_prs - int( obj_attr_list["mgt_001_provisioning_request_originated"]) self.vm_placement(obj_attr_list) obj_attr_list["last_known_state"] = "about to send create request" self.get_images(obj_attr_list) self.get_networks(obj_attr_list) obj_attr_list["config_drive"] = False # We need the instance placemente information before creating the actual volume #self.vvcreate(obj_attr_list) if "cloud_rv_type" not in obj_attr_list: obj_attr_list["cloud_rv_type"] = "standard" _bdm = BlockDeviceMapping() ''' Options: gp2 (== ssd) io1 (also ssd) st1 (not sure) sc1 (cold?) 
standard (spinners) ''' if obj_attr_list["cloud_rv_iops"] == "0": _iops = None else: _iops = obj_attr_list["cloud_rv_iops"] if "cloud_rv" in obj_attr_list and obj_attr_list["cloud_rv"] != "0": _size = obj_attr_list["cloud_rv"] else: _size = None _bdm['/dev/sda1'] = BlockDeviceType( volume_type=obj_attr_list["cloud_rv_type"], delete_on_termination=True, iops=_iops, size=_size) self.common_messages("VM", obj_attr_list, "creating", 0, '') self.pre_vmcreate_process(obj_attr_list) _reservation = self.ec2conn.run_instances(image_id = obj_attr_list["boot_volume_imageid1"], \ instance_type = obj_attr_list["size"], \ key_name = obj_attr_list["key_name"], \ user_data = self.populate_cloudconfig(obj_attr_list), block_device_map = _bdm, security_groups = _security_groups) if _reservation: sleep(int(obj_attr_list["update_frequency"])) _instance = _reservation.instances[0] _instance.add_tag("Name", obj_attr_list["cloud_vm_name"]) obj_attr_list["cloud_vm_uuid"] = '{0}'.format(_instance.id) obj_attr_list["instance_obj"] = _instance self.vvcreate(obj_attr_list) self.take_action_if_requested("VM", obj_attr_list, "provision_started") _time_mark_prc = self.wait_for_instance_ready( obj_attr_list, _time_mark_prs) if obj_attr_list["cloud_vv_instance"]: self.common_messages("VV", obj_attr_list, "attaching", _status, _fmsg) obj_attr_list["cloud_vv_instance"].attach( obj_attr_list["cloud_vm_uuid"], "/dev/xvdc") self.wait_for_instance_boot(obj_attr_list, _time_mark_prc) obj_attr_list["host_name"] = "unknown" self.take_action_if_requested("VM", obj_attr_list, "provision_finished") _status = 0 if obj_attr_list["force_failure"].lower() == "true": _fmsg = "Forced failure (option FORCE_FAILURE set \"true\")" _status = 916 except CldOpsException, obj: _status = obj.status _fmsg = str(obj.msg)
dst_sg = ec2dst.create_security_group(args.name, 'AMI Copy') cleanup.add(dst_sg, 'delete', 'Removing destination security group: %s' % args.name) info('Allowing SSH access from 0.0.0.0/0') dst_sg.authorize('tcp', 22, 22, '0.0.0.0/0') # Set up device mapping variables info('Generating a list of EBS volumes to copy') # Create a list of devices for the copying instances tmp_dev = valid_block_devs[:] # Grab the source AMI BDM src_ami_bdm = src_ami.block_device_mapping # Use the source AMI BDM as the base for the destination AMI BDM dst_ami_bdm = src_ami.block_device_mapping # The instance BDMs should be empty to start with src_inst_bdm = BlockDeviceMapping() dst_inst_bdm = BlockDeviceMapping() device_map = {} # Generate the instance BDMs and keep track of the mappings for b in src_ami_bdm.keys(): if src_ami_bdm[b].snapshot_id: d = tmp_dev.pop(0) src_inst_bdm[d] = BlockDeviceType( snapshot_id = src_ami_bdm[b].snapshot_id, size = src_ami_bdm[b].size, delete_on_termination = True, volume_type = src_ami_bdm[b].volume_type, iops = src_ami_bdm[b].iops,) dst_inst_bdm[d] = BlockDeviceType( size = src_ami_bdm[b].size,
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") master_group.owner_id = os.getenv('EC2_USER_ID') slave_group = get_or_make_group(conn, cluster_name + "-slaves") slave_group.owner_id = os.getenv('EC2_USER_ID') zoo_group = get_or_make_group(conn, cluster_name + "-zoo") zoo_group.owner_id = os.getenv('EC2_USER_ID') if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 
if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves, existing_zoos = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ( "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to logging.debug("Calling boto BlockDeviceMapping()...") block_map = BlockDeviceMapping() logging.debug(" Printing block_map..") #print block_map if opts.ebs_vol_size > 0: logging.debug("Calling boto EBSBlockDeviceType()...") device = EBSBlockDeviceType() #print "device: ", device device.size = opts.ebs_vol_size device.delete_on_termination = True device.ephemeral_name = "ephemeral0" #block_map["/dev/sdv"] = device #block_map["/dev/sdv"] = device block_map["/dev/vdb"] = device if opts.user_data_file != None: user_data_file = open(opts.user_data_file) try: opts.user_data = user_data_file.read() #print "user data (encoded) = ", opts.user_data finally: user_data_file.close() # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res 
= image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=opts.user_data) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=opts.user_data) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Launch ZooKeeper nodes if required if int(opts.ft) > 1: print "Running " + opts.ft + " zookeepers" zoo_res = image.run(key_name=opts.key_pair, security_groups=[zoo_group], instance_type=opts.instance_type, placement=opts.zone, min_count=3, max_count=3, block_device_map=block_map, user_data=opts.user_data) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
snapshot = volume.create_snapshot(description=description) print >> sys.stderr, "created snapshot {0}".format(snapshot.id) nixops.util.check_wait(check, max_tries=120) m._conn.create_tags([snapshot.id], {'Name': ami_name}) if not args.keep: depl.destroy_resources() # Register the image. aki = m._conn.get_all_images( filters={'manifest-location': '*pv-grub-hd0_1.03-x86_64*'})[0] print >> sys.stderr, "using kernel image {0} - {1}".format( aki.id, aki.location) block_map = BlockDeviceMapping() block_map['/dev/sda'] = BlockDeviceType(snapshot_id=snapshot.id, delete_on_termination=True) block_map['/dev/sdb'] = BlockDeviceType(ephemeral_name="ephemeral0") block_map['/dev/sdc'] = BlockDeviceType(ephemeral_name="ephemeral1") block_map['/dev/sdd'] = BlockDeviceType(ephemeral_name="ephemeral2") block_map['/dev/sde'] = BlockDeviceType(ephemeral_name="ephemeral3") ami_id = m._conn.register_image(name=ami_name, description=description, architecture="x86_64", root_device_name="/dev/sda", kernel_id=aki.id, block_device_map=block_map) print >> sys.stderr, "registered AMI {0}".format(ami_id)
def create_launch_config(connection, module): name = module.params.get('name') image_id = module.params.get('image_id') key_name = module.params.get('key_name') security_groups = module.params['security_groups'] user_data = module.params.get('user_data') volumes = module.params['volumes'] instance_type = module.params.get('instance_type') spot_price = module.params.get('spot_price') instance_monitoring = module.params.get('instance_monitoring') assign_public_ip = module.params.get('assign_public_ip') kernel_id = module.params.get('kernel_id') ramdisk_id = module.params.get('ramdisk_id') instance_profile_name = module.params.get('instance_profile_name') ebs_optimized = module.params.get('ebs_optimized') classic_link_vpc_id = module.params.get('classic_link_vpc_id') classic_link_vpc_security_groups = module.params.get( 'classic_link_vpc_security_groups') append_hash = module.params.get('append_hash') bdm = BlockDeviceMapping() if volumes: for volume in volumes: if 'device_name' not in volume: module.fail_json(msg='Device name must be set for volume') # Minimum volume size is 1GB. 
We'll use volume size explicitly set to 0 # to be a signal not to create this volume if 'volume_size' not in volume or int(volume['volume_size']) > 0: bdm[volume['device_name']] = create_block_device( module, volume) lc = LaunchConfiguration( name=name, image_id=image_id, key_name=key_name, security_groups=security_groups, user_data=user_data, block_device_mappings=[bdm], instance_type=instance_type, kernel_id=kernel_id, spot_price=spot_price, instance_monitoring=instance_monitoring, associate_public_ip_address=assign_public_ip, ramdisk_id=ramdisk_id, instance_profile_name=instance_profile_name, ebs_optimized=ebs_optimized, classic_link_vpc_security_groups=classic_link_vpc_security_groups, classic_link_vpc_id=classic_link_vpc_id, ) if append_hash: # MD5 of launch configuration properties h = hashlib.md5() h.update(str(frozenset(lc.__dict__))) # Update name variables with md5 hash name = '-'.join((name, h.hexdigest())) lc.name = name launch_configs = connection.get_all_launch_configurations(names=[name]) changed = False if not launch_configs: try: connection.create_launch_configuration(lc) launch_configs = connection.get_all_launch_configurations( names=[name]) changed = True except BotoServerError, e: module.fail_json(msg=str(e))
def create_instance_args():
    """
    Looks up security group, subnet
    and returns arguments to pass into
    ec2.run_instances() including
    user data

    Reads module-level state: ``args`` (CLI options), ``stack_name``,
    ``base_ami``, ``run_id``, ``extra_vars_yml``, ``git_refs_yml`` and
    ``secure_vars_file``, plus the helper ``get_instance_sec_group``.
    Exits the process when no matching subnet exists.
    """
    vpc = VPCConnection()
    # Subnet is selected by CloudFormation stack name + play tag; only the
    # first match is used.
    subnet = vpc.get_all_subnets(
        filters={
            'tag:aws:cloudformation:stack-name': stack_name,
            'tag:play': args.play})
    if len(subnet) < 1:
        sys.stderr.write(
            "ERROR: Expected at least one subnet, got {}\n".format(
                len(subnet)))
        sys.exit(1)
    subnet_id = subnet[0].id
    vpc_id = subnet[0].vpc_id

    security_group_id = get_instance_sec_group(vpc_id)

    # config_secure is interpolated into the shell script below as a shell
    # boolean ("true"/"false"), hence the string values.
    if args.identity:
        config_secure = 'true'
        with open(args.identity) as f:
            identity_contents = f.read()
    else:
        config_secure = 'false'
        identity_contents = "dummy"

    # Cloud-init user data: a bash script that clones the configuration
    # repos and runs the ansible play locally on first boot.  The braces
    # are filled by .format() below; literal shell ${...} is not used.
    user_data = """#!/bin/bash
set -x
set -e
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
base_dir="/var/tmp/edx-cfg"
extra_vars="$base_dir/extra-vars-$$.yml"
secure_identity="$base_dir/secure-identity"
git_ssh="$base_dir/git_ssh.sh"
configuration_version="{configuration_version}"
configuration_secure_version="{configuration_secure_version}"
configuration_private_version="{configuration_private_version}"
environment="{environment}"
deployment="{deployment}"
play="{play}"
config_secure={config_secure}
git_repo_name="configuration"
git_repo="https://github.com/edx/$git_repo_name"
git_repo_secure="{configuration_secure_repo}"
git_repo_secure_name="{configuration_secure_repo_basename}"
git_repo_private="{configuration_private_repo}"
git_repo_private_name=$(basename $git_repo_private .git)
secure_vars_file={secure_vars_file}
environment_deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{environment}-{deployment}.yml"
deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{deployment}.yml"
instance_id=\\
$(curl http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null)
instance_ip=\\
$(curl http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null)
instance_type=\\
$(curl http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null)
playbook_dir="$base_dir/{playbook_dir}"

if $config_secure; then
    git_cmd="env GIT_SSH=$git_ssh git"
else
    git_cmd="git"
fi

ANSIBLE_ENABLE_SQS=true
SQS_NAME={queue_name}
SQS_REGION=us-east-1
SQS_MSG_PREFIX="[ $instance_id $instance_ip $environment-$deployment $play ]"
PYTHONUNBUFFERED=1

# environment for ansible
export ANSIBLE_ENABLE_SQS SQS_NAME SQS_REGION SQS_MSG_PREFIX PYTHONUNBUFFERED

if [[ ! -x /usr/bin/git || ! -x /usr/bin/pip ]]; then
    echo "Installing pkg dependencies"
    /usr/bin/apt-get update
    /usr/bin/apt-get install -y git python-pip python-apt \\
        git-core build-essential python-dev libxml2-dev \\
        libxslt-dev curl --force-yes
fi

rm -rf $base_dir
mkdir -p $base_dir
cd $base_dir

cat << EOF > $git_ssh
#!/bin/sh
exec /usr/bin/ssh -o StrictHostKeyChecking=no -i "$secure_identity" "\$@"
EOF

chmod 755 $git_ssh

if $config_secure; then
    cat << EOF > $secure_identity
{identity_contents}
EOF
fi

cat << EOF >> $extra_vars
---
# extra vars passed into
# abbey.py including versions
# of all the repositories
{extra_vars_yml}

{git_refs_yml}

# abbey will always run fake migrations
# this is so that the application can come
# up healthy
fake_migrations: true

# Use the build number an the dynamic cache key.
EDXAPP_UPDATE_STATIC_FILES_KEY: true
edxapp_dynamic_cache_key: {deployment}-{environment}-{play}-{cache_id}

disable_edx_services: true

# abbey should never take instances in
# and out of elbs
elb_pre_post: false
EOF

chmod 400 $secure_identity

$git_cmd clone $git_repo $git_repo_name
cd $git_repo_name
$git_cmd checkout $configuration_version
cd $base_dir

if $config_secure; then
    $git_cmd clone $git_repo_secure $git_repo_secure_name
    cd $git_repo_secure_name
    $git_cmd checkout $configuration_secure_version
    cd $base_dir
fi

if [[ ! -z $git_repo_private ]]; then
    $git_cmd clone $git_repo_private $git_repo_private_name
    cd $git_repo_private_name
    $git_cmd checkout $configuration_private_version
    cd $base_dir
fi

cd $base_dir/$git_repo_name
sudo pip install -r requirements.txt

cd $playbook_dir

if [[ -r "$deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$deployment_secure_vars"
fi

if [[ -r "$environment_deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$environment_deployment_secure_vars"
fi

if $secure_vars_file; then
    extra_args_opts+=" -e@$secure_vars_file"
fi

extra_args_opts+=" -e@$extra_vars"

ansible-playbook -vvvv -c local -i "localhost," $play.yml $extra_args_opts
ansible-playbook -vvvv -c local -i "localhost," stop_all_edx_services.yml $extra_args_opts

rm -rf $base_dir
""".format(
        configuration_version=args.configuration_version,
        configuration_secure_version=args.configuration_secure_version,
        configuration_secure_repo=args.configuration_secure_repo,
        configuration_secure_repo_basename=os.path.basename(
            args.configuration_secure_repo),
        configuration_private_version=args.configuration_private_version,
        configuration_private_repo=args.configuration_private_repo,
        environment=args.environment,
        deployment=args.deployment,
        play=args.play,
        playbook_dir=args.playbook_dir,
        config_secure=config_secure,
        identity_contents=identity_contents,
        queue_name=run_id,
        extra_vars_yml=extra_vars_yml,
        git_refs_yml=git_refs_yml,
        secure_vars_file=secure_vars_file,
        cache_id=args.cache_id)

    # Root volume size is configurable; device name assumes the AMI's root
    # device is /dev/sda1.
    mapping = BlockDeviceMapping()
    root_vol = BlockDeviceType(size=args.root_vol_size)
    mapping['/dev/sda1'] = root_vol

    ec2_args = {
        'security_group_ids': [security_group_id],
        'subnet_id': subnet_id,
        'key_name': args.keypair,
        'image_id': base_ami,
        'instance_type': args.instance_type,
        'instance_profile_name': args.role_name,
        'user_data': user_data,
        'block_device_map': mapping,
    }

    return ec2_args
def setUp(self): self.block_device_mapping = BlockDeviceMapping()
def launch_cluster(conn, opts, cluster_name):
    """Launch an "ampcamp" cluster and return its instances.

    Creates/opens the fixed ``ampcamp-master``/``ampcamp-slaves``/
    ``ampcamp-zoo`` security groups, refuses to run when instances tagged
    with ``cluster == cluster_name`` are already active, launches slaves
    (spot or on-demand depending on ``opts.spot_price``) plus one master,
    and tags everything.

    :param conn: boto EC2 connection.
    :param opts: parsed command-line options.
    :param cluster_name: value stored in each instance's ``cluster`` tag.
    :returns: tuple ``(master_nodes, slave_nodes, zoo_nodes)``;
        ``zoo_nodes`` is always ``[]`` here.
    """
    print "Setting up security groups..."
    # NOTE: group names are fixed ("ampcamp-*"), not derived from
    # cluster_name — all clusters share these groups.
    master_group = get_or_make_group(conn, "ampcamp-master")
    slave_group = get_or_make_group(conn, "ampcamp-slaves")
    zoo_group = get_or_make_group(conn, "ampcamp-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            # Hadoop/Mesos web UIs are only opened for mesos clusters.
            master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
            master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
            master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
            master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
            slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
            slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
            slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        for instance in res.instances:
            if 'tags' in instance.__dict__ and 'cluster' in instance.tags:
                if instance.tags['cluster'] == cluster_name and is_active(
                        instance):
                    print >> stderr, (
                        "ERROR: Instances %s is already running in cluster %s"
                        % (instance.id, cluster_name))
                    sys.exit(1)

    # Resolve symbolic AMI names to a concrete AMI id.
    if opts.ami in ["latest", "standalone"]:
        opts.ami = get_ami(opts.ami)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until every request is "active"; no timeout or
        # cancellation on interrupt in this variant.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (
                    active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    # Create the right tags
    tags = {}
    tags['cluster'] = cluster_name
    tags['type'] = 'slave'
    for node in slave_nodes:
        conn.create_tags([node.id], tags)
    tags['type'] = 'master'
    for node in master_nodes:
        conn.create_tags([node.id], tags)

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_spot_request(conn, request, tenant, job):
    """Submit an EC2 spot-instance request for ``job`` in ``tenant``'s VPC and
    record it in the ``instance_request`` database table.

    Parameters:
        conn: boto EC2 connection used for the spot request.
        request: selected request descriptor (bid, ami, zone, count,
            instance_type, instance.db_id, price, AvgPrice).
        tenant: tenant descriptor (subnets, key_pair, security_group, db_id,
            subnets_db_id, name, vpc).
        job: job descriptor; ``job.cost_aware`` supplies the comparison
            request rows logged alongside the selected one.

    Returns:
        list of spot request ids on success; implicitly ``None`` when an
        ``EC2ResponseError`` is caught (callers must tolerate that).
    """
    try:
        logger.debug("%s = %s. tenants vpc = %s" % (request.zone, tenant.subnets[request.zone], tenant.vpc))
        # NOTE(review): all three comparison rows alias the same object,
        # job.cost_aware.  The column names below suggest drafts_req should
        # come from a DrAFTS model and drafts_avg from an average-price
        # model -- confirm these assignments are intentional.
        cost_aware_req = job.cost_aware
        drafts_req = job.cost_aware
        drafts_avg = job.cost_aware
        # Root volume of 10GB plus the four instance-store (ephemeral)
        # drives mapped to /dev/sdb../dev/sde.
        mapping = BlockDeviceMapping()
        sda1 = BlockDeviceType()
        eph0 = BlockDeviceType()
        eph1 = BlockDeviceType()
        eph2 = BlockDeviceType()
        eph3 = BlockDeviceType()
        sda1.size = 10
        eph0.ephemeral_name = 'ephemeral0'
        eph1.ephemeral_name = 'ephemeral1'
        eph2.ephemeral_name = 'ephemeral2'
        eph3.ephemeral_name = 'ephemeral3'
        mapping['/dev/sda1'] = sda1
        mapping['/dev/sdb'] = eph0
        mapping['/dev/sdc'] = eph1
        mapping['/dev/sdd'] = eph2
        mapping['/dev/sde'] = eph3
        inst_req = None
        # Fire the actual spot request; user_data comes from the tenant's
        # cloud-init template.
        inst_req = conn.request_spot_instances(
            price=request.bid,
            image_id=request.ami,
            subnet_id=tenant.subnets[request.zone],
            count=request.count,
            key_name=tenant.key_pair,
            security_group_ids=[tenant.security_group],
            instance_type=request.instance_type,
            user_data=customise_cloudinit(tenant, job),
            block_device_map=mapping)
        my_req_ids = [req.id for req in inst_req]
        # address = ""
        for req in my_req_ids:
            # Record per-request launch statistics before tagging.
            insert_launch_stats(req, request, tenant)
            # tag each request
            tag_requests(req, tenant.name, conn)
            # NOTE(review): SQL is assembled with %-interpolation.  The
            # values originate from internal objects, but this is still
            # injection-prone and unquoted -- consider a parameterized
            # execute() instead.
            ProvisionerConfig().dbconn.execute((
                "insert into instance_request (tenant, instance_type, " +
                "price, job_runner_id, request_type, request_id, " +
                "subnet, cost_aware_ins, cost_aware_bid, cost_aware_subnet," +
                " drafts_ins, drafts_bid, drafts_subnet, selected_avg_price,"
                " cost_aware_avg_price, drafts_avg_price, drafts_avg_ins, " +
                "drafts_avg_bid, drafts_avg_subnet, drafts_avg_avg_price) " +
                "values ('%s', '%s', %s, %s, '%s', '%s', %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            ) % (tenant.db_id, request.instance.db_id, request.price,
                 job.id, "spot", req, tenant.subnets_db_id[request.zone],
                 cost_aware_req.instance.db_id, cost_aware_req.bid,
                 tenant.subnets_db_id[cost_aware_req.zone],
                 drafts_req.instance.db_id, drafts_req.DrAFTS,
                 tenant.subnets_db_id[drafts_req.zone],
                 request.AvgPrice, cost_aware_req.AvgPrice,
                 drafts_req.AvgPrice, drafts_avg.instance.db_id,
                 drafts_avg.DrAFTS, tenant.subnets_db_id[drafts_avg.zone],
                 drafts_avg.AvgPrice))
        return my_req_ids
    except boto.exception.EC2ResponseError:
        # Best-effort: log and fall through, returning None.
        logger.exception("There was an error communicating with EC2.")
#print i.platform #print i.instance_type #print i.instance_profile print '-----' region_name = 'us-west-1' for r in boto.ec2.regions(): if r.name == region_name: break conn = boto.connect_ec2(region=r) #print conn.run_instances(image_id='ami-75287b30') #print conn.run_instances(image_id='ami-71287b34') mapping = BlockDeviceMapping() eph0 = BlockDeviceType() eph1 = BlockDeviceType() eph0.ephemeral_name = 'ephemeral0' eph1.ephemeral_name = 'ephemeral1' mapping['/dev/xvdc'] = eph0 mapping['/dev/xvdd'] = eph1 print conn.run_instances(image_id='ami-75287b30', instance_type='m1.medium', key_name='debian6', block_device_map=mapping) #print conn.terminate_instances(instance_ids=['i-8bd812d3']) #print sys.argv[1:]
def startElement(self, name, attrs, connection):
    """SAX-style parse hook: when a ``blockDeviceMapping`` element opens,
    create a fresh BlockDeviceMapping, stash it under the
    ``block_device_mapping`` key of ``self.attrs`` and hand it back so the
    parser descends into it.  Any other element returns None."""
    if name != 'blockDeviceMapping':
        return None
    bdm = BlockDeviceMapping()
    self.attrs['block_device_mapping'] = bdm
    return bdm
def create_instance_args():
    """
    Looks up security group, subnet and returns arguments to pass into
    ec2.run_instances() including user data

    Reads the module-level ``args`` namespace plus ``stack_name``,
    ``base_ami``, ``run_id``, ``extra_vars_yml`` and ``secure_vars_file``
    from the enclosing scope.  Exits the process if no matching subnet is
    found.  The user data is a self-contained bash bootstrap script that
    installs ansible prerequisites on the instance and runs the requested
    play locally.
    """
    # Locate the target subnet: first by CloudFormation stack tags, then by
    # the play/environment/deployment tag scheme used by non-CFN builds.
    vpc = boto.vpc.connect_to_region(args.region)
    subnet = vpc.get_all_subnets(
        filters={
            'tag:aws:cloudformation:stack-name': stack_name,
            'tag:play': args.play}
    )
    if len(subnet) < 1:
        #
        # try scheme for non-cloudformation builds
        #
        subnet = vpc.get_all_subnets(
            filters={
                'tag:play': args.play,
                'tag:environment': args.environment,
                'tag:deployment': args.deployment}
        )
    if len(subnet) < 1:
        sys.stderr.write("ERROR: Expected at least one subnet, got {} for {}-{}-{}\n".format(
            len(subnet), args.environment, args.deployment, args.play))
        sys.exit(1)
    subnet_id = subnet[0].id
    vpc_id = subnet[0].vpc_id
    security_group_id = get_instance_sec_group(vpc_id)
    # An identity file enables the "secure" git clone path inside user_data;
    # config_secure is interpolated into bash as the literal true/false.
    if args.identity:
        config_secure = 'true'
        with open(args.identity) as f:
            identity_contents = f.read()
    else:
        config_secure = 'false'
        identity_contents = "dummy"
    user_data = """#!/bin/bash
set -x
set -e
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
base_dir="/var/tmp/edx-cfg"
extra_vars="$base_dir/extra-vars-$$.yml"
secure_identity="$base_dir/secure-identity"
git_ssh="$base_dir/git_ssh.sh"
configuration_version="{configuration_version}"
configuration_secure_version="{configuration_secure_version}"
configuration_private_version="{configuration_private_version}"
configuration_internal_version="{configuration_internal_version}"
environment="{environment}"
deployment="{deployment}"
play="{play}"
cluster="{play}"
config_secure={config_secure}
git_repo_name="configuration"
git_repo="https://github.com/edx/$git_repo_name"
git_repo_secure="{configuration_secure_repo}"
git_repo_secure_name=$(basename $git_repo_secure .git)
git_repo_private="{configuration_private_repo}"
git_repo_private_name=$(basename $git_repo_private .git)
git_repo_internal="{configuration_internal_repo}"
git_repo_internal_name=$(basename $git_repo_internal .git)
secure_vars_file={secure_vars_file}
environment_deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{environment}-{deployment}.yml"
deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{deployment}.yml"
environment_deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{environment}-{deployment}.yml"
deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{deployment}.yml"
instance_id=\\
$(curl http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null)
instance_ip=\\
$(curl http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null)
instance_type=\\
$(curl http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null)
playbook_dir="$base_dir/{playbook_dir}"

if $config_secure; then
    git_cmd="env GIT_SSH=$git_ssh git"
else
    git_cmd="git"
fi

ANSIBLE_ENABLE_SQS=true
SQS_NAME={queue_name}
SQS_REGION={region}
SQS_MSG_PREFIX="[ $instance_id $instance_ip $environment-$deployment $play ]"
PYTHONUNBUFFERED=1
HIPCHAT_TOKEN={hipchat_token}
HIPCHAT_ROOM={hipchat_room}
HIPCHAT_MSG_PREFIX="$environment-$deployment-$play: "
HIPCHAT_FROM="ansible-$instance_id"
HIPCHAT_MSG_COLOR=$(echo -e "yellow\\ngreen\\npurple\\ngray" | shuf | head -1)
DATADOG_API_KEY={datadog_api_key}
# environment for ansible
export ANSIBLE_ENABLE_SQS SQS_NAME SQS_REGION SQS_MSG_PREFIX PYTHONUNBUFFERED
export HIPCHAT_TOKEN HIPCHAT_ROOM HIPCHAT_MSG_PREFIX HIPCHAT_FROM
export HIPCHAT_MSG_COLOR DATADOG_API_KEY

#################################### Lifted from ansible-bootstrap.sh
if [[ -z "$ANSIBLE_REPO" ]]; then
  ANSIBLE_REPO="https://github.com/edx/ansible.git"
fi

if [[ -z "$ANSIBLE_VERSION" ]]; then
  ANSIBLE_VERSION="master"
fi

if [[ -z "$CONFIGURATION_REPO" ]]; then
  CONFIGURATION_REPO="https://github.com/edx/configuration.git"
fi

if [[ -z "$CONFIGURATION_VERSION" ]]; then
  CONFIGURATION_VERSION="master"
fi

if [[ -z "$UPGRADE_OS" ]]; then
  UPGRADE_OS=false
fi

#
# Bootstrapping constants
#
VIRTUAL_ENV_VERSION="15.0.2"
PIP_VERSION="8.1.2"
SETUPTOOLS_VERSION="24.0.3"
EDX_PPA="deb http://ppa.edx.org precise main"
EDX_PPA_KEY_SERVER="keyserver.ubuntu.com"
EDX_PPA_KEY_ID="B41E5E3969464050"

cat << EOF
******************************************************************************

Running the abbey with the following arguments:

ANSIBLE_REPO="$ANSIBLE_REPO"
ANSIBLE_VERSION="$ANSIBLE_VERSION"
CONFIGURATION_REPO="$CONFIGURATION_REPO"
CONFIGURATION_VERSION="$CONFIGURATION_VERSION"

******************************************************************************
EOF

if [[ $(id -u) -ne 0 ]] ;then
  echo "Please run as root";
  exit 1;
fi

if grep -q 'Precise Pangolin' /etc/os-release
then
  SHORT_DIST="precise"
elif grep -q 'Trusty Tahr' /etc/os-release
then
  SHORT_DIST="trusty"
elif grep -q 'Xenial Xerus' /etc/os-release
then
  SHORT_DIST="xenial"
else
  cat << EOF
This script is only known to work on Ubuntu Precise, Trusty and Xenial,
exiting.  If you are interested in helping make installation possible
on other platforms, let us know.
EOF
  exit 1;
fi

EDX_PPA="deb http://ppa.edx.org $SHORT_DIST main"

# Upgrade the OS
apt-get update -y
apt-key update -y

if [ "$UPGRADE_OS" = true ]; then
  echo "Upgrading the OS..."
  apt-get upgrade -y
fi

# Required for add-apt-repository
apt-get install -y software-properties-common python-software-properties

# Add git PPA
add-apt-repository -y ppa:git-core/ppa

# For older distributions we need to install a PPA for Python 2.7.10
if [[ "precise" = "$SHORT_DIST" || "trusty" = "$SHORT_DIST" ]]; then
  # Add python PPA
  apt-key adv --keyserver "$EDX_PPA_KEY_SERVER" --recv-keys "$EDX_PPA_KEY_ID"
  add-apt-repository -y "$EDX_PPA"
fi

# Install python 2.7 latest, git and other common requirements
# NOTE: This will install the latest version of python 2.7 and
# which may differ from what is pinned in virtualenvironments
apt-get update -y

apt-get install -y python2.7 python2.7-dev python-pip python-apt python-yaml python-jinja2 build-essential sudo git-core libmysqlclient-dev libffi-dev libssl-dev

# Workaround for a 16.04 bug, need to upgrade to latest and then
# potentially downgrade to the preferred version.
# https://github.com/pypa/pip/issues/3862
if [[ "xenial" = "$SHORT_DIST" ]]; then
  pip install --upgrade pip
  pip install --upgrade pip=="$PIP_VERSION"
else
  pip install --upgrade pip=="$PIP_VERSION"
fi

# pip moves to /usr/local/bin when upgraded
hash -r   #pip may have moved from /usr/bin/ to /usr/local/bin/. This clears bash's path cache.
PATH=/usr/local/bin:$PATH

pip install setuptools=="$SETUPTOOLS_VERSION"
pip install virtualenv=="$VIRTUAL_ENV_VERSION"

##################### END Lifted from ansible-bootstrap.sh

# python3 is required for certain other things
# (currently xqwatcher so it can run python2 and 3 grader code,
# but potentially more in the future). It's not available on Ubuntu 12.04,
# but in those cases we don't need it anyways.
if [[ -n "$(apt-cache search --names-only '^python3-pip$')" ]]; then
  /usr/bin/apt-get update
  /usr/bin/apt-get install -y python3-pip python3-dev
fi

# this is missing on 14.04 (base package on 12.04)
# we need to do this on any build, since the above apt-get
# only runs on a build from scratch
/usr/bin/apt-get install -y python-httplib2 --force-yes

rm -rf $base_dir
mkdir -p $base_dir
cd $base_dir

cat << EOF > $git_ssh
#!/bin/sh
exec /usr/bin/ssh -o StrictHostKeyChecking=no -i "$secure_identity" "\$@"
EOF

chmod 755 $git_ssh

if $config_secure; then
  cat << EOF > $secure_identity
{identity_contents}
EOF
fi

cat << EOF >> $extra_vars
---
# extra vars passed into
# abbey.py including versions
# of all the repositories
{extra_vars_yml}

# abbey will always run fake migrations
# this is so that the application can come
# up healthy
fake_migrations: true

disable_edx_services: true
COMMON_TAG_EC2_INSTANCE: true

# abbey should never take instances in
# and out of elbs
elb_pre_post: false
EOF

chmod 400 $secure_identity

$git_cmd clone $git_repo $git_repo_name
cd $git_repo_name
$git_cmd checkout $configuration_version
cd $base_dir

if $config_secure; then
  $git_cmd clone $git_repo_secure $git_repo_secure_name
  cd $git_repo_secure_name
  $git_cmd checkout $configuration_secure_version
  cd $base_dir
fi

if [[ ! -z $git_repo_private ]]; then
  $git_cmd clone $git_repo_private $git_repo_private_name
  cd $git_repo_private_name
  $git_cmd checkout $configuration_private_version
  cd $base_dir
fi

if [[ ! -z $git_repo_internal ]]; then
  $git_cmd clone $git_repo_internal $git_repo_internal_name
  cd $git_repo_internal_name
  $git_cmd checkout $configuration_internal_version
  cd $base_dir
fi

cd $base_dir/$git_repo_name
sudo pip install -r pre-requirements.txt
sudo pip install -r requirements.txt

cd $playbook_dir

if [[ -r "$deployment_internal_vars" ]]; then
  extra_args_opts+=" -e@$deployment_internal_vars"
fi

if [[ -r "$environment_deployment_internal_vars" ]]; then
  extra_args_opts+=" -e@$environment_deployment_internal_vars"
fi

if [[ -r "$deployment_secure_vars" ]]; then
  extra_args_opts+=" -e@$deployment_secure_vars"
fi

if [[ -r "$environment_deployment_secure_vars" ]]; then
  extra_args_opts+=" -e@$environment_deployment_secure_vars"
fi

if $secure_vars_file; then
  extra_args_opts+=" -e@$secure_vars_file"
fi

extra_args_opts+=" -e@$extra_vars"

ansible-playbook -vvvv -c local -i "localhost," $play.yml $extra_args_opts
ansible-playbook -vvvv -c local -i "localhost," stop_all_edx_services.yml $extra_args_opts

rm -rf $base_dir
""".format(
        hipchat_token=args.hipchat_api_token,
        hipchat_room=args.ansible_hipchat_room_id,
        configuration_version=args.configuration_version,
        configuration_secure_version=args.configuration_secure_version,
        configuration_secure_repo=args.configuration_secure_repo,
        configuration_private_version=args.configuration_private_version,
        configuration_private_repo=args.configuration_private_repo,
        configuration_internal_version=args.configuration_internal_version,
        configuration_internal_repo=args.configuration_internal_repo,
        environment=args.environment,
        deployment=args.deployment,
        play=args.play,
        playbook_dir=args.playbook_dir,
        config_secure=config_secure,
        identity_contents=identity_contents,
        queue_name=run_id,
        extra_vars_yml=extra_vars_yml,
        secure_vars_file=secure_vars_file,
        cache_id=args.cache_id,
        datadog_api_key=args.datadog_api_key,
        region=args.region)

    # Single gp2 root volume at /dev/sda1 sized from the CLI argument.
    mapping = BlockDeviceMapping()
    root_vol = BlockDeviceType(size=args.root_vol_size, volume_type='gp2')
    mapping['/dev/sda1'] = root_vol

    # Keyword arguments for ec2.run_instances(); base_ami comes from the
    # enclosing scope.
    ec2_args = {
        'security_group_ids': [security_group_id],
        'subnet_id': subnet_id,
        'key_name': args.keypair,
        'image_id': base_ami,
        'instance_type': args.instance_type,
        'instance_profile_name': args.role_name,
        'user_data': user_data,
        'block_device_map': mapping,
    }

    return ec2_args
def launch_cluster(conn, opts, cluster_name):
    """Launch a Mesos/Hadoop-style cluster on EC2: set up (or reuse) the
    master/slave/zoo security groups, verify no cluster of this name is
    already running, then launch slave, master and optional ZooKeeper
    instances from the configured images.

    Returns a tuple ``(master_nodes, slave_nodes, zoo_nodes)`` of boto
    Instance objects.  Exits the process on any precondition failure.
    """
    # An SSH identity file and a key pair are both required for later setup.
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    print "Setting up security groups..."
    # With --one-security-group all three roles share one group; otherwise
    # each role gets its own "<cluster>-<role>" group.
    if opts.one_security_group:
        master_group = get_or_make_group(conn, cluster_name + "-group")
        master_group.owner_id = os.getenv('EC2_USER_ID')
        slave_group = master_group
        zoo_group = master_group
    else:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        master_group.owner_id = os.getenv('EC2_USER_ID')
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
        slave_group.owner_id = os.getenv('EC2_USER_ID')
        zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
        zoo_group.owner_id = os.getenv('EC2_USER_ID')
    # An empty rule list means the group was just created, so authorize the
    # full port set once (re-running against an existing group would raise).
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50031, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0')  #apache hama
        master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0')  #apache hama
        master_group.authorize('tcp', 8020, 8020, '0.0.0.0/0')  #hdfs HA nameservice
        master_group.authorize('tcp', 8485, 8485, '0.0.0.0/0')  #journal nodes
        master_group.authorize('tcp', 8023, 8023, '0.0.0.0/0')  #jt HA
        master_group.authorize('tcp', 8021, 8021, '0.0.0.0/0')  #jt HA
        master_group.authorize('tcp', 8018, 8019, '0.0.0.0/0')  #zkfc
        master_group.authorize('tcp', 2812, 2812, '0.0.0.0/0')  #monit web ui
        #If cohosted with zookeeper open necessary ports
        if opts.cohost:
            print "Opening additional ports for zookeeper... "
            master_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
            master_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
            master_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 80, 80, '0.0.0.0/0')  #Also needed 8649 and 8651 but check if only for master
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
        slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0')  ##apache hama web UI
        slave_group.authorize('tcp', 2812, 2812, '0.0.0.0/0')  #monit web ui
        slave_group.authorize('tcp', 31000, 32000, '0.0.0.0/0')  #task tracker web ui
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 8018, 8020, '0.0.0.0/0')  #hdfs HA nameservic
        zoo_group.authorize('tcp', 8485, 8485, '0.0.0.0/0')  #journal nodes
        zoo_group.authorize('tcp', 8023, 8023, '0.0.0.0/0')  #jt HA
        zoo_group.authorize('tcp', 2812, 2812, '0.0.0.0/0')  #monit web ui
    # Check if instances are already running in our groups
    # Grouped instances are instances that run on the same security group in order to allow communication
    # using private IPs and without DNS resolving
    existing_masters, existing_slaves, existing_zoos, existing_grouped = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master) or existing_grouped:
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.emi])[0]
    except:
        print >> stderr, "Could not find emi " + opts.emi
        sys.exit(1)
    try:
        image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
    except:
        print >> stderr, "Could not find emi " + opts.emi_master
        sys.exit(1)
    # Launch additional ZooKeeper nodes if required - ex: if mesos masters specified are 2 and the zoo_num=3 (default)
    if int(opts.ft) > 1:
        if(opts.cohost):
            zoo_num = str(int(opts.zoo_num) - int(opts.ft))  #extra zoo instances needed
        else:
            zoo_num = opts.zoo_num
    else:
        zoo_num = opts.zoo_num
    # NOTE(review): zoo_num can be a *string* here (str(...) above and
    # possibly opts.zoo_num); "zoo_num > 0" relies on Python 2's
    # str-vs-int ordering, which is always True for non-empty strings --
    # confirm this comparison behaves as intended.
    if (zoo_num > 0):
        if opts.emi_zoo == "":
            emi_zoo = opts.emi_master
        else:
            emi_zoo = opts.emi_zoo
        try:
            image_zoo = conn.get_all_images(image_ids=[emi_zoo])[0]
        except:
            print >> stderr, "Could not find emi " + emi_zoo
            sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    logging.debug("Calling boto BlockDeviceMapping()...")
    block_map = BlockDeviceMapping()
    logging.debug(" Printing block_map..")
    #print block_map
    if opts.ebs_vol_size > 0:
        logging.debug("Calling boto EBSBlockDeviceType()...")
        device = EBSBlockDeviceType()
        #print "device: ", device
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        device.ephemeral_name = "ephemeral0"
        #block_map["/dev/sdv"] = device
        #block_map["/dev/sdv"] = device
        block_map["/dev/vdb"] = device
    # Optional user data file is read whole into opts.user_data.
    if opts.user_data_file != None:
        user_data_file = open(opts.user_data_file)
        try:
            opts.user_data = user_data_file.read()
            #print "user data (encoded) = ", opts.user_data
        finally:
            user_data_file.close()
    # Launch non-spot instances
    # Slaves are spread across the available zones via get_partition().
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
        num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
        if num_slaves_this_zone > 0:
            slave_res = image.run(key_name = opts.key_pair,
                                  security_groups = [slave_group],
                                  instance_type = opts.instance_type,
                                  placement = zone,
                                  min_count = num_slaves_this_zone,
                                  max_count = num_slaves_this_zone,
                                  block_device_map = block_map,
                                  user_data = opts.user_data)
            slave_nodes += slave_res.instances
            print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
        i += 1
    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        print "Running " + opts.ft + " masters"
        master_res = image_master.run(key_name = opts.key_pair,
                                      security_groups = [master_group],
                                      instance_type = master_type,
                                      placement = opts.zone,
                                      min_count = opts.ft,
                                      max_count = opts.ft,
                                      block_device_map = block_map,
                                      user_data = opts.user_data)
        master_nodes = master_res.instances
        # NOTE(review): ``zone`` here is the leftover loop variable from the
        # slave loop above (masters were placed in opts.zone) -- the printed
        # zone may be wrong; confirm.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)
    if(zoo_num > 0):
        print "Running additional " + zoo_num + " zookeepers"
        zoo_res = image_zoo.run(key_name = opts.key_pair,
                                security_groups = [zoo_group],
                                instance_type = opts.instance_type,
                                placement = opts.zone,
                                min_count = zoo_num,
                                max_count = zoo_num,
                                block_device_map = block_map,
                                user_data = opts.user_data)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []
        if (opts.cohost):
            print "Zookeepers are co-hosted on mesos instances..."
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Spark cluster on EC2: create/reuse the master and slave
    security groups, launch slaves (spot or on-demand), launch or resume
    the master, and tag every instance with a descriptive Name.

    Returns a tuple ``(master_nodes, slave_nodes)`` of boto Instance
    objects.  Exits the process on any precondition failure, or with
    status 0 after cancelling unfulfilled spot requests.
    """
    # Both an SSH identity file and a key pair are mandatory for later setup.
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    # An empty rule list means the group was freshly created; authorize the
    # Spark/Hadoop port set exactly once.
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                            die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)
    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        # One spot request batch per zone, sized by get_partition().
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1
        print "Waiting for spot instances to be granted..."
        try:
            # Poll every 10s until all requests are active; on any exception
            # (including KeyboardInterrupt) cancel the outstanding requests.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1
    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map)
        master_nodes = master_res.instances
        # NOTE(review): ``zone`` is the leftover slave-loop variable; the
        # master was actually placed in opts.zone -- printed zone may be
        # misleading.  Confirm before relying on this output.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)
    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
    # Return all the instances
    return (master_nodes, slave_nodes)
def start_node():
    """Launch (or spot-request) one SLURM compute node named by argv[1].

    Builds the run_instances/request_spot_instances keyword set from the
    cluster configuration: image, instance profile, key pair, a rendered
    cloud-init script, a fixed-IP network interface, and a block device
    map (32GB gp2 root plus the instance type's ephemeral drives).

    Returns 0 on success, 1 on usage or EC2-connection errors.
    """
    start_logging()
    print(" ".join(argv))
    if len(argv) != 2:
        print("Usage: %s <nodename>" % (argv[0], ), file=sys.stderr)
        return 1
    nodename = argv[1]
    cc = ClusterConfiguration.from_config()
    region = get_region()
    ec2 = boto.ec2.connect_to_region(region)
    if not ec2:
        print("Could not connect to EC2 endpoint in region %r" % (region, ),
              file=sys.stderr)
        return 1
    # kw accumulates the keyword arguments shared by both launch paths.
    kw = {}
    slurm_s3_root = cc.slurm_s3_root
    # Fall back to the stock Amazon Linux AMI for this region when no
    # compute AMI is configured.
    kw['image_id'] = (cc.compute_ami if cc.compute_ami is not None
                      else amazon_linux_ami[region])
    if cc.instance_profile is not None:
        # Accept either a full ARN or a bare profile name.
        if cc.instance_profile.startswith("arn:"):
            kw['instance_profile_arn'] = cc.instance_profile
        else:
            kw['instance_profile_name'] = cc.instance_profile
    kw['key_name'] = cc.key_name
    kw['instance_type'] = cc.compute_instance_type
    if cc.compute_bid_price is not None:
        # Spot path: bid expires 24h from now.
        end = time() + 24 * 60 * 60  # FIXME: Don't hardcode this.
        kw['price'] = cc.compute_bid_price
        kw['valid_until'] = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime(end))
    node_address = cc.get_address_for_nodename(nodename)
    node_subnet = cc.get_subnet_for_address(node_address)
    # Render the per-node bootstrap script and pass it base64-encoded.
    user_data = init_script % {
        "region": region,
        "nodename": nodename,
        "os_packages": " ".join(cc.compute_os_packages
                                if cc.compute_os_packages is not None else []),
        "external_packages": " ".join(cc.compute_external_packages
                                      if cc.compute_external_packages is not None
                                      else []),
        "slurm_ec2_conf": cc.slurm_ec2_configuration,
        "slurm_s3_root": slurm_s3_root,
    }
    user_data = b64encode(user_data)
    kw['user_data'] = user_data
    # Map the ethernet interface to the correct IP address
    eth0 = NetworkInterfaceSpecification(
        associate_public_ip_address=True,
        delete_on_termination=True,
        device_index=0,
        groups=cc.security_groups,
        private_ip_address=str(node_address),
        subnet_id=node_subnet.id)
    kw['network_interfaces'] = NetworkInterfaceCollection(eth0)
    # Attach any ephemeral storage devices
    block_device_map = BlockDeviceMapping()
    block_device_map['/dev/xvda'] = BlockDeviceType(size=32, volume_type="gp2")
    devices = cc.ephemeral_stores[cc.compute_instance_type]
    # Ephemeral drives are mapped to /dev/sdb, /dev/sdc, ... in order.
    for i, device in enumerate(devices):
        drive = "/dev/sd" + chr(ord('b') + i)
        block_device_map[drive] = BlockDeviceType(
            ephemeral_name="ephemeral%d" % i)
    kw['block_device_map'] = block_device_map
    if cc.compute_bid_price is None:
        # On-demand path: launch immediately and tag the instances.
        print("run_instances: %r" % kw)
        reservation = ec2.run_instances(**kw)
        tags = {
            'SLURMHostname': nodename,
            'SLURMS3Root': slurm_s3_root,
            'Name': "SLURM Computation Node %s" % nodename,
        }
        print("instances: %s" %
              " ".join([instance.id for instance in reservation.instances]))
        # create-tags can fail at times since the tag resource database is
        # a bit behind EC2's actual state.
        for i in xrange(10):
            try:
                ec2.create_tags(
                    [instance.id for instance in reservation.instances], tags)
                break
            except Exception as e:
                # Retry with a linearly growing back-off (0s, 0.5s, 1.0s, ...).
                print("Failed to tag instance: %s" % e, file=sys.stderr)
                sleep(0.5 * i)
    else:
        # Spot path: submit the request; no tagging here since instances do
        # not exist yet.
        print("request_spot_instances: %r" % kw, file=sys.stderr)
        requests = ec2.request_spot_instances(**kw)
        print("requests: %s" % " ".join([request.id for request in requests]))
    return 0
def create_launch_config(connection, module):
    """Ansible module helper: ensure an autoscaling launch configuration
    with the requested name exists, then exit the module with its state.

    Parameters:
        connection: boto autoscale connection.
        module: AnsibleModule carrying the task parameters.

    Never returns normally -- finishes via module.exit_json() (or
    module.fail_json() on error), reporting ``changed`` only when the
    launch configuration had to be created.
    """
    name = module.params.get('name')
    image_id = module.params.get('image_id')
    key_name = module.params.get('key_name')
    security_groups = module.params['security_groups']
    user_data = module.params.get('user_data')
    user_data_path = module.params.get('user_data_path')
    volumes = module.params['volumes']
    instance_type = module.params.get('instance_type')
    spot_price = module.params.get('spot_price')
    instance_monitoring = module.params.get('instance_monitoring')
    assign_public_ip = module.params.get('assign_public_ip')
    kernel_id = module.params.get('kernel_id')
    ramdisk_id = module.params.get('ramdisk_id')
    instance_profile_name = module.params.get('instance_profile_name')
    ebs_optimized = module.params.get('ebs_optimized')
    classic_link_vpc_id = module.params.get('classic_link_vpc_id')
    classic_link_vpc_security_groups = module.params.get(
        'classic_link_vpc_security_groups')
    bdm = BlockDeviceMapping()
    # A user_data file on disk overrides the inline user_data parameter.
    if user_data_path:
        try:
            with open(user_data_path, 'r') as user_data_file:
                user_data = user_data_file.read()
        except IOError as e:
            module.fail_json(msg=str(e), exception=traceback.format_exc())
    if volumes:
        for volume in volumes:
            if 'device_name' not in volume:
                module.fail_json(msg='Device name must be set for volume')
            # Minimum volume size is 1GB. We'll use volume size explicitly set
            # to 0 to be a signal not to create this volume
            if 'volume_size' not in volume or int(volume['volume_size']) > 0:
                bdm[volume['device_name']] = create_block_device(
                    module, volume)
    lc = LaunchConfiguration(
        name=name,
        image_id=image_id,
        key_name=key_name,
        security_groups=security_groups,
        user_data=user_data,
        block_device_mappings=[bdm],
        instance_type=instance_type,
        kernel_id=kernel_id,
        spot_price=spot_price,
        instance_monitoring=instance_monitoring,
        associate_public_ip_address=assign_public_ip,
        ramdisk_id=ramdisk_id,
        instance_profile_name=instance_profile_name,
        ebs_optimized=ebs_optimized,
        classic_link_vpc_security_groups=classic_link_vpc_security_groups,
        classic_link_vpc_id=classic_link_vpc_id,
    )
    launch_configs = connection.get_all_launch_configurations(names=[name])
    changed = False
    if not launch_configs:
        # Not found -> create it, then re-fetch so the result reflects the
        # server-side state (ARN, created_time, ...).
        try:
            connection.create_launch_configuration(lc)
            launch_configs = connection.get_all_launch_configurations(
                names=[name])
            changed = True
        except BotoServerError as e:
            module.fail_json(msg=str(e))
    # Copy all public attributes except the ones needing special handling.
    result = dict(
        ((a[0], a[1]) for a in vars(launch_configs[0]).items()
         if a[0] not in ('connection', 'created_time', 'instance_monitoring',
                         'block_device_mappings')))
    result['created_time'] = str(launch_configs[0].created_time)
    # Looking at boto's launchconfig.py, it looks like this could be a boolean
    # value or an object with an enabled attribute. The enabled attribute
    # could be a boolean or a string representation of a boolean. Since
    # I can't test all permutations myself to see if my reading of the code is
    # correct, have to code this *very* defensively
    if launch_configs[0].instance_monitoring is True:
        result['instance_monitoring'] = True
    else:
        try:
            result['instance_monitoring'] = module.boolean(
                launch_configs[0].instance_monitoring.enabled)
        except AttributeError:
            result['instance_monitoring'] = False
    if launch_configs[0].block_device_mappings is not None:
        # Flatten the BDM objects into plain dicts for JSON output.
        result['block_device_mappings'] = []
        for bdm in launch_configs[0].block_device_mappings:
            result['block_device_mappings'].append(
                dict(device_name=bdm.device_name,
                     virtual_name=bdm.virtual_name))
            if bdm.ebs is not None:
                result['block_device_mappings'][-1]['ebs'] = dict(
                    snapshot_id=bdm.ebs.snapshot_id,
                    volume_size=bdm.ebs.volume_size)
    if user_data_path:
        result['user_data'] = "hidden"  # Otherwise, we dump binary to the user's terminal
    module.exit_json(changed=changed,
                     name=result['name'],
                     created_time=result['created_time'],
                     image_id=result['image_id'],
                     arn=result['launch_configuration_arn'],
                     security_groups=result['security_groups'],
                     instance_type=result['instance_type'],
                     result=result)
def test_create_launch_configuration_with_block_device_mappings():
    """A launch configuration created with a block device mapping should
    round-trip every drive flavour (ephemeral, snapshot-backed and
    provisioned-IOPS EBS) through the autoscale API."""

    def make_drive(**attrs):
        # Small factory: build a BlockDeviceType and set the given attributes.
        drive = BlockDeviceType()
        for attr, value in attrs.items():
            setattr(drive, attr, value)
        return drive

    block_device_mapping = BlockDeviceMapping()
    block_device_mapping['/dev/xvdb'] = make_drive(ephemeral_name='ephemeral0')
    block_device_mapping['/dev/xvdp'] = make_drive(snapshot_id="snap-1234abcd",
                                                   volume_type="standard")
    block_device_mapping['/dev/xvdh'] = make_drive(volume_type="io1",
                                                   size=100,
                                                   iops=1000,
                                                   delete_on_termination=False)

    conn = boto.connect_autoscale(use_block_device_types=True)
    conn.create_launch_configuration(LaunchConfiguration(
        name='tester',
        image_id='ami-abcd1234',
        instance_type='m1.small',
        key_name='the_keys',
        security_groups=["default", "default2"],
        user_data=b"This is some user_data",
        instance_monitoring=True,
        instance_profile_name='arn:aws:iam::123456789012:instance-profile/testing',
        spot_price=0.1,
        block_device_mappings=[block_device_mapping]))

    launch_config = conn.get_all_launch_configurations()[0]
    launch_config.name.should.equal('tester')
    launch_config.image_id.should.equal('ami-abcd1234')
    launch_config.instance_type.should.equal('m1.small')
    launch_config.key_name.should.equal('the_keys')
    set(launch_config.security_groups).should.equal(
        set(['default', 'default2']))
    launch_config.user_data.should.equal(b"This is some user_data")
    # moto reports monitoring via an object with a string 'enabled' attribute.
    launch_config.instance_monitoring.enabled.should.equal('true')
    launch_config.instance_profile_name.should.equal(
        'arn:aws:iam::123456789012:instance-profile/testing')
    launch_config.spot_price.should.equal(0.1)

    returned_mapping = launch_config.block_device_mappings
    len(returned_mapping).should.equal(3)
    set(returned_mapping.keys()).should.equal(
        set(['/dev/xvdb', '/dev/xvdp', '/dev/xvdh']))
    returned_mapping['/dev/xvdh'].iops.should.equal(1000)
    returned_mapping['/dev/xvdh'].size.should.equal(100)
    returned_mapping['/dev/xvdh'].volume_type.should.equal("io1")
    returned_mapping['/dev/xvdh'].delete_on_termination.should.be.false
    returned_mapping['/dev/xvdp'].snapshot_id.should.equal("snap-1234abcd")
    returned_mapping['/dev/xvdp'].volume_type.should.equal("standard")
    returned_mapping['/dev/xvdb'].ephemeral_name.should.equal('ephemeral0')
def test_create_launch_configuration_with_block_device_mappings():
    """Fetching a launch configuration created with a block device mapping
    must return all three drive flavours: an ephemeral disk, a
    snapshot-backed volume, and a provisioned-IOPS EBS volume."""
    ephemeral = BlockDeviceType()
    ephemeral.ephemeral_name = "ephemeral0"

    from_snapshot = BlockDeviceType()
    from_snapshot.snapshot_id = "snap-1234abcd"
    from_snapshot.volume_type = "standard"

    provisioned = BlockDeviceType()
    provisioned.volume_type = "io1"
    provisioned.size = 100
    provisioned.iops = 1000
    provisioned.delete_on_termination = False

    bdm = BlockDeviceMapping()
    bdm["/dev/xvdb"] = ephemeral
    bdm["/dev/xvdp"] = from_snapshot
    bdm["/dev/xvdh"] = provisioned

    profile_arn = "arn:aws:iam::{}:instance-profile/testing".format(ACCOUNT_ID)
    conn = boto.connect_autoscale(use_block_device_types=True)
    config = LaunchConfiguration(
        name="tester",
        image_id="ami-abcd1234",
        instance_type="m1.small",
        key_name="the_keys",
        security_groups=["default", "default2"],
        user_data=b"This is some user_data",
        instance_monitoring=True,
        instance_profile_name=profile_arn,
        spot_price=0.1,
        block_device_mappings=[bdm],
    )
    conn.create_launch_configuration(config)

    fetched = conn.get_all_launch_configurations()[0]
    fetched.name.should.equal("tester")
    fetched.image_id.should.equal("ami-abcd1234")
    fetched.instance_type.should.equal("m1.small")
    fetched.key_name.should.equal("the_keys")
    set(fetched.security_groups).should.equal(set(["default", "default2"]))
    fetched.user_data.should.equal(b"This is some user_data")
    # moto reports monitoring via an object with a string 'enabled' attribute.
    fetched.instance_monitoring.enabled.should.equal("true")
    fetched.instance_profile_name.should.equal(profile_arn)
    fetched.spot_price.should.equal(0.1)

    mapping = fetched.block_device_mappings
    len(mapping).should.equal(3)
    set(mapping.keys()).should.equal(
        set(["/dev/xvdb", "/dev/xvdp", "/dev/xvdh"]))
    mapping["/dev/xvdh"].iops.should.equal(1000)
    mapping["/dev/xvdh"].size.should.equal(100)
    mapping["/dev/xvdh"].volume_type.should.equal("io1")
    mapping["/dev/xvdh"].delete_on_termination.should.be.false
    mapping["/dev/xvdp"].snapshot_id.should.equal("snap-1234abcd")
    mapping["/dev/xvdp"].volume_type.should.equal("standard")
    mapping["/dev/xvdb"].ephemeral_name.should.equal("ephemeral0")
def spawn_worker_instance():
    """Validate the spawn-worker form, upload the user's SSH public key to
    S3, launch a telemetry-analysis EC2 worker for the current user, tag it,
    and email the user a monitoring link.

    Returns the form page (via ``get_worker_params``) with per-field errors
    on validation failure, otherwise a redirect to the monitor page.
    """
    # Check that the user logged in is also authorized to do this
    if not current_user.is_authorized():
        return login_manager.unauthorized()

    errors = {}

    # Check required fields
    for f in ['name', 'token']:
        val = request.form[f]
        if val is None or val.strip() == '':
            errors[f] = "This field is required"

    # Check required file.
    # BUG FIX: this error used to be filed under the unrelated
    # 'code-tarball' key (copy-paste from another view), so it never
    # showed up next to the public-ssh-key field on the form.
    if not request.files['public-ssh-key']:
        errors['public-ssh-key'] = "Public key file is required"

    # Bug 961200: Check that a proper OpenSSH public key was uploaded.
    # It should start with "ssh-rsa AAAAB3"
    pubkey = request.files['public-ssh-key'].read()
    if not validate_public_key(pubkey):
        errors[
            'public-ssh-key'] = "Supplied file does not appear to be a valid OpenSSH public key."

    if errors:
        # Re-render the form with the collected per-field errors.
        return get_worker_params(errors, request.form)

    # Upload s3 key to bucket, keyed by the launch token so the boot
    # script can fetch it later.
    sshkey = bucket.new_key("keys/%s.pub" % request.form['token'])
    sshkey.set_contents_from_string(pubkey)

    ephemeral = app.config.get("EPHEMERAL_MAP", None)

    # Render the boot script that the instance runs on first start.
    boot_script = render_template(
        'boot-script.sh',
        aws_region=app.config['AWS_REGION'],
        temporary_bucket=app.config['TEMPORARY_BUCKET'],
        ssh_key=sshkey.key,
        ephemeral_map=ephemeral)

    # Only build a block-device mapping when ephemeral drives are configured.
    mapping = None
    if ephemeral:
        mapping = BlockDeviceMapping()
        for device, eph_name in ephemeral.iteritems():
            mapping[device] = BlockDeviceType(ephemeral_name=eph_name)

    # Create EC2 instance; client_token makes the launch idempotent per form token.
    reservation = ec2.run_instances(
        image_id='ami-eb4608db',  # ubuntu/images/hvm/ubuntu-utopic-14.10-amd64-server-20141022.3
        security_groups=app.config['SECURITY_GROUPS'],
        user_data=boot_script,
        block_device_map=mapping,
        instance_type=app.config['INSTANCE_TYPE'],
        instance_initiated_shutdown_behavior='terminate',
        client_token=request.form['token'],
        instance_profile_name=app.config['INSTANCE_PROFILE'])
    instance = reservation.instances[0]

    # Associate a few tags
    ec2.create_tags(
        [instance.id], {
            "Owner": current_user.email,
            "Name": request.form['name'],
            "Application": app.config['INSTANCE_APP_TAG']
        })

    # Send an email to the user who launched it
    params = {
        'monitoring_url': abs_url_for('monitor', instance_id=instance.id)
    }
    ses.send_email(
        source=app.config['EMAIL_SOURCE'],
        subject=("telemetry-analysis worker instance: %s (%s) launched" %
                 (request.form['name'], instance.id)),
        format='html',
        body=render_template('instance-launched-email.html', **params),
        to_addresses=[current_user.email])
    return redirect(url_for('monitor', instance_id=instance.id))
def launch_cluster(conn, opts, cluster_name):
    """Launch a cluster of one master and ``opts.slaves`` slave instances.

    Sets up the ``<cluster>-master`` / ``<cluster>-slaves`` security groups,
    resolves a default Ubuntu AMI per region, builds the EBS/ephemeral
    block-device mapping, launches the slaves (spot or on-demand), launches
    or resumes the master, and tags every instance.

    :param conn: boto EC2 connection.
    :param opts: parsed command-line options (identity file, key pair,
        region, AMI, instance types, EBS settings, spot price, zone, tags...).
    :param cluster_name: prefix used for group names and instance Name tags.
    :return: tuple ``(master_nodes, slave_nodes)`` of boto Instance objects.
    """
    # Both an identity file and a key pair are mandatory for later ssh access.
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)
    user_data_content = None
    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        # Allow all intra-cluster traffic, plus ssh from the authorized CIDR.
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Use the default Ubuntu AMI.
    if opts.ami is None:
        if opts.region == "us-east-1":
            opts.ami = "ami-2d39803a"
        elif opts.region == "us-west-1":
            opts.ami = "ami-06116566"
        elif opts.region == "us-west-2":
            opts.ami = "ami-9abea4fb"
        elif opts.region == "eu-west-1":
            opts.ami = "ami-f95ef58a"
        elif opts.region == "eu-central-1":
            opts.ami = "ami-87564feb"
        elif opts.region == "ap-northeast-1":
            opts.ami = "ami-a21529cc"
        elif opts.region == "ap-northeast-2":
            opts.ami = "ami-09dc1267"
        elif opts.region == "ap-southeast-1":
            opts.ami = "ami-25c00c46"
        elif opts.region == "ap-southeast-2":
            opts.ami = "ami-6c14310f"
        elif opts.region == "ap-south-1":
            opts.ami = "ami-4a90fa25"
        elif opts.region == "sa-east-1":
            opts.ami = "ami-0fb83963"
        else:
            raise Exception("The specified region is unknown.")

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [
            sg.id for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:  # NOTE(review): bare except also hides credential/network errors
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        # Spread the requested slave count across the availability zones.
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            # Poll every 10s until every request is active; the bare except
            # below deliberately catches Ctrl-C so the requests get canceled.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(
                        active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" %
                          (len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.
                    instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print(
                    "Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                        s=num_slaves_this_zone,
                        plural_s=('' if num_slaves_this_zone == 1 else 's'),
                        z=zone,
                        r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.
            instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)
        master_nodes = master_res.instances
        # NOTE(review): 'zone' here is the leftover loop variable from the
        # slave launch above, not opts.zone where the master was placed —
        # the printed zone can be wrong. Confirm intended.
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        # opts.additional_tags is a comma-separated list of "key:value" pairs.
        additional_tags = dict(
            map(str.strip, tag.split(':', 1))
            for tag in opts.additional_tags.split(','))

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)))
    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)))

    # Return all the instances
    return (master_nodes, slave_nodes)
def create_instance_args():
    """
    Looks up security group, subnet
    and returns arguments to pass into
    ec2.run_instances() including
    user data

    Reads the module-level ``args`` namespace plus ``stack_name``,
    ``base_ami``, ``run_id``, ``extra_vars_yml`` and ``secure_vars_file``
    globals (all referenced below). The user data is a bash bootstrap
    script that clones the configuration repos and runs ansible locally
    on the new instance.
    """
    vpc = boto.vpc.connect_to_region(args.region)
    # Prefer subnets tagged by the cloudformation stack for this play.
    subnet = vpc.get_all_subnets(filters={
        'tag:aws:cloudformation:stack-name': stack_name,
        'tag:play': args.play
    })
    if len(subnet) < 1:
        #
        # try scheme for non-cloudformation builds
        #
        subnet = vpc.get_all_subnets(
            filters={
                'tag:play': args.play,
                'tag:environment': args.environment,
                'tag:deployment': args.deployment
            })
    if len(subnet) < 1:
        sys.stderr.write(
            "ERROR: Expected at least one subnet, got {} for {}-{}-{}\n".
            format(len(subnet), args.environment, args.deployment, args.play))
        sys.exit(1)
    subnet_id = subnet[0].id
    vpc_id = subnet[0].vpc_id

    security_group_id = get_instance_sec_group(vpc_id)

    # If an identity file was supplied, its contents are embedded in the
    # user-data so the instance can clone the secure repo over ssh.
    if args.identity:
        config_secure = 'true'
        with open(args.identity) as f:
            identity_contents = f.read()
    else:
        config_secure = 'false'
        identity_contents = "dummy"

    # NOTE: the braces below are str.format placeholders; literal shell
    # expansion uses $vars, and backslashes are doubled for Python.
    user_data = """#!/bin/bash
set -x
set -e
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
base_dir="/var/tmp/edx-cfg"
extra_vars="$base_dir/extra-vars-$$.yml"
secure_identity="$base_dir/secure-identity"
git_ssh="$base_dir/git_ssh.sh"
configuration_version="{configuration_version}"
configuration_secure_version="{configuration_secure_version}"
configuration_private_version="{configuration_private_version}"
configuration_internal_version="{configuration_internal_version}"
environment="{environment}"
deployment="{deployment}"
play="{play}"
cluster="{play}"
config_secure={config_secure}
git_repo_name="configuration"
git_repo="https://github.com/edx/$git_repo_name"
git_repo_secure="{configuration_secure_repo}"
git_repo_secure_name=$(basename $git_repo_secure .git)
git_repo_private="{configuration_private_repo}"
git_repo_private_name=$(basename $git_repo_private .git)
git_repo_internal="{configuration_internal_repo}"
git_repo_internal_name=$(basename $git_repo_internal .git)
secure_vars_file={secure_vars_file}
environment_deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{environment}-{deployment}.yml"
deployment_secure_vars="$base_dir/$git_repo_secure_name/ansible/vars/{deployment}.yml"
environment_deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{environment}-{deployment}.yml"
deployment_internal_vars="$base_dir/$git_repo_internal_name/ansible/vars/{deployment}.yml"
instance_id=\\
$(curl http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null)
instance_ip=\\
$(curl http://169.254.169.254/latest/meta-data/local-ipv4 2>/dev/null)
instance_type=\\
$(curl http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null)
playbook_dir="$base_dir/{playbook_dir}"

if $config_secure; then
    git_cmd="env GIT_SSH=$git_ssh git"
else
    git_cmd="git"
fi

ANSIBLE_ENABLE_SQS=true
SQS_NAME={queue_name}
SQS_REGION={region}
SQS_MSG_PREFIX="[ $instance_id $instance_ip $environment-$deployment $play ]"
PYTHONUNBUFFERED=1
HIPCHAT_TOKEN={hipchat_token}
HIPCHAT_ROOM={hipchat_room}
HIPCHAT_MSG_PREFIX="$environment-$deployment-$play: "
HIPCHAT_FROM="ansible-$instance_id"
HIPCHAT_MSG_COLOR=$(echo -e "yellow\\ngreen\\npurple\\ngray" | shuf | head -1)
DATADOG_API_KEY={datadog_api_key}
# environment for ansible
export ANSIBLE_ENABLE_SQS SQS_NAME SQS_REGION SQS_MSG_PREFIX PYTHONUNBUFFERED
export HIPCHAT_TOKEN HIPCHAT_ROOM HIPCHAT_MSG_PREFIX HIPCHAT_FROM
export HIPCHAT_MSG_COLOR DATADOG_API_KEY

if [[ ! -x /usr/bin/git || ! -x /usr/bin/pip ]]; then
    echo "Installing pkg dependencies"
    /usr/bin/apt-get update
    /usr/bin/apt-get install -y git python-pip python-apt \\
        git-core build-essential python-dev libxml2-dev \\
        libxslt-dev curl libmysqlclient-dev --force-yes
fi

# python3 is required for certain other things
# (currently xqwatcher so it can run python2 and 3 grader code,
# but potentially more in the future). It's not available on Ubuntu 12.04,
# but in those cases we don't need it anyways.
if [[ -n "$(apt-cache search --names-only '^python3-pip$')" ]]; then
    /usr/bin/apt-get update
    /usr/bin/apt-get install -y python3-pip python3-dev
fi

# this is missing on 14.04 (base package on 12.04)
# we need to do this on any build, since the above apt-get
# only runs on a build from scratch
/usr/bin/apt-get install -y python-httplib2 --force-yes

# Must upgrade to latest before pinning to work around bug
# https://github.com/pypa/pip/issues/3862
pip install --upgrade pip
hash -r   #pip may have moved from /usr/bin/ to /usr/local/bin/. This clears bash's path cache.
pip install --upgrade pip==8.1.2

# upgrade setuptools early to avoid no distribution errors
pip install --upgrade setuptools==24.0.3

rm -rf $base_dir
mkdir -p $base_dir
cd $base_dir

cat << EOF > $git_ssh
#!/bin/sh
exec /usr/bin/ssh -o StrictHostKeyChecking=no -i "$secure_identity" "\$@"
EOF

chmod 755 $git_ssh

if $config_secure; then
    cat << EOF > $secure_identity
{identity_contents}
EOF
fi

cat << EOF >> $extra_vars
---
# extra vars passed into
# abbey.py including versions
# of all the repositories
{extra_vars_yml}

# abbey will always run fake migrations
# this is so that the application can come
# up healthy
fake_migrations: true

disable_edx_services: true
COMMON_TAG_EC2_INSTANCE: true

# abbey should never take instances in
# and out of elbs
elb_pre_post: false
EOF

chmod 400 $secure_identity

$git_cmd clone $git_repo $git_repo_name
cd $git_repo_name
$git_cmd checkout $configuration_version
cd $base_dir

if $config_secure; then
    $git_cmd clone $git_repo_secure $git_repo_secure_name
    cd $git_repo_secure_name
    $git_cmd checkout $configuration_secure_version
    cd $base_dir
fi

if [[ ! -z $git_repo_private ]]; then
    $git_cmd clone $git_repo_private $git_repo_private_name
    cd $git_repo_private_name
    $git_cmd checkout $configuration_private_version
    cd $base_dir
fi

if [[ ! -z $git_repo_internal ]]; then
    $git_cmd clone $git_repo_internal $git_repo_internal_name
    cd $git_repo_internal_name
    $git_cmd checkout $configuration_internal_version
    cd $base_dir
fi

cd $base_dir/$git_repo_name
sudo pip install -r pre-requirements.txt
sudo pip install -r requirements.txt

cd $playbook_dir

if [[ -r "$deployment_internal_vars" ]]; then
    extra_args_opts+=" -e@$deployment_internal_vars"
fi

if [[ -r "$environment_deployment_internal_vars" ]]; then
    extra_args_opts+=" -e@$environment_deployment_internal_vars"
fi

if [[ -r "$deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$deployment_secure_vars"
fi

if [[ -r "$environment_deployment_secure_vars" ]]; then
    extra_args_opts+=" -e@$environment_deployment_secure_vars"
fi

if $secure_vars_file; then
    extra_args_opts+=" -e@$secure_vars_file"
fi

extra_args_opts+=" -e@$extra_vars"

ansible-playbook -vvvv -c local -i "localhost," $play.yml $extra_args_opts
ansible-playbook -vvvv -c local -i "localhost," stop_all_edx_services.yml $extra_args_opts

rm -rf $base_dir

""".format(
        hipchat_token=args.hipchat_api_token,
        hipchat_room=args.ansible_hipchat_room_id,
        configuration_version=args.configuration_version,
        configuration_secure_version=args.configuration_secure_version,
        configuration_secure_repo=args.configuration_secure_repo,
        configuration_private_version=args.configuration_private_version,
        configuration_private_repo=args.configuration_private_repo,
        configuration_internal_version=args.configuration_internal_version,
        configuration_internal_repo=args.configuration_internal_repo,
        environment=args.environment,
        deployment=args.deployment,
        play=args.play,
        playbook_dir=args.playbook_dir,
        config_secure=config_secure,
        identity_contents=identity_contents,
        queue_name=run_id,
        extra_vars_yml=extra_vars_yml,
        secure_vars_file=secure_vars_file,
        cache_id=args.cache_id,
        datadog_api_key=args.datadog_api_key,
        region=args.region)

    # Root volume: gp2, sized from the CLI argument.
    mapping = BlockDeviceMapping()
    root_vol = BlockDeviceType(size=args.root_vol_size, volume_type='gp2')
    mapping['/dev/sda1'] = root_vol

    ec2_args = {
        'security_group_ids': [security_group_id],
        'subnet_id': subnet_id,
        'key_name': args.keypair,
        'image_id': base_ami,
        'instance_type': args.instance_type,
        'instance_profile_name': args.role_name,
        'user_data': user_data,
        'block_device_map': mapping,
    }

    return ec2_args
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
                 avz=def_default_avz, rt=def_default_requesttype,
                 group_name='oggmssh',
                 ssh_port=22,
                 cidr='0.0.0.0/0'):
    """
    Request and prepare a single instance.

    Connects to the region derived from *avz* (availability zone minus its
    final letter), ensures the named node does not already exist, prepares
    the root EBS device and security group, launches the instance (spot or
    on-demand depending on *rt*), waits for it to boot and be ssh-reachable,
    tags it, and attaches/mounts an optional persistent user volume.

    :param cn: cluster name prefix for node naming/tagging.
    :param inst_type_idx: index into the module-level ``instance_infos``.
    :param idn: node number within the cluster.
    :param avz: availability zone, e.g. 'eu-west-1a'.
    :param rt: request type; 'spot' uses spot instances, anything else on-demand.
    :param group_name: security group to use or create.
    :param ssh_port: TCP port to authorize for ssh.
    :param cidr: CIDR block allowed to ssh in.
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    vpcconn = VPCConnection(region=cloud.region, profile_name=ec2Profile)

    # Resolve the VPC/subnet for this zone; fall back to EC2-Classic style
    # (no VPC) if the zone has no configured subnet or lookup fails.
    try:
        vpc_id, subnet_id = def_subnet[avz]
        vpc = vpcconn.get_all_vpcs(vpc_ids=[vpc_id])[0]
    except:
        vpc_id = None
        subnet_id = None
        vpc = None

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name(avz[:-1])
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    # Optional persistent user volume for this zone (may be None).
    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(
                group_name, 'A group that allows SSH access')
        else:
            raise

    # Authorize all Intra-VPC traffic
    if vpc is not None:
        try:
            group.authorize('-1', -1, -1, vpc.cidr_block)
        except cloud.ResponseError as e:
            # Rule may already exist; anything else is a real error.
            if e.code != 'InvalidPermission.Duplicate':
                raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        # Spot path: place a one-time request at def_price and wait for it.
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(
            def_price,
            def_ami[avz[:-1]],
            count=1,
            type='one-time',
            security_group_ids=[group.id],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        # On-demand path.
        print("placing node in ", avz)
        reservation = cloud.run_instances(
            image_id=def_ami[avz[:-1]],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            security_group_ids=[group.id],
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))

    # Poll until the instance reports 'running'.
    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)
    log_with_ts("booted node " + str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node " + str(idn))
def do_build(ctxt, **kwargs):
    """Build and boot a new EC2 instance described by **kwargs (optionally
    expanded from a template file), tag it and its volumes, and optionally
    run the remote setup scripts once ssh is reachable.

    This is a Python 2 generator: it yields progress strings ("." while
    waiting, "\\n" separators, or an error message) so the caller can stream
    them to the user. Yielding an error message and returning early is the
    failure path.

    :param ctxt: context object carrying the EC2 connection (``cnx_ec2``)
        and default key name.
    :param kwargs: build parameters (template, hostname, volume_size,
        snap_id, elastic_ip, private_ip_address, image_id, run, ...).
    """
    conn = ctxt.cnx_ec2
    # Expand a template file into the kwargs, if one was given.
    if 'template' in kwargs and kwargs['template']:
        template_file_name = kwargs['template']
        kwargs = parse_template(ctxt, template_file_name, kwargs)
        del kwargs['template']
    # Fill in defaults for anything the template/CLI left unset.
    defaultrun = {'instance_type': 'm1.large',
                  'key_name': ctxt.key_name
                  }
    for key in defaultrun:
        if key not in kwargs or kwargs[key] == None:
            kwargs[key] = defaultrun[key]

    (remote_user, kwargs) = get_remote_user(ctxt, **kwargs)
    (key_file, kwargs) = get_key_file(ctxt, **kwargs)
    (tags,kwargs) = do_tags(**kwargs)
    do_run_scripts = kwargs.pop('run')

    ###########
    # Check VM naming
    ###########
    if 'Name' not in tags and kwargs['hostname'] is not None:
        tags['Name'] = kwargs['hostname']
    if 'Name' not in tags:
        yield "instance name is mandatory"
        return
    try:
        oslib.ec2_objects.Instance(ctxt, name=tags['Name']).get()
        # if get succed, the name already exist, else get throws an exception
        yield "duplicate name %s" % tags['Name']
        return
    except:
        pass

    user_data_properties = {}

    image = kwargs.pop('image_id', None)

    ###########
    # Check device mapping
    ###########
    # Data volumes are attached as /dev/sdf, /dev/sdg, ... in order.
    volumes = BlockDeviceMapping(conn)
    first_volume = 'f'
    l = first_volume
    ebs_optimized = False
    for volume_info in kwargs.pop('volume_size', []):
        # yaml is not typed, volume_info can be a string or a number
        if isinstance(volume_info, basestring):
            options = volume_info.split(',')
            size = int(oslib.parse_size(options[0], 'G', default_suffix='G'))
        else:
            options = []
            size = int(volume_info)
        vol_kwargs = {"connection":conn, "size": size}
        if len(options) > 1:
            # Remaining comma-separated options are key=value (or bare flags).
            for opt in options[1:]:
                parsed = opt.split('=')
                key = parsed[0]
                if len(parsed) == 2:
                    value = parsed[1]
                elif len(parsed) == 1:
                    value = True
                else:
                    raise OSLibError("can't parse volume argument %s", opt)
                if key == 'iops':
                    # Provisioned IOPS implies an io1 volume and EBS optimization.
                    ebs_optimized = True
                    vol_kwargs['volume_type'] = 'io1'
                vol_kwargs[key] = value
        volumes["/dev/sd%s"%l] = BlockDeviceType(**vol_kwargs)
        l = chr( ord(l[0]) + 1)
    kwargs['ebs_optimized'] = ebs_optimized
    # if drive letter is not f, some volumes definition was found
    if l != first_volume:
        kwargs['block_device_map'] = volumes
        user_data_properties['volumes'] = ' '.join(volumes.keys())

    # after user_data_properties['volumes'] otherwise they will be lvm'ed
    for snapshot_id in kwargs.pop('snap_id', []):
        volumes["/dev/sd%s"%l] = BlockDeviceType(connection=conn, snapshot_id=snapshot_id)
        l = chr( ord(l[0]) + 1)

    kwargs = build_user_data(user_data_properties, **kwargs)

    ###########
    # Check elastic IP
    ###########
    if kwargs['elastic_ip']:
        eip = True
    else:
        eip = False
    del kwargs['elastic_ip']

    # Drop unset (None) and empty-list arguments before calling run_instances.
    for k in kwargs.keys()[:]:
        value = kwargs[k]
        if kwargs[k] == None:
            del(kwargs[k])
        elif value.__class__ == [].__class__ and len(value) == 0:
            del(kwargs[k])

    # A fixed private IP requires an explicit network-interface specification;
    # the related kwargs move into it and out of the run_instances call.
    if 'private_ip_address' in kwargs and kwargs['private_ip_address']:
        netif_specification = NetworkInterfaceCollection()
        netif_kwargs = {}
        if kwargs['private_ip_address']:
            netif_kwargs['private_ip_address'] = kwargs['private_ip_address']
            del kwargs['private_ip_address']
        if 'associate_public_ip_address' in kwargs and kwargs['associate_public_ip_address']:
            netif_kwargs['associate_public_ip_address'] = kwargs['associate_public_ip_address']
            del kwargs['associate_public_ip_address']
        if 'security_groups' in kwargs and kwargs['security_groups']:
            netif_kwargs['groups'] = kwargs['security_groups']
            del kwargs['security_groups']
        netif_kwargs['subnet_id'] = kwargs['subnet_id']
        del kwargs['subnet_id']
        print netif_kwargs
        spec = NetworkInterfaceSpecification(**netif_kwargs)
        netif_specification.append(spec)
        kwargs['network_interfaces'] = netif_specification

    reservation = conn.run_instances(image, **kwargs)
    instance = reservation.instances[0]
    # Quick hack to keep the selected remote user
    instance.remote_user = remote_user
    if len(tags) > 0:
        conn.create_tags([ instance.id ], tags)
    if instance.interfaces and len(instance.interfaces) > 0:
        for interface in instance.interfaces:
            conn.create_tags([ interface.id ], {'creator': tags['creator']})
    # Stream "." to the caller until the instance is running (or terminated).
    while instance.state != 'running' and instance.state != 'terminated':
        instance.update(True)
        yield (".")
        time.sleep(1)
    yield ("\n")
    if eip:
        # Allocate and bind an elastic IP, and record it as a tag.
        ip = conn.allocate_address().public_ip
        conn.associate_address(instance_id = instance.id, public_ip=ip)
        conn.create_tags([instance.id], {"EIP": ip})

    #Update tag for this instance's volumes
    for device in instance.block_device_mapping:
        device_type = instance.block_device_mapping[device]
        (vol_tags, vol_kwargs) = do_tags(name='%s/%s' % (tags['Name'], device.replace('/dev/','')))
        conn.create_tags([ device_type.volume_id ], vol_tags)
    instance.update(True)
    windows_instance = instance.platform == 'Windows'
    if do_run_scripts and not windows_instance:
        # Wait for the ssh port to accept connections, then run remote setup.
        while instance.state != 'terminated':
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.settimeout(1.0)
                s.connect((instance.public_dns_name, 22))
                s.close()
                break
            except socket.error, msg:
                yield (".")
                s.close()
                time.sleep(1)
        yield ("\n")
        instance.key_file = key_file
        remote_setup(instance, remote_user, key_file)
def register(snapshot_id, region, arch, size=None, name=None, desc=None, pvm=False):
    """Register an AMI from a root-filesystem snapshot.

    :param snapshot_id: EBS snapshot holding the root filesystem.
    :param region: AWS region to register the image in.
    :param arch: image architecture ('amd64' is mapped to EC2's 'x86_64').
    :param size: root volume size in GB; defaults to the snapshot's size.
    :param name: image name; defaults to the snapshot's description.
    :param desc: unused; kept for interface compatibility.
    :param pvm: register a paravirtual image instead of HVM.
    :return: tuple ``(ami_id, name)``.
    """
    conn = utils.connect(region)

    if None in (name, size):
        # Fall back to the snapshot's own metadata for anything not given.
        log.debug('getting snapshot - %s', snapshot_id)
        snapshot = conn.get_all_snapshots(snapshot_ids=[snapshot_id])[0]
        size = size if size else snapshot.volume_size
        name = name if name else snapshot.description

    virt = 'hvm'
    kernel_id = None
    device_base = '/dev/xvd'
    ec2_arch = "x86_64" if arch == "amd64" else arch

    if pvm:
        # Paravirtual images boot through an AKI kernel and use /dev/sdX names.
        kernel_id = utils.get_kernel(region, arch)
        virt = 'paravirtual'
        device_base = '/dev/sd'
        name += '-pvm'

    # Derive both device names from device_base so RootDeviceName and the
    # block-device mappings always agree. (Previously the mappings were
    # hard-coded to /dev/xvda//dev/xvdb, contradicting RootDeviceName for
    # PVM images; the boto2 BlockDeviceMapping built here was also dead
    # code that was never passed to the API.)
    rootfs_device_name = device_base + 'a'
    ephemeral_device_name = device_base + 'b'

    log.debug('registering image - %s', name)
    client3 = utils.connect_boto3(region)
    register_kwargs = dict(
        Name=name,
        Architecture=ec2_arch,
        RootDeviceName=rootfs_device_name,
        BlockDeviceMappings=[{
            'DeviceName': rootfs_device_name,
            'Ebs': {
                'DeleteOnTermination': True,
                'VolumeSize': size,
                'SnapshotId': snapshot_id,
            },
        }, {
            'DeviceName': ephemeral_device_name,
            'VirtualName': 'ephemeral0',
        }],
        VirtualizationType=virt,
        # NOTE(review): ENA is an HVM feature; confirm AWS accepts
        # EnaSupport=True for paravirtual registrations.
        EnaSupport=True)
    if kernel_id:
        # The looked-up kernel used to be silently dropped; PVM images
        # cannot boot without their AKI.
        register_kwargs['KernelId'] = kernel_id
    response = client3.register_image(**register_kwargs)
    ami_id = response['ImageId']
    log.info('registered image - %s %s %s', ami_id, name, region)
    return ami_id, name