def create_image(args): AWSACCID = _getawsaccid() conn = boto.ec2.connect_to_region(args.region, aws_access_key_id=AWSAKEY, aws_secret_access_key=AWSSKEY) if args.snapshotid == "" or args.snapshotid is None: print 'You have to pass the snapshot ID used to create the image with --snapshotid="snapid"' raise SystemExit(1) else: namei = raw_input("Enter name of image: ") descr = raw_input("Enter a description for image: ") print "Creating image from snapshot %s ..." % args.snapshotid ebs = EBSBlockDeviceType() ebs.snapshot_id = args.snapshotid block_map = BlockDeviceMapping() block_map['/dev/sda1'] = ebs try: if args.region == "eu-west-1": ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-71665e05',\ root_device_name='/dev/sda1', block_device_map=block_map) else: ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-b6aa75df',\ root_device_name='/dev/sda1', block_device_map=block_map) print "Image creation successful" except EC2ResponseError: print "Image creation error"
def create_node(self, name, distribution, metadata=None):
    """Create one AWS instance of the provisioner's default size.

    :param name: Instance name, also stored in the ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param metadata: Optional dict of extra tags; never mutated.
    :return: An ``AWSNode`` wrapping the launched instance.
    """
    # BUG FIX: a mutable default argument ({}) was shared across calls and
    # handed to start_action before being copied.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 8
    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        # Copy so the caller's dict is not mutated by the Name tag.
        metadata = metadata.copy()
        metadata["Name"] = name

        # Root volume: 8 GB EBS, removed with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap["/dev/sda1"] = disk1

        images = self._connection.get_all_images(
            filters={"name": IMAGE_NAMES[distribution]})
        # Retry several times, no sleep between retries is needed.
        instance = poll_until(
            lambda: self._get_node(images[0].id, size, diskmap, metadata),
            repeat(0, 10),
            lambda x: None)
        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution)
def _register_image(self, snapshot_id):
    """Register a new AMI rooted at *snapshot_id*, mirroring this instance.

    Copies the running instance's root device name, kernel, ramdisk,
    virtualization type and architecture, and re-declares the arch-specific
    ephemeral drives explicitly in the block device map.

    :param snapshot_id: EBS snapshot to use as the image's root volume.
    :return: the new AMI id from ``register_image``.
    """
    conn = self.platform.new_ec2_conn()
    instance_id = self.platform.get_instance_id()
    instance = conn.get_all_instances([instance_id])[0].instances[0]

    block_device_map = BlockDeviceMapping(conn)
    root_vol = EBSBlockDeviceType(snapshot_id=snapshot_id)
    # Root volume dies with instances launched from the image.
    root_vol.delete_on_termination = True
    # Adding ephemeral devices
    for eph, device in EPH_STORAGE_MAPPING[linux.os['arch']].items():
        bdt = EBSBlockDeviceType(conn)
        bdt.ephemeral_name = eph
        block_device_map[device] = bdt

    # If the instance's current mapping refers to the partition (name minus
    # its trailing character, e.g. /dev/sda1 -> /dev/sda), attach the root
    # snapshot there; otherwise use the root device name as-is.
    root_partition = instance.root_device_name[:-1]
    if root_partition in self.platform.get_block_device_mapping().values():
        block_device_map[root_partition] = root_vol
    else:
        block_device_map[instance.root_device_name] = root_vol

    return conn.register_image(
        name=self.image_name,
        root_device_name=instance.root_device_name,
        block_device_map=block_device_map,
        kernel_id=instance.kernel,
        virtualization_type=instance.virtualization_type,
        ramdisk_id=self.platform.get_ramdisk_id(),
        architecture=instance.architecture)
def register_snap(self, snap_id, arch, name, aki=None, desc=None, ari=None,
                  pub=True, disk=False):
    """
    Register an EBS volume snapshot as an AMI.  Returns the AMI ID.

    An arch, snapshot ID, and name for the AMI must be provided. Optionally
    a description, AKI ID, ARI ID and billing code may be specified too.
    disk is whether or not we are registering a disk image.
    """
    self.logger.info('Registering snap: %s' % (snap_id))
    # Look up the snapshot first; this raises if snap_id is unknown.
    snap = self.conn.get_all_snapshots([snap_id])[0]
    # Makes block device map
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snap_id
    block_map = BlockDeviceMapping()
    if aki is None:  # BUG FIX: was "== None"
        raise Fedora_EC2Error('Need to specify an AKI')
    # Disk images are rooted at the whole device, partition images at sda1.
    # (The old "disk = '/dev/sdaX=%s' % snap_id" strings were never read and
    # shadowed the boolean parameter; removed.)
    root = '/dev/sda' if disk else '/dev/sda1'
    block_map[root] = ebs
    ami_id = self.conn.register_image(name=name, description=desc,
                                      image_location='', architecture=arch,
                                      kernel_id=aki, ramdisk_id=ari,
                                      root_device_name=root,
                                      block_device_map=block_map)
    if not ami_id.startswith('ami-'):
        self._log_error('Could not register an AMI')
    self.logger.info('Registered an AMI: %s' % ami_id)
    return ami_id
def create_node(self, name, distribution, metadata=None):
    """Create one AWS instance with a 10 GB root volume.

    :param name: Instance name, also written to the ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param metadata: Optional dict of extra tags; never mutated.
    :return: An ``AWSNode`` wrapping the launched instance.
    """
    # BUG FIX: a mutable default argument ({}) was shared across calls and
    # handed to start_action before being copied.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 10
    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        # Copy so the caller's dict is not mutated by the Name tag.
        metadata = metadata.copy()
        metadata['Name'] = name

        # Root volume: 10 GB EBS, removed with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )
        # Retry several times, no sleep between retries is needed.
        instance = poll_until(
            lambda: self._get_node(images[0].id, size, diskmap, metadata),
            repeat(0, 10),
            lambda x: None)
        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution,
        )
def create_instance(): """Support function to create a new AWS instance.""" from boto.ec2.blockdevicemapping import EBSBlockDeviceType, BlockDeviceMapping kwargs = dict( instance_type = conf.type, key_name=conf.key_pair, placement=conf.zone, ) if conf.disk_size: # We want a larger EBS root volume, so override /dev/sda1. dev_root = EBSBlockDeviceType() dev_root.size = conf.disk_size # Create the mapping. dev_mapping = BlockDeviceMapping() dev_mapping['/dev/sda1'] = dev_root kwargs['block_device_map'] = dev_mapping reservation = env.aws.run_instances( conf.ami, **kwargs) instance = env.aws.instance = reservation.instances[0] wait_for_status(instance, "Creating server", "running") env.aws.create_tags([instance.id], {'Name': env.server.name}) print " Done. \nInstance built:", instance.public_dns_name return instance
def launch_instance(self): if not self.verify_settings(): return is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'}) if is_instance_store: block_map = None else: block_map = BlockDeviceMapping() root_device = self.config['ec2_root_device'] block_map[root_device] = EBSBlockDeviceType() if self.config['ec2_size']: block_map[root_device].size = self.config['ec2_size'] block_map[root_device].delete_on_termination = True reservation = self.conn.run_instances( self.config['ec2_ami_id'], key_name=self.config['ec2_key_name'], security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']], instance_type=self.config['ec2_instance_type'], placement=self.config['ec2_zone'], placement_group=self.config['ec2_placement_group'], monitoring_enabled=self.config['ec2_monitoring_enabled'], block_device_map=block_map, user_data=self.user_data) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == 'running': time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance {0} failed after {1} seconds".format( self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config['hostname']: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config['ssh_port']) port = "-p {0} ".format(p) if p and not p == 22 else '' ## change user to 'root' for all non-Ubuntu systems user = self.config['sudouser'] if self.config['sudouser'] and self.config['ssh_import'] else 'ubuntu' #XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n".format( '-p {0} '.format(port) if port else '', user, self.instance.public_dns_name) msg3 = "To terminate: shaker-terminate {0}".format( self.instance.id) LOG.info(msg2) LOG.info(msg3) 
print msg2 print msg3
def create_image(conn):
    """Build the worker-node AMI and record its id in the cloud config.

    Launches a base instance (on-demand mode only), provisions it via
    ``setup_instance``, snapshots it into an AMI with a 50 GB root volume,
    polls until the AMI leaves 'pending', and writes the resulting id to
    ``cloud_configs/<cloud>/<cloud>_node_image.py``.  Exits the process if
    AMI creation fails.
    """
    reservation = None
    if launch_type == 'on-demand':
        reservation = launch_and_wait(conn, base_instance_type, 1, base_image)
    instance = get_instances_from_reservation(reservation)[0]
    instance_ip = instance.ip_address
    instance_id = instance.id
    log_file = setup_instance(instance_ip)

    # 50 GB root volume for the new image.
    boot_disk = EBSBlockDeviceType()
    boot_disk.size = 50
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = boot_disk

    global node_image
    # Best-effort cleanup: deregister any previously built images and mark
    # the stored image id stale.  BUG FIX: was a bare "except: 1", which
    # silently swallowed everything including SystemExit/KeyboardInterrupt.
    try:
        for image in conn.get_all_images(owners=['self']):
            image.deregister()
        with open('cloud_configs/' + cloud + '/' + cloud + '_node_image.py', 'w') as cfg:
            cfg.write("node_image = 'DEREGISTERED!'")
    except Exception:
        pass

    node_image = conn.create_image(instance_id, 'AWS-pwa-node-image',
                                   block_device_mapping=bdm)
    image = conn.get_all_images(image_ids=[node_image])[0]
    # Use "with" so the log is closed even on the sys.exit path below.
    with open(log_file, 'a+') as f:
        while image.state == 'pending':
            sleep(15)
            f.write("Image upload state: " + image.state + '\n')
            image.update()
        f.write("Image upload state: " + image.state + '\n')
        if image.state == 'failed':
            sys.exit("AMI CREATION FAILED!")
        f.write('\n' * 2)
        f.write('#' * 30 + '\n')
        f.write('#' * 30 + '\n\n')
        f.write("node_image = '" + str(node_image) + "'\n\n")
        f.write('#' * 30 + '\n')
        f.write('#' * 30 + '\n')
    with open('cloud_configs/' + cloud + '/' + cloud + '_node_image.py', 'w') as cfg:
        cfg.write("node_image = '" + str(node_image) + "'")
def launch_instance(skip_updates=False): ''' Launch an Oracle database instance. ''' # Assume the keypair name is based on our env.key_filename. instance_key_name = os.path.basename(env.key_filename).replace('.pem', '') # Check that we have a security group configured already. security_group_list = ec2_connection.get_all_security_groups() security_group_found = False for security_group in security_group_list: if security_group.name == security_group_name: security_group_found = True break # If we didn't find it, create it. if not security_group_found: create_security_group() # We want a larger EBS root volume, so override /dev/sda1. # Create an EBS device with 40GB allocated. dev_root = EBSBlockDeviceType() dev_root.size = 40 # Create the mapping. dev_mapping = BlockDeviceMapping() dev_mapping['/dev/sda1'] = dev_root reservation = ec2_connection.run_instances(ami_id, instance_type=instance_type, key_name=instance_key_name, security_groups=[security_group_name], block_device_map = dev_mapping) # This is hacky but (mostly) works. instance = reservation.instances[0] print(green("Launching instance on reservation {}.".format(instance, reservation))) ''' Wait for instance state to change; if it doesn't change to running, then fail. ''' print(yellow('Waiting for instance to start...')) set_tags = False while instance.state == u'pending': # Try to set tags. if set_tags == False: try: ec2_connection.create_tags([instance.id], {"Name": instance_name}) set_tags = True print(green("Instance {} tagged.".format(instance))) except EC2ResponseError, e: print(red("Tagging failed; sleeping, updating instance, and trying again.")) # Check up on its status every so often time.sleep(10) instance.update()
def launch_instance(self): if not self.verify_settings(): return block_map = BlockDeviceMapping() root_device = self.config['ec2_root_device'] block_map[root_device] = EBSBlockDeviceType() if self.config['ec2_size']: block_map[root_device].size = self.config['ec2_size'] block_map[root_device].delete_on_termination = True reservation = self.conn.run_instances( self.config['ec2_ami_id'], key_name=self.config['ec2_key_name'], security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']], instance_type=self.config['ec2_instance_type'], placement_group=self.config['ec2_placement_group'], monitoring_enabled=self.config['ec2_monitoring_enabled'], block_device_map=block_map, user_data=self.user_data) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == 'running': time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance {0} failed after {1} seconds".format( self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config['hostname']: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config['ssh_port']) port = "-p {0} ".format(p) if p and not p == 22 else '' ## change user to 'root' for all non-Ubuntu systems user = self.config['sudouser'] if self.config[ 'sudouser'] and self.config['ssh_import'] else 'ubuntu' #XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n".format( '-p {0} '.format(port) if port else '', user, self.instance.public_dns_name) msg3 = "To terminate: shaker-terminate {0}".format( self.instance.id) LOG.info(msg2) LOG.info(msg3) print msg2 print msg3
def launch_instance(self): if not self.verify_settings(): return block_map = BlockDeviceMapping() root_device = self.config["ec2_root_device"] block_map[root_device] = EBSBlockDeviceType() if self.config["ec2_size"]: block_map[root_device].size = self.config["ec2_size"] block_map[root_device].delete_on_termination = True for num, device_location in enumerate(self.config["ec2_ephemeral_devices"]): device = BlockDeviceType() device.ephemeral_name = "ephemeral%d" % num block_map[device_location] = device reservation = self.conn.run_instances( self.config["ec2_ami_id"], key_name=self.config["ec2_key_name"], security_groups=self.config["ec2_security_groups"] or [self.config["ec2_security_group"]], instance_type=self.config["ec2_instance_type"], placement=self.config["ec2_zone"], monitoring_enabled=self.config["ec2_monitoring_enabled"], block_device_map=block_map, user_data=self.user_data, ) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == "running": time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance %s failed after %d seconds" % (self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config["hostname"]: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config["ssh_port"]) port = "-p {0} ".format(p) if p and not p == 22 else "" ## change user to 'root' for all non-Ubuntu systems user = self.config["sudouser"] if self.config["sudouser"] and self.config["ssh_import"] else "ubuntu" # XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n" "To terminate: shaker-terminate {3}".format( port, user, self.instance.public_dns_name, self.instance.id ) LOG.info(msg2) print msg2
def test_launch_config(self):
    # Regression coverage for #753 and #1343: launch configurations with
    # block device mappings and EBS/network options serialize correctly.
    self.set_http_response(status_code=200)

    mapping = BlockDeviceMapping()
    mapping['/dev/sdf'] = EBSBlockDeviceType(snapshot_id='snap-12345')
    mapping['/dev/sdg'] = EBSBlockDeviceType(snapshot_id='snap-12346')

    config = launchconfig.LaunchConfiguration(
        connection=self.service_connection,
        name='launch_config',
        image_id='123456',
        instance_type='m1.large',
        user_data='#!/bin/bash',
        security_groups=['group1', 'group2'],
        spot_price='price',
        block_device_mappings=[mapping],
        associate_public_ip_address=True,
        volume_type='atype',
        delete_on_termination=False,
        iops=3000)
    self.service_connection.create_launch_configuration(config)

    expected_params = {
        'Action': 'CreateLaunchConfiguration',
        'BlockDeviceMappings.member.1.DeviceName': '/dev/sdf',
        'BlockDeviceMappings.member.1.Ebs.DeleteOnTermination': 'false',
        'BlockDeviceMappings.member.1.Ebs.SnapshotId': 'snap-12345',
        'BlockDeviceMappings.member.2.DeviceName': '/dev/sdg',
        'BlockDeviceMappings.member.2.Ebs.DeleteOnTermination': 'false',
        'BlockDeviceMappings.member.2.Ebs.SnapshotId': 'snap-12346',
        'EbsOptimized': 'false',
        'LaunchConfigurationName': 'launch_config',
        'ImageId': '123456',
        'UserData': base64.b64encode('#!/bin/bash').decode('utf-8'),
        'InstanceMonitoring.Enabled': 'false',
        'InstanceType': 'm1.large',
        'SecurityGroups.member.1': 'group1',
        'SecurityGroups.member.2': 'group2',
        'SpotPrice': 'price',
        'AssociatePublicIpAddress': 'true',
        'VolumeType': 'atype',
        'DeleteOnTermination': 'false',
        'Iops': 3000,
    }
    self.assert_request_parameters(expected_params,
                                   ignore_params_values=['Version'])
def run_encryptor_instance(aws_svc, encryptor_image_id, snapshot, root_size,
                           guest_image_id, sg_id, update_ami=False):
    """Launch the encryptor instance with the guest's root snapshot attached.

    The guest's unencrypted root snapshot is attached at /dev/sda4, and a
    target volume for the encrypted root at /dev/sda5.  When creating a new
    encrypted AMI the target is an empty volume sized 2 * root_size + 1;
    when updating, the target is built from the snapshot itself at
    root_size.  Waits for the instance to come up and tags it (and, in the
    create case, its volumes).

    :param aws_svc: service wrapper providing run_instance/create_tags/etc.
    :param encryptor_image_id: AMI to launch the encryptor from.
    :param snapshot: snapshot id of the guest root volume.
    :param root_size: guest root volume size (GB, presumably -- confirm).
    :param guest_image_id: guest AMI id, used in tags/log messages.
    :param sg_id: security group for the encryptor instance.
    :param update_ami: True when updating an existing encrypted AMI.
    :return: the launched (and updated) instance object.
    """
    bdm = BlockDeviceMapping()
    guest_unencrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        snapshot_id=snapshot,
        delete_on_termination=True)
    # Use gp2 for fast burst I/O copying root drive
    bdm['/dev/sda4'] = guest_unencrypted_root

    if not update_ami:
        log.info('Launching encryptor instance with snapshot %s', snapshot)
        # They are creating an encrypted AMI instead of updating it
        # Use gp2 for fast burst I/O copying root drive
        guest_encrypted_root = EBSBlockDeviceType(
            volume_type='gp2',
            delete_on_termination=True)
        # Encrypted volume needs room for metadata: twice the root plus 1.
        guest_encrypted_root.size = 2 * root_size + 1
        bdm['/dev/sda5'] = guest_encrypted_root
    else:
        log.info('Launching encryptor instance for updating %s',
                 guest_image_id)
        guest_encrypted_root = EBSBlockDeviceType(
            volume_type='gp2',
            snapshot_id=snapshot,
            delete_on_termination=True)
        guest_encrypted_root.size = root_size
        bdm['/dev/sda5'] = guest_encrypted_root

    instance = aws_svc.run_instance(encryptor_image_id,
                                    security_group_ids=[sg_id],
                                    block_device_map=bdm)
    aws_svc.create_tags(
        instance.id,
        name=NAME_ENCRYPTOR,
        description=DESCRIPTION_ENCRYPTOR % {'image_id': guest_image_id}
    )
    instance = _wait_for_instance(aws_svc, instance.id)
    log.info('Launched encryptor instance %s', instance.id)

    # Tag volumes.
    # Re-read the mapping from the live instance so volume ids are present.
    bdm = instance.block_device_mapping
    if not update_ami:
        aws_svc.create_tags(
            bdm['/dev/sda5'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
        aws_svc.create_tags(
            bdm['/dev/sda2'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
        aws_svc.create_tags(
            bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_GRUB_VOLUME)
        aws_svc.create_tags(
            bdm['/dev/sda3'].volume_id, name=NAME_METAVISOR_LOG_VOLUME)
    return instance
def launch_instance(self):
    """Launch the configured EC2 instance, optionally inside a VPC subnet,
    and wait (up to RUN_INSTANCE_TIMEOUT seconds) for it to be running.
    """
    if not self.verify_settings():
        return
    # Instance-store AMIs take no EBS root mapping; EBS-backed AMIs get an
    # explicit root entry so size/delete-on-termination can be applied.
    is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'],
        filters={'root-device-type': 'instance-store'})
    if is_instance_store:
        block_map = None
    else:
        block_map = BlockDeviceMapping()
        root_device = self.config['ec2_root_device']
        block_map[root_device] = EBSBlockDeviceType()
        if self.config['ec2_size']:
            block_map[root_device].size = self.config['ec2_size']
        block_map[root_device].delete_on_termination = True
    opts = {
        'key_name': self.config['ec2_key_name'],
        'security_groups': self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
        'instance_type': self.config['ec2_instance_type'],
        'placement': self.config['ec2_zone'],
        'placement_group': self.config['ec2_placement_group'],
        'monitoring_enabled': self.config['ec2_monitoring_enabled'],
        'block_device_map': block_map,
        'user_data': self.user_data
    }
    if self.config.get('ec2_subnet_id',False):
        # when providing subnet_id, must use security_group_ids and not
        # named security_groups or API call will fail.
        opts.pop('security_groups',None)
        opts['security_group_ids'] = self.config['ec2_security_group_ids'] or [self.config['ec2_security_group_id']]
        if not opts['security_group_ids']:
            raise AssertionError('Must specify ec2_security_group_id or ec2_security_group_ids with subnet_id')
        opts['subnet_id'] = self.config['ec2_subnet_id']
    reservation = self.conn.run_instances(self.config['ec2_ami_id'], **opts)
    self.instance = reservation.instances[0]
    # Poll every 5 seconds until running or the timeout budget is spent.
    secs = RUN_INSTANCE_TIMEOUT
    rest_interval = 5
    while secs and not self.instance.state == 'running':
        time.sleep(rest_interval)
        secs = secs - rest_interval
        try:
            self.instance.update()
        except boto.exception.EC2ResponseError:
            # Instance may not be describable right after launch; retry.
            pass
    if secs <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def get_block_device(instance_type, ebs_vol_size):
    """Build the block device mapping: an optional EBS volume at /dev/sdv
    plus one entry per ephemeral disk of the instance type."""
    mapping = BlockDeviceMapping()
    if ebs_vol_size > 0:
        ebs = EBSBlockDeviceType()
        ebs.size = ebs_vol_size
        ebs.delete_on_termination = True
        mapping['/dev/sdv'] = ebs
    num_disks = get_num_disks(instance_type)
    for idx in range(num_disks):
        ephemeral = BlockDeviceType()
        ephemeral.ephemeral_name = 'ephemeral%d' % idx
        # The first ephemeral drive is /dev/sdb.
        mapping['/dev/sd' + string.ascii_letters[idx + 1]] = ephemeral
    return mapping
def startInstance(ec2connection, hardwareProfile, ARCH, RHEL, AMI, SSHKEYNAME): conn_region = ec2connection map = BlockDeviceMapping() t = EBSBlockDeviceType() t.size = "15" # map = {'DeviceName':'/dev/sda','VolumeSize':'15'} map["/dev/sda1"] = t # blockDeviceMap = [] # blockDeviceMap.append( {'DeviceName':'/dev/sda', 'Ebs':{'VolumeSize' : '100'} }) if ARCH == "i386" and RHEL == "6.1": reservation = conn_region.run_instances( AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == "x86_64" and RHEL == "6.1": reservation = conn_region.run_instances( AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == "i386": reservation = conn_region.run_instances( AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == "x86_64": reservation = conn_region.run_instances( AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) else: print "arch type is neither i386 or x86_64.. will exit" exit(1) myinstance = reservation.instances[0] time.sleep(5) while not myinstance.update() == "running": time.sleep(5) print myinstance.update() instanceDetails = myinstance.__dict__ pprint(instanceDetails) # region = instanceDetails['placement'] # print 'region =' + region publicDNS = instanceDetails["public_dns_name"] print "public hostname = " + publicDNS # check for console output here to make sure ssh is up return publicDNS
def create_server():
    """
    Creates EC2 Instance and saves it state in a local json file
    """
    # Reuse any instance recorded in an existing 'data.json' state file
    # rather than starting a duplicate.
    if is_there_state():
        return True

    conn = connect_to_ec2()
    print(_green("Started..."))
    print(_yellow("...Creating EC2 instance..."))

    # A larger boot device is needed to store our cached images.
    root_volume = EBSBlockDeviceType()
    root_volume.size = 120
    device_map = BlockDeviceMapping()
    device_map['/dev/sda1'] = root_volume

    # Launch one instance of the chosen AMI.
    image = conn.get_all_images(env.ec2_ami)[0]
    reservation = image.run(1, 1,
                            key_name=env.ec2_key_pair,
                            security_groups=env.ec2_security,
                            block_device_map=device_map,
                            instance_type=env.ec2_instancetype)
    instance = reservation.instances[0]
    conn.create_tags([instance.id], {"Name": env.ec2_instance_name})

    # Wait until the instance leaves 'pending', then until ssh answers.
    while instance.state == u'pending':
        yellow("Instance state: %s" % instance.state)
        sleep(10)
        instance.update()
    wait_for_ssh(instance.public_dns_name)

    green("Instance state: %s" % instance.state)
    green("Public dns: %s" % instance.public_dns_name)
    # Record the new instance in the local state file.
    save_state_locally(instance.id)
def get_block_device(instance_type, ebs_vol_size):
    """Return a BlockDeviceMapping holding an optional EBS volume plus every
    ephemeral disk available on the given instance type."""
    result = BlockDeviceMapping()
    if ebs_vol_size > 0:
        vol = EBSBlockDeviceType()
        vol.size = ebs_vol_size
        vol.delete_on_termination = True
        result["/dev/sdv"] = vol
    disk_count = get_num_disks(instance_type)
    # The first ephemeral drive is /dev/sdb.
    for n in range(disk_count):
        eph = BlockDeviceType()
        eph.ephemeral_name = 'ephemeral%d' % n
        device_name = '/dev/sd' + string.ascii_letters[n + 1]
        result[device_name] = eph
    return result
def register_ebs_ami(self, snapshot_id, arch='x86_64', default_ephem_map=True,
                     img_name=None, img_desc=None):
    """Register *snapshot_id* as a new EBS-backed AMI and return its id."""
    # Resolve the pvgrub hd00 AKI for this region/arch before anything else.
    try:
        aki = PVGRUB_AKIS[self.region.name][arch]
    except KeyError:
        raise Exception("Unable to determine pvgrub hd00 AKI for region (%s) arch (%s)" % (self.region.name, arch))

    if not img_name:
        # These names need to be unique, hence the pseudo-uuid
        rand_id = random.randrange(2**32)
        img_name = 'EBSHelper AMI - %s - uuid-%x' % (snapshot_id, rand_id)
    if not img_desc:
        img_desc = 'Created directly from volume snapshot %s' % (snapshot_id)

    self.log.debug("Registering snapshot (%s) as new EBS AMI" % (snapshot_id))
    root = EBSBlockDeviceType()
    root.snapshot_id = snapshot_id
    root.delete_on_termination = True
    device_map = BlockDeviceMapping()
    device_map['/dev/sda'] = root

    # The ephemeral mappings are automatic with S3 images; for EBS images
    # they must be explicit so the same fstab works on both image types.
    if default_ephem_map:
        for dev, eph_name in (('/dev/sdb', 'ephemeral0'),
                              ('/dev/sdc', 'ephemeral1')):
            eph = EBSBlockDeviceType()
            eph.ephemeral_name = eph_name
            device_map[dev] = eph

    result = self.conn.register_image(name=img_name,
                                      description=img_desc,
                                      architecture=arch,
                                      kernel_id=aki,
                                      root_device_name='/dev/sda',
                                      block_device_map=device_map)
    return str(result)
def create_server():
    """
    Creates EC2 Instance and saves it state in a local json file
    """
    # An existing 'data.json' state file means a server is already up;
    # don't start another one.
    if is_there_state():
        return True

    ec2 = connect_to_ec2()
    print(_green("Started..."))
    print(_yellow("...Creating EC2 instance..."))

    # Enlarge the boot device so our cached images fit.
    boot = EBSBlockDeviceType()
    boot.size = 120
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = boot

    ami = ec2.get_all_images(env.ec2_ami)[0]
    new_instance = ami.run(1, 1,
                           key_name=env.ec2_key_pair,
                           security_groups=env.ec2_security,
                           block_device_map=bdm,
                           instance_type=env.ec2_instancetype).instances[0]
    ec2.create_tags([new_instance.id], {"Name": env.ec2_instance_name})

    # Poll until the instance is out of 'pending', then wait for ssh.
    while new_instance.state == u'pending':
        yellow("Instance state: %s" % new_instance.state)
        sleep(10)
        new_instance.update()
    wait_for_ssh(new_instance.public_dns_name)

    green("Instance state: %s" % new_instance.state)
    green("Public dns: %s" % new_instance.public_dns_name)
    # Persist details of the new instance into the local state file.
    save_state_locally(new_instance.id)
def diag(aws_svc=None, region='us-west-2', instance_id=None, snapshot_id=None,
         vpc_id=None, subnet_id=None, security_group_ids=None,
         diag_instance_type='m3.medium', ssh_keypair=None):
    """Launch a diagnostics instance with a log volume attached at /dev/sda3
    and print how to reach it.

    When *instance_id* is given, a fresh snapshot of that instance's log
    volume is taken and used; otherwise *snapshot_id* is used directly.
    When no security groups are supplied, a temporary diag security group
    is created (in the subnet's VPC if *subnet_id* was given).
    """
    if instance_id:
        snapshot_id = snapshot_log_volume(aws_svc, instance_id).id
        log.info("Waiting for 30 seconds for snapshot to be available")
        time.sleep(30)
    diag_image = DIAG_IMAGES_BY_REGION[region]
    log.info("Launching diag instance")
    if not security_group_ids:
        # Derive the VPC from the subnet (if any) and make a throwaway
        # security group there.
        vpc_id = None
        if subnet_id:
            subnet = aws_svc.get_subnet(subnet_id)
            vpc_id = subnet.vpc_id
        temp_sg_id = create_diag_security_group(aws_svc, vpc_id=vpc_id).id
        security_group_ids = [temp_sg_id]
    log_volume = EBSBlockDeviceType(delete_on_termination=True,
                                    snapshot_id=snapshot_id)
    bdm = BlockDeviceMapping()
    # Choose /dev/sda3 since it is the first free mountpoint
    bdm['/dev/sda3'] = log_volume
    diag_instance = aws_svc.run_instance(diag_image,
                                         instance_type=diag_instance_type,
                                         ebs_optimized=False,
                                         subnet_id=subnet_id,
                                         security_group_ids=security_group_ids,
                                         block_device_map=bdm)
    aws_svc.create_tags(
        diag_instance.id,
        name=NAME_DIAG_INSTANCE % {'snapshot_id': snapshot_id},
        description=DESCRIPTION_DIAG_INSTANCE % {'snapshot_id': snapshot_id})
    wait_for_instance(aws_svc, diag_instance.id)
    # Re-fetch so the public/private addresses below are populated.
    diag_instance = aws_svc.get_instance(diag_instance.id)
    print "Diag instance id: %s" % diag_instance.id
    if diag_instance.ip_address:
        print "IP address: %s" % diag_instance.ip_address
    if diag_instance.private_ip_address:
        print "Private IP address: %s" % diag_instance.private_ip_address
    print "User: root"
    print "SSH Keypair: %s" % ssh_keypair
    print "Log volume mountpoint: /dev/xbd2a for PV, /dev/xbd2e for HVM"
def startInstance(self, ami, ec2_keyName, sec_group, hwp): map = BlockDeviceMapping() t = EBSBlockDeviceType() t.size = '15' #map = {'DeviceName':'/dev/sda','VolumeSize':'15'} map['/dev/sda1'] = t reservation = self.connection.run_instances(ami, instance_type=hwp, key_name=ec2_keyName, security_groups=sec_group, block_device_map=map) myinstance = reservation.instances[0] time.sleep(5) while(not myinstance.update() == 'running'): time.sleep(5) print myinstance.update() #pprint(instanceDetails) return myinstance
def launch_instance(self):
    """Start the configured EC2 instance and poll until it is running
    (up to RUN_INSTANCE_TIMEOUT seconds), tagging it with a hostname
    when one is configured."""
    if not self.verify_settings():
        return
    ami_id = self.config['ec2_ami_id']
    # Instance-store AMIs cannot carry an EBS root mapping.
    instance_store = self.conn.get_all_images(
        ami_id, filters={'root-device-type': 'instance-store'})
    if instance_store:
        device_map = None
    else:
        root = self.config['ec2_root_device']
        ebs_root = EBSBlockDeviceType()
        device_map = BlockDeviceMapping()
        device_map[root] = ebs_root
        if self.config['ec2_size']:
            ebs_root.size = self.config['ec2_size']
        ebs_root.delete_on_termination = True
    groups = (self.config['ec2_security_groups'] or
              [self.config['ec2_security_group']])
    reservation = self.conn.run_instances(
        ami_id,
        key_name=self.config['ec2_key_name'],
        security_groups=groups,
        instance_type=self.config['ec2_instance_type'],
        placement=self.config['ec2_zone'],
        placement_group=self.config['ec2_placement_group'],
        monitoring_enabled=self.config['ec2_monitoring_enabled'],
        block_device_map=device_map,
        user_data=self.user_data)
    self.instance = reservation.instances[0]
    # Poll every 5 seconds until running or the timeout budget runs out.
    remaining = RUN_INSTANCE_TIMEOUT
    poll_every = 5
    while remaining and self.instance.state != 'running':
        time.sleep(poll_every)
        remaining = remaining - poll_every
        try:
            self.instance.update()
        except boto.exception.EC2ResponseError:
            pass
    if remaining <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def test_launch_config(self):
    # Regression coverage for #753 and #1343: block device mappings in a
    # launch configuration must serialize into the request parameters.
    self.set_http_response(status_code=200)

    mapping = BlockDeviceMapping()
    mapping['/dev/sdf'] = EBSBlockDeviceType(snapshot_id='snap-12345')
    mapping['/dev/sdg'] = EBSBlockDeviceType(snapshot_id='snap-12346')

    config = launchconfig.LaunchConfiguration(
        connection=self.service_connection,
        name='launch_config',
        image_id='123456',
        instance_type='m1.large',
        security_groups=['group1', 'group2'],
        spot_price='price',
        block_device_mappings=[mapping])
    self.service_connection.create_launch_configuration(config)

    expected_params = {
        'Action': 'CreateLaunchConfiguration',
        'BlockDeviceMappings.member.1.DeviceName': '/dev/sdf',
        'BlockDeviceMappings.member.1.Ebs.DeleteOnTermination': 'false',
        'BlockDeviceMappings.member.1.Ebs.SnapshotId': 'snap-12345',
        'BlockDeviceMappings.member.2.DeviceName': '/dev/sdg',
        'BlockDeviceMappings.member.2.Ebs.DeleteOnTermination': 'false',
        'BlockDeviceMappings.member.2.Ebs.SnapshotId': 'snap-12346',
        'EbsOptimized': 'false',
        'LaunchConfigurationName': 'launch_config',
        'ImageId': '123456',
        'InstanceMonitoring.Enabled': 'false',
        'InstanceType': 'm1.large',
        'SecurityGroups.member.1': 'group1',
        'SecurityGroups.member.2': 'group2',
        'SpotPrice': 'price',
    }
    self.assert_request_parameters(expected_params,
                                   ignore_params_values=['Version'])
def parse_block_device_args(self, block_device_maps_args):
    """Parse CLI block-device arguments into a BlockDeviceMapping.

    Each argument has the form
        <device>=<snap-id | ephemeral-name>[:<size>[:<delete-on-termination>]]
    e.g. ``/dev/sdb=snap-1234:20:true`` or ``/dev/sdc=ephemeral0``.
    Arguments without an '=' are silently skipped.
    """
    block_device_map = BlockDeviceMapping()
    for block_device_map_arg in block_device_maps_args:
        # BUG FIX: split only on the first '=' so a value that itself
        # contains '=' does not get truncated.
        parts = block_device_map_arg.split('=', 1)
        if len(parts) > 1:
            device_name = parts[0]
            block_dev_type = EBSBlockDeviceType()
            value_parts = parts[1].split(':')
            if value_parts[0].startswith('snap'):
                block_dev_type.snapshot_id = value_parts[0]
            else:
                if value_parts[0].startswith('ephemeral'):
                    block_dev_type.ephemeral_name = value_parts[0]
            if len(value_parts) > 1:
                block_dev_type.size = int(value_parts[1])
            if len(value_parts) > 2:
                # Only the literal string 'true' enables delete-on-termination.
                if value_parts[2] == 'true':
                    block_dev_type.delete_on_termination = True
            block_device_map[device_name] = block_dev_type
    return block_device_map
def startInstance(ec2connection, hardwareProfile): conn_region = ec2connection map = BlockDeviceMapping() t = EBSBlockDeviceType() t.size = '15' #map = {'DeviceName':'/dev/sda','VolumeSize':'15'} map['/dev/sda1'] = t #blockDeviceMap = [] #blockDeviceMap.append( {'DeviceName':'/dev/sda', 'Ebs':{'VolumeSize' : '100'} }) if ARCH == 'i386' and RHEL == '6.1': reservation = conn_region.run_instances(AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == 'x86_64' and RHEL == '6.1': reservation = conn_region.run_instances(AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == 'i386': reservation = conn_region.run_instances(AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map ) elif ARCH == 'x86_64': reservation = conn_region.run_instances(AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map) else: print "arch type is neither i386 or x86_64.. will exit" exit(1) myinstance = reservation.instances[0] time.sleep(5) while(not myinstance.update() == 'running'): time.sleep(5) print myinstance.update() instanceDetails = myinstance.__dict__ pprint(instanceDetails) #region = instanceDetails['placement'] #print 'region =' + region publicDNS = instanceDetails['public_dns_name'] print 'public hostname = ' + publicDNS # check for console output here to make sure ssh is up return publicDNS
def _register_image(self, snapshot_id):
    """Register an AMI whose root device is backed by *snapshot_id*.

    Builds a block device map with the root volume plus the ephemeral
    devices appropriate for this architecture, then registers the image
    with the metadata of the currently running instance.
    """
    conn = self.platform.new_ec2_conn()
    instance_id = self.platform.get_instance_id()
    instance = conn.get_all_instances([instance_id])[0].instances[0]

    mapping = BlockDeviceMapping(conn)

    root = EBSBlockDeviceType(snapshot_id=snapshot_id)
    root.delete_on_termination = True

    # Declare the instance-store (ephemeral) devices explicitly.
    for ephemeral_name, device in EPH_STORAGE_MAPPING[linux.os['arch']].items():
        ephemeral = EBSBlockDeviceType(conn)
        ephemeral.ephemeral_name = ephemeral_name
        mapping[device] = ephemeral

    # Strip the trailing partition digit (e.g. '/dev/sda1' -> '/dev/sda');
    # attach the root volume under whichever name the platform reports.
    root_partition = instance.root_device_name[:-1]
    platform_devices = self.platform.get_block_device_mapping().values()
    if root_partition in platform_devices:
        mapping[root_partition] = root
    else:
        mapping[instance.root_device_name] = root

    return conn.register_image(
        name=self.image_name,
        root_device_name=instance.root_device_name,
        block_device_map=mapping,
        kernel_id=instance.kernel,
        virtualization_type=instance.virtualization_type,
        ramdisk_id=self.platform.get_ramdisk_id(),
        architecture=instance.architecture)
def launch_instance(self):
    """Launch the configured AMI and wait (up to RUN_INSTANCE_TIMEOUT
    seconds) for it to reach the 'running' state.

    EBS-backed AMIs get an explicit root volume sized from config with
    delete-on-termination set; instance-store AMIs get no block map.
    Stores the instance on ``self.instance``; tags it with the configured
    hostname on success.
    """
    if not self.verify_settings():
        return
    # Non-empty result means the AMI is instance-store backed.
    is_instance_store = self.conn.get_all_images(
        self.config['ec2_ami_id'],
        filters={'root-device-type': 'instance-store'})
    if is_instance_store:
        block_map = None
    else:
        block_map = BlockDeviceMapping()
        root_device = self.config['ec2_root_device']
        block_map[root_device] = EBSBlockDeviceType()
        if self.config['ec2_size']:
            block_map[root_device].size = self.config['ec2_size']
        block_map[root_device].delete_on_termination = True
    reservation = self.conn.run_instances(
        self.config['ec2_ami_id'],
        key_name=self.config['ec2_key_name'],
        security_groups=self.config['ec2_security_groups'] or
        [self.config['ec2_security_group']],
        instance_type=self.config['ec2_instance_type'],
        placement=self.config['ec2_zone'],
        placement_group=self.config['ec2_placement_group'],
        monitoring_enabled=self.config['ec2_monitoring_enabled'],
        block_device_map=block_map,
        user_data=self.user_data)
    self.instance = reservation.instances[0]
    secs = RUN_INSTANCE_TIMEOUT
    rest_interval = 5
    # BUG FIX: the original 'while secs and ...' never terminated once
    # secs went negative (any timeout that is not a multiple of the rest
    # interval); 'secs > 0' guarantees the loop ends.
    while secs > 0 and not self.instance.state == 'running':
        time.sleep(rest_interval)
        secs = secs - rest_interval
        try:
            self.instance.update()
        except boto.exception.EC2ResponseError:
            # The instance may not be describable immediately after
            # run_instances; keep polling (deliberate best-effort).
            pass
    if secs <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def create_image(args): AWSACCID = _getawsaccid() conn = boto.ec2.connect_to_region(args.region,aws_access_key_id=AWSAKEY,aws_secret_access_key=AWSSKEY) if args.snapshotid == "" or args.snapshotid is None: print 'You have to pass the snapshot ID used to create the image with --snapshotid="snapid"' raise SystemExit(1) else: namei = raw_input("Enter name of image: ") descr = raw_input("Enter a description for image: ") vtype = raw_input("Enter a virtualization type for image:[hvm|paravirtual] ") print "Creating image from snapshot %s ..." % args.snapshotid ebs = EBSBlockDeviceType() ebs.snapshot_id = args.snapshotid block_map = BlockDeviceMapping() block_map['/dev/sda1'] = ebs print vtype try: if args.region == "eu-west-1": if vtype == "hvm": #ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-71665e05',\ # root_device_name='/dev/sda1', block_device_map=block_map, virtualization_type='hvm') ret = conn.register_image(name=namei,description=descr,architecture='x86_64',\ root_device_name='/dev/sda1', block_device_map=block_map, virtualization_type='hvm') else: ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-71665e05',\ root_device_name='/dev/sda1', block_device_map=block_map, virtualization_type='paravirtual') else: if vtype == "hvm": ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-b6aa75df',\ root_device_name='/dev/sda1', block_device_map=block_map, virtualization_type='hvm') else: ret = conn.register_image(name=namei,description=descr,architecture='x86_64',kernel_id='aki-b6aa75df',\ root_device_name='/dev/sda1', block_device_map=block_map, virtualization_type='paravirtual') print "Image creation successful" except EC2ResponseError: print "Image creation error"
def create_image(name=IMAGE_NAME, description=IMAGE_DESCRIPTION): """ Create an EBS AMI from the build volume. :type name: string :param name: The name of the AMI to use. :type description: string :param description: The description of the AMI. :rtype: class:`boto.ec2.Image` or ``None`` :return: The image produced. """ instance, volume, device_name = get_volume() snapshot = create_snapshot(IMAGE_NAME) image = None if snapshot is None: print red('Cannot create image with no snapshot') else: # Create block device mapping ebs = EBSBlockDeviceType(snapshot_id=snapshot.id, delete_on_termination=True) ephemeral0 = BlockDeviceType(ephemeral_name='ephemeral0') swap = BlockDeviceType(ephemeral_name='ephemeral1') block_map = BlockDeviceMapping() block_map['/dev/sda1'] = ebs block_map['/dev/sda2'] = ephemeral0 block_map['/dev/sda3'] = swap image_id = instance.connection.register_image( name, description, architecture=instance.architecture, kernel_id=get_kernel(), root_device_name='/dev/sda1', block_device_map=block_map) print green('Image id is %s' % image_id) time.sleep(5) image = instance.connection.get_all_images((image_id, ))[0] add_name(image, name) return image
def build_block_device_map(source_image, target_snapshot_id, source_volume_size):
    """Build the block device map used for the copied AMI.

    The map contains a single general-purpose SSD (gp2) root volume of
    *source_volume_size* GiB backed by *target_snapshot_id*, inheriting
    the source root volume's delete-on-termination flag.
    """
    root_name = source_image.root_device_name
    source_root = source_image.block_device_mapping[root_name]

    mapping = BlockDeviceMapping()
    mapping[root_name] = EBSBlockDeviceType(
        snapshot_id=target_snapshot_id,
        size=source_volume_size,
        volume_type='gp2',
        delete_on_termination=source_root.delete_on_termination)
    return mapping
def register_ebs_ami(self, snapshot_id, arch="x86_64", default_ephem_map=True,
                     img_name=None, img_desc=None):
    """Register *snapshot_id* as a new EBS-backed AMI.

    Picks the region/arch-appropriate pvgrub AKI, generates a unique
    name/description when none are given, optionally adds the standard
    ephemeral device mappings, tags the result, and returns the image id.
    """
    # register against snapshot
    try:
        aki = PVGRUB_AKIS[self.region.name][arch]
    except KeyError:
        raise Exception("Unable to find pvgrub hd00 AKI for %s, arch (%s)"
                        % (self.region.name, arch))
    if not img_name:
        # These names need to be unique, hence the pseudo-uuid
        img_name = "EBSHelper AMI - %s - uuid-%x" % (
            snapshot_id, random.randrange(2 ** 32))
    if not img_desc:
        img_desc = "Created directly from volume snapshot %s" % snapshot_id

    self.log.debug("Registering %s as new EBS AMI" % snapshot_id)
    self.create_sgroup("ec2helper-vnc-ssh-%x" % random.randrange(2 ** 32),
                       allow_vnc=True)

    root = EBSBlockDeviceType()
    root.snapshot_id = snapshot_id
    root.delete_on_termination = True
    mapping = BlockDeviceMapping()
    mapping["/dev/sda"] = root
    # The ephemeral mappings are automatic with S3 images.
    # For EBS images we need to make them explicit so the same fstab
    # works on both S3 and EBS images.
    if default_ephem_map:
        for device, ephemeral_name in (("/dev/sdb", "ephemeral0"),
                                       ("/dev/sdc", "ephemeral1")):
            ephemeral = EBSBlockDeviceType()
            ephemeral.ephemeral_name = ephemeral_name
            mapping[device] = ephemeral

    result = self.conn.register_image(
        name=img_name,
        description=img_desc,
        architecture=arch,
        kernel_id=aki,
        root_device_name="/dev/sda",
        block_device_map=mapping,
    )
    # Give EC2 time to make the new AMI describable before tagging it.
    sleep(10)
    new_amis = self.conn.get_all_images([result])
    new_amis[0].add_tag("Name", resource_tag)
    return str(result)
def build(hosts, cred, dry, inventory='hosts'):
    """Create (or reuse) one EC2 instance per host in *hosts*.

    Phase 1 launches any host that has no live (non-terminated) instance
    tagged with its name; phase 2 waits for boot, tags, attaches extra
    disks, and optionally associates an Elastic IP.

    :param hosts: iterable of hostnames; per-host settings come from the
        inventory's VM_PROVIDER vars.
    :param cred: credentials passed through to _connect().
    :param dry: forwarded to run_instances(dry_run=...).
    :param inventory: inventory file name for load_host_vars().
    :return: dict keyed by hostname with 'instance', 'private_ip_address'
        and (when an EIP is allocated) 'public_ip_address'.
    """
    hret = {}
    old_state = {}
    con = None
    # --- Phase 1: launch missing instances -------------------------------
    for h in hosts:
        logger.info(" Run action on host [%s]" % (h))
        hret[h] = {}
        hv = {}
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        # Lazily open one connection, using the first host's region.
        if con is None:
            con = _connect(hvars['region'], cred)
        reservations = con.get_all_reservations(filters={"tag:Name": h})
        old_state[h] = "absent"
        for reservation in reservations:
            instance = reservation.instances[0]
            if instance.state != 'terminated':
                hret[h]['instance'] = instance
                old_state[h] = "present"
                logger.info(" Server [%s] is already present" % (h))
        if old_state[h] == 'present':
            continue
        # Optional larger root volume on /dev/sda1.
        bdm = None
        if 'disk_size' in hvars:
            try:
                dev_sda1 = EBSBlockDeviceType()
                dev_sda1.size = hvars['disk_size']
                dev_sda1.delete_on_termination = True
                bdm = BlockDeviceMapping()
                bdm['/dev/sda1'] = dev_sda1
            except Exception as e:
                logger.error("Error building block device for server: %s" % (e))
                exit(1)
        try:
            reservation = con.run_instances(
                hvars['ami'],
                key_name=hvars['key'],
                instance_type=hvars['vmtype'],
                security_group_ids=[hvars['security']],
                subnet_id=hvars['subnet'],
                block_device_map=bdm,
                dry_run=dry
            )
            hret[h]['instance'] = reservation.instances[0]
        except Exception as e:
            logger.error("Error building server: %s" % (e))
            exit(1)
    # --- Phase 2: wait, tag, attach disks, networking --------------------
    for h in hosts:
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        instance = hret[h]['instance']
        status = instance.update()
        if old_state[h] == 'absent':
            logger.info(" Waiting for [%s] to be launched..." % (h))
            while status == 'pending':
                time.sleep(5)
                status = instance.update()
        if old_state[h] == 'present':
            logger.info(" State is running with IP [%s]" % (instance.private_ip_address))
        elif status == 'running':
            logger.info(" State changed to running with IP [%s]" % (instance.private_ip_address))
        else:
            logger.error(" Status of [%s] is [%s]" % (h, status))
        instance.add_tag("Name", "%s" % (h))
        for cur_tag in hvars['tags']:
            instance.add_tag(cur_tag, hvars['tags'][cur_tag])
        # Extra data volumes only for freshly launched hosts.
        if 'extra_disks' in hvars and old_state[h] == 'absent':
            try:
                for cur_disk in hvars['extra_disks']:
                    cur_vol = con.create_volume(cur_disk['size'], instance.placement)
                    status = cur_vol.status
                    while status != 'available':
                        logger.info(" Waiting for volume [%s] to be launched..." % (cur_vol))
                        time.sleep(10)
                        status = cur_vol.update()
                    con.attach_volume(cur_vol.id, instance.id, '/dev/' + cur_disk['device'])
            except Exception as e:
                logger.error("Error Attaching new disks: %s" % (e))
                exit(1)
        # Name every attached volume <shortname>_disk<N>.
        instance_volumes = con.get_all_volumes(filters={'attachment.instance-id': instance.id})
        for counter, cur_vol in enumerate(instance_volumes):
            cur_vol.add_tag("Name", "%s_disk%d" % (h.split('.')[0], counter))
        hret[h]['private_ip_address'] = instance.private_ip_address
        # If requested assosiate an new elastic IP for the host and create
        # a security group to whitelist external IPs
        if 'assosiate_eip' in hvars and hvars['assosiate_eip'] is True:
            if instance.ip_address is None:
                eip = con.allocate_address()
                con.associate_address(instance.id, eip.public_ip)
                logger.info(" Adding public IP [%s]" % (eip.public_ip))
                hret[h]['public_ip_address'] = eip.public_ip
            # NOTE(review): whitelisting is placed at the assosiate_eip
            # level here (runs even when the instance already had a public
            # IP) — original nesting was ambiguous, confirm intent.
            if 'whitelisted_ips' in hvars:
                logger.info(" Whitelisting IPs [%s]" % (hvars['whitelisted_ips']))
                ips = hvars['whitelisted_ips'].split(',')
                project = hvars['tags']['Project']
                security = hvars['security']
                _create_security_group(con, instance, project, ips, security)
    return hret
def launch_cluster(conn, opts, cluster_name):
    """Launch a cluster: security groups, slaves (spot or on-demand),
    a master (spot, on-demand, or resumed), then tag everything.

    :param conn: boto EC2 connection.
    :param opts: parsed command-line options.
    :param cluster_name: prefix for group names and Name tags.
    :return: ``(master_nodes, slave_nodes)`` lists of boto Instances.
    """
    # Variables substituted into the user-data templates.
    template_vars = {
        'cluster_name': cluster_name,
        'master_security_group': cluster_name + "-master",
        'slave_security_group': cluster_name + "-slaves",
        'discovery_security_group': cluster_name + "-discovery"
    }
    if opts.copy_aws_credentials:
        # Prefer explicit deploy credentials, fall back to the API ones.
        if opts.deploy_aws_key_id:
            template_vars['aws_key'] = opts.deploy_aws_key_id
        else:
            template_vars['aws_key'] = opts.aws_access_key_id
        if opts.deploy_aws_key_secret:
            template_vars['aws_secret'] = opts.deploy_aws_key_secret
        else:
            template_vars['aws_secret'] = opts.aws_secret_access_key

    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, template_vars['master_security_group'], opts.vpc_id)
    slave_group = get_or_make_group(conn, template_vars['slave_security_group'], opts.vpc_id)
    discovery_group = get_or_make_group(conn, template_vars['discovery_security_group'], opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize(src_group=discovery_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize(src_group=discovery_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
    if discovery_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            discovery_group.authorize(src_group=master_group)
            discovery_group.authorize(src_group=slave_group)
            discovery_group.authorize(src_group=discovery_group)
        else:
            discovery_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                      src_group=discovery_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                            die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        all_groups = conn.get_all_security_groups()
        additional_group_ids = []
        for group in opts.additional_security_group.split(','):
            additional_group_ids += [sg.id for sg in all_groups
                                     if group in (sg.name, sg.id)]
    template_vars['security_groups'] = template_vars['discovery_security_group']
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        ebs_devices = []
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device_id = "/dev/sd" + chr(ord('s') + i)
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map[device_id] = device
            # BUG FIX: 'ebs_devices += device_id' extended the list with the
            # individual *characters* of the path; append the whole string.
            ebs_devices.append(device_id)
        template_vars['ebs_devices'] = ' '.join(ebs_devices)

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        local_devices = []
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev
            # BUG FIX: same character-splatter as above; append the path.
            local_devices.append(name)
        template_vars['local_devices'] = ' '.join(local_devices)

    master_user_data_content = get_user_data(opts.master_user_data, template_vars)
    slave_user_data_content = get_user_data(opts.slave_user_data, template_vars)

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f"
              % (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=slave_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            # Deliberately broad: also cancels on KeyboardInterrupt.
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=slave_user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price is not None:
            # Launch spot instance with the requested price
            print("Requesting master as spot instance with price $%.3f" % (opts.spot_price))
            master_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                key_name=opts.key_pair,
                launch_group="master-group-%s" % cluster_name,
                security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            master_req_id = master_reqs[0].id
            print("Waiting for spot instances to be granted...")
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests()
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    master_instance_ids = []
                    if master_req_id in id_to_req and id_to_req[master_req_id].state == "active":
                        master_instance_ids.append(id_to_req[master_req_id].instance_id)
                        print("Master granted")
                        reservations = conn.get_all_reservations(master_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print("Master not granted yet, waiting longer")
            except:
                print("Canceling spot instance request for master")
                conn.cancel_spot_instance_requests([master_req_id])
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                instance_profile_name=opts.instance_profile_name)
            master_nodes = master_res.instances
            # NOTE(review): 'zone' here is whatever the slave loop left
            # behind; opts.zone is probably intended — confirm before changing.
            print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Mesos cluster inside a dedicated VPC.

    Creates (or reuses) the VPC, subnet, internet gateway, route table and
    the master/slave/zoo security groups, then launches slaves (spot or
    on-demand), one or more masters, and ZooKeeper nodes when opts.ft > 1.

    :return: ``(master_nodes, slave_nodes, zoo_nodes)``.
    """
    # Wrap the plain EC2 connection with a VPC connection; subsequent
    # calls go through conn.ec2 / conn.vpc.
    conn = AWSConnection(conn, VPCConnection(region=conn.region))
    print "Setting up VPC..."
    vpc = get_or_make_vpc(conn, cluster_name, 'mesos-vpc')
    print "Using vpc: %s" % (vpc.id)
    print "Setting up subnet..."
    subnet = get_or_make_subnet(conn, vpc.id, opts.zone, cluster_name, 'mesos-subnet')
    print "Using subnet: %s" % (subnet.id)
    # Add internet gateway to VPC.
    print "Creating internet gateway"
    ig = get_or_make_ig(conn, vpc.id, cluster_name, 'mesos-vpc')
    print "Using internet gateway: %s" % (ig.id)
    # Add route to route table
    rt = get_or_make_rt(conn, vpc.id, cluster_name, 'mesos-rt')
    conn.vpc.create_route(rt.id, '0.0.0.0/0', gateway_id=ig.id)
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-masters")
    slave_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-slaves")
    zoo_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-zoo")
    # Only populate rules for groups created just now (empty rule list).
    if master_group.rules == []:  # Group was just now created
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.ec2.get_all_instances()
    for res in reservations:
        group_names = [g.name for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, ("ERROR: There are already instances running in " +
                                  "group %s, %s or %s" % (master_group.name,
                                                          slave_group.name,
                                                          zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    if opts.ami == "latest":
        # Figure out the latest AMI from our static URL
        try:
            opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
        except:
            print >> stderr, "Could not read " + LATEST_AMI_URL
    try:
        image = conn.ec2.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        slave_reqs = conn.ec2.request_spot_instances(
            price = opts.spot_price,
            image_id = opts.ami,
            launch_group = "launch-group-%s" % cluster_name,
            placement = opts.zone,
            count = opts.slaves,
            key_name = opts.key_pair,
            security_groups = [slave_group],
            instance_type = opts.instance_type,
            block_device_map = block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        while True:
            time.sleep(10)
            # NOTE(review): these two calls go through 'conn' directly while
            # every other EC2 call uses 'conn.ec2' — confirm the
            # AWSConnection wrapper forwards them, otherwise this raises.
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = conn.ec2.run_instances(opts.ami,
                                           key_name = opts.key_pair,
                                           subnet_id = subnet.id,
                                           security_group_ids = [slave_group.id],
                                           instance_type = opts.instance_type,
                                           placement = opts.zone,
                                           min_count = opts.slaves,
                                           max_count = opts.slaves,
                                           block_device_map = block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id
    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    # Masters get an ENI with a public IP so they are reachable from outside.
    interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
        subnet_id=subnet.id,
        groups=[master_group.id],
        associate_public_ip_address=True)
    interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
    master_res = conn.ec2.run_instances(opts.ami,
                                        key_name = opts.key_pair,
                                        instance_type = master_type,
                                        placement = opts.zone,
                                        network_interfaces = interfaces,
                                        min_count = opts.ft,
                                        max_count = opts.ft,
                                        block_device_map = block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id
    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = conn.ec2.run_instances(opts.ami,
                                         key_name = opts.key_pair,
                                         subnet_id = subnet.id,
                                         security_group_ids = [zoo_group.id],
                                         instance_type = opts.instance_type,
                                         placement = opts.zone,
                                         min_count = 3,
                                         max_count = 3,
                                         block_device_map = block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def make_ebs_based_image(ami_name, docker_image_name, fstype='ext3',
                         mount_point=None, desc='', arch='x86_64',
                         kernel=None, disk_size=10240):
    """Build an EBS-backed AMI from a local docker image.

    Creates and attaches a fresh EBS volume, formats it, extracts the
    docker image's filesystem onto it, snapshots the volume and registers
    the snapshot as an AMI.

    NOTE(review): the docstring said "size of ebs is passed in Mb", but
    boto's create_volume takes GiB — confirm the intended unit.
    """
    metadata = boto.utils.get_instance_metadata()
    availability_zone = metadata['placement']['availability-zone']
    instance_id = metadata['instance_id']
    # needs a ec2 connection here
    conn = boto.ec2.connection.EC2Connection()
    vol = conn.create_volume(disk_size, availability_zone)
    # Pick a /dev/sdX name that is not already present on this instance.
    devpath = random.choice(
        [devp for devp in ('/dev/sd%s' % c for c in string.ascii_lowercase)
         if not os.path.exists(devp)]
    )
    vol.attach(instance_id, devpath)
    run('/sbin/mkfs -t %s %s' % (fstype, devpath))
    if mount_point is None:
        mount_point = tempfile.mkdtemp('ebs-based-mount-point')
    # Export the docker image's filesystem as a tar stream.
    cid = dockerc.create_container(
        image=docker_image_name, command='/bin/bash', tty=True, volume=['dev']
    )
    export_fileobj = dockerc.export(cid)
    run('/bin/mount -t %s %s %s' % (fstype, devpath, mount_point))
    try:
        with tempfile.TemporaryFile() as fp:
            # Spool the export to a temp file, then untar onto the volume.
            data = export_fileobj.read(2048)
            while data:
                fp.write(data)
                data = export_fileobj.read(2048)
            fp.seek(0)
            tar = tarfile.open(fileobj=fp)
            os.chdir(mount_point)
            tar.extractall()
            tar.close()
            os.chdir(os.pardir)
    finally:
        run('/bin/umount %s' % mount_point)
    vol.detach()
    snapshot = vol.create_snapshot('initial snapshot for ebs')
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snapshot.id
    block_map = BlockDeviceMapping()
    block_map['/dev/sda1'] = ebs
    ami = conn.register_image(
        ami_name,
        description=desc,
        architecture=arch,
        kernel_id=kernel,
        # BUG FIX: was 'dev/sda1' (missing leading slash), which did not
        # match the '/dev/sda1' entry in the block device map.
        root_device_name='/dev/sda1',
        block_device_map=block_map
    )
    # NOTE(review): boto's register_image returns the image id string, so
    # 'ami.id' would raise AttributeError; log the id directly.
    log.info('ebs-based ami: %s' % ami)
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
                 avz=def_default_avz, rt=def_default_requesttype,
                 group_name='oggmssh', ssh_port=22, cidr='0.0.0.0/0'):
    """
    Request and prepare single instance.

    :param cn: cluster name used as the node-name prefix.
    :param inst_type_idx: index into the module-level instance_infos table.
    :param idn: node number within the cluster.
    :param avz: availability zone, e.g. 'eu-west-1a'; avz[:-1] is the region.
    :param rt: request type, 'spot' for spot instances, anything else for
        on-demand.
    :param group_name: security group to use/create for SSH access.
    :param ssh_port: port opened in the security group.
    :param cidr: CIDR block allowed to connect on ssh_port.
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()
    # Check if ssh keypair exists
    key_name = get_keypair_name()
    check_keypair(cloud, key_name)
    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1
    # Optional persistent user volume (attached later as /dev/sdf).
    dev_sdf_vol = get_user_persist_ebs(cloud, avz)
    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(group_name,
                                                'A group that allows SSH access')
        else:
            raise
    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise
    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)
    if rt == 'spot':
        # Spot request: wait for fulfillment before proceeding.
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(
            def_price,
            def_ami[avz[:-1]],
            count=1,
            type='one-time',
            security_groups=[group_name],
            key_name=key_name,
            placement=avz,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        # On-demand instance.
        print("placing node in ", avz)
        reservation = cloud.run_instances(
            image_id=def_ami[avz[:-1]],
            key_name=key_name,
            placement=avz,
            security_groups=[group_name],
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))
    # Poll until the node is running.
    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)
    log_with_ts("booted node " + str(idn))
    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")
    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)
    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})
    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))
    update_key_filename(node.region.name)
    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)
    log_with_ts("finished node " + str(idn))
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
                 avz=def_default_avz, rt=def_default_requesttype,
                 group_name='oggmssh', ssh_port=22, cidr='0.0.0.0/0'):
    """
    Request and prepare single instance, optionally inside a VPC.

    Resolves the VPC/subnet configured for the availability zone in
    ``def_subnet`` (falling back to non-VPC operation when no entry
    exists or the lookup fails), launches one EC2 instance (spot or
    on-demand depending on ``rt``), waits until it is running and
    reachable over SSH, tags it, and attaches/mounts an optional
    persistent user volume.

    Parameters
    ----------
    cn : str
        Cluster name; prefix for the node's ``Name`` tag.
    inst_type_idx : int
        Index into the module-level ``instance_infos`` table.
    idn : int
        Node number within the cluster.
    avz : str
        Availability zone (e.g. ``'eu-west-1a'``); the region is
        ``avz[:-1]``.
    rt : str
        ``'spot'`` places a spot request, anything else launches
        on-demand.
    group_name : str
        Security group to use (created if missing).
    ssh_port : int
        TCP port authorized for SSH.
    cidr : str
        CIDR block allowed to reach ``ssh_port``.
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    vpcconn = VPCConnection(region=cloud.region)

    # Resolve the VPC/subnet configured for this AZ; fall back to
    # non-VPC (classic) behaviour when the zone has no entry or the
    # VPC lookup fails.
    # BUGFIX: this was a bare "except:", which also swallowed
    # SystemExit and KeyboardInterrupt; narrowed to Exception so the
    # fallback only applies to ordinary errors (KeyError, boto errors).
    try:
        vpc_id, subnet_id = def_subnet[avz]
        vpc = vpcconn.get_all_vpcs(vpc_ids=[vpc_id])[0]
    except Exception:
        vpc_id = None
        subnet_id = None
        vpc = None

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name(avz[:-1])
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    # Optional persistent EBS user volume (may be None).
    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(
                group_name, 'A group that allows SSH access')
        else:
            raise

    # Authorize all Intra-VPC traffic
    if vpc is not None:
        try:
            group.authorize('-1', -1, -1, vpc.cidr_block)
        except cloud.ResponseError as e:
            if e.code != 'InvalidPermission.Duplicate':
                raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(
            def_price, def_ami[avz[:-1]], count=1, type='one-time',
            security_group_ids=[group.id], key_name=key_name, placement=avz,
            subnet_id=subnet_id, ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        req_ids = [request.id for request in requests]
        # Block until the spot requests are fulfilled.
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        print("placing node in ", avz)
        reservation = cloud.run_instances(
            image_id=def_ami[avz[:-1]], key_name=key_name, placement=avz,
            subnet_id=subnet_id, security_group_ids=[group.id],
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))

    # Poll until the instance reports the 'running' state.
    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)
    log_with_ts("booted node " + str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node " + str(idn))
def launch_cluster(conn, opts, cluster_name):
    """
    Launch a Shark/ampcamp exercise cluster: one master and
    ``opts.slaves`` slaves (no ZooKeeper nodes are launched here).

    Sets up the three fixed security groups, aborts if a cluster with
    the same ``cluster`` tag is already active, launches the slaves
    (spot or on-demand) and the master, tags all instances, and returns
    ``(master_nodes, slave_nodes, zoo_nodes)``.
    """
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, "shark-exp-master")
    slave_group = get_or_make_group(conn, "shark-exp-slaves")
    zoo_group = get_or_make_group(conn, "ampcamp-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
            master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
            master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
            master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
            # hbase
            # NOTE(review): nesting of the HBase ports under the mesos
            # branch was reconstructed from collapsed formatting — confirm.
            master_group.authorize('tcp', 60010, 60010, '0.0.0.0/0')
            master_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
            slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
            slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
            slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
            # hbase
            slave_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        for instance in res.instances:
            if 'tags' in instance.__dict__ and 'cluster' in instance.tags:
                if instance.tags['cluster'] == cluster_name and is_active(instance):
                    print >> stderr, ("ERROR: Instances %s is already running in cluster %s"
                                      % (instance.id, cluster_name))
                    sys.exit(1)

    # Resolve symbolic AMI names to a concrete AMI id.
    if opts.ami in ["latest", "standalone"]:
        opts.ami = get_ami(opts.ami)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        # NOTE(review): bare except reports any failure as a missing AMI.
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until all requests are active.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    # Create the right tags
    tags = {}
    tags['cluster'] = cluster_name
    tags['type'] = 'slave'
    for node in slave_nodes:
        conn.create_tags([node.id], tags)
    tags['type'] = 'master'
    for node in master_nodes:
        conn.create_tags([node.id], tags)

    # No ZooKeeper nodes are launched by this variant.
    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): #Remove known hosts to avoid "Offending key for IP ..." errors. known_hosts = os.environ['HOME'] + "/.ssh/known_hosts" if os.path.isfile(known_hosts): os.remove(known_hosts) if opts.key_pair is None: opts.key_pair = keypair() if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) if opts.profile is None: opts.profile = profile() if opts.profile is None: print >> stderr, "ERROR: No profile found in current host. It be provided with -p option." sys.exit(1) public_key = pub_key() user_data = Template("""#!/bin/bash set -e -x echo '$public_key' >> ~root/.ssh/authorized_keys echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute( public_key=public_key) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=sparknotebook_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=sparknotebook_group) 
slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group) if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: best_price = find_best_price(conn, opts.instance_type, zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ( "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" % (opts.slaves, best_price, opts.slaves * best_price)) num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection( interface) slave_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces) my_req_ids += [req.id for req in slave_reqs] i += 1 print >> stderr, "Waiting for spot instances to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print >> stderr, "All %d slaves granted" % opts.slaves reservations = 
conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_group_ids=[slave_group.id], instance_type=opts.instance_type, subnet_id=subnetId(), placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile) slave_nodes += slave_res.instances print >> stderr, "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name if opts.spot_price != None: best_price = find_best_price(conn, master_type, opts.zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ( "Requesting master as spot instances with price $%.3f/hour" % (best_price)) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection( interface) master_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=opts.zone, count=1, key_name=opts.key_pair, instance_type=master_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces) my_req_ids = [r.id for r in master_reqs] print >> stderr, "Waiting for spot instance to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests( request_ids=my_req_ids) id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append( id_to_req[i].instance_id) if len(active_instance_ids) == 1: print >> stderr, "Master granted" reservations = conn.get_all_instances( active_instance_ids) master_nodes = [] for r in reservations: master_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d masters granted, waiting longer" % ( len(active_instance_ids), 1) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched 
instances: (master_nodes, master_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(master_nodes) if running: print >> stderr, ( "WARNING: %d instances are still running" % running) sys.exit(0) else: master_res = image.run(key_name=opts.key_pair, security_group_ids=[master_group.id], instance_type=master_type, subnet_id=subnetId(), placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile) master_nodes = master_res.instances print >> stderr, "Launched master in %s, regid = %s" % ( zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """
    Launch (or resume) a Spark cluster: one master and ``opts.slaves``
    slaves in EC2-Classic security groups named after ``cluster_name``.

    Validates CLI options, creates/authorizes the security groups,
    builds the EBS/ephemeral block device mapping, launches the slaves
    (spot or on-demand) and the master, tags every instance with a
    descriptive Name, and returns ``(master_nodes, slave_nodes)``.
    Exits the process on fatal errors.
    """
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    # Optional user-data script passed verbatim to every instance.
    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    # Extra security groups the user asked to attach to every instance.
    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [sg
                             for sg in conn.get_all_security_groups()
                             if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        # NOTE(review): bare except reports any failure as a missing AMI.
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            # NOTE(review): string.letters is Python 2 only
            # (string.ascii_letters on Python 3).
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        # Spread the slaves across the requested zones.
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            # Poll every 10s; the bare except also catches Ctrl-C so
            # outstanding requests get cancelled on abort.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] + additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] + additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        # NOTE(review): 'zone' here is the last zone from the slave loop,
        # not opts.zone where the master was actually placed — confirm.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """
    Launch (or resume) a Spark cluster: one master and ``opts.slaves``
    slaves, with optional VPC placement (``opts.vpc_id`` /
    ``opts.subnet_id``).

    Validates CLI options, creates/authorizes the security groups
    (VPC groups need explicit per-protocol rules), builds the
    EBS/ephemeral block device mapping, launches the slaves (spot or
    on-demand) and the master, waits for metadata propagation, tags all
    instances, and returns ``(master_nodes, slave_nodes)``.
    Exits the process on fatal errors.
    """
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)
    # Optional user-data script passed verbatim to every instance.
    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
        else:
            # VPC groups cannot use the blanket src_group form; authorize
            # icmp/tcp/udp explicitly for both directions of intra-cluster traffic.
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        # Rstudio (GUI for R) needs port 8787 for web access
        master_group.authorize('tcp', 8787, 8787, authorized_address)
        # HDFS NFS gateway requires 111,2049,4242 for tcp & udp
        master_group.authorize('tcp', 111, 111, authorized_address)
        master_group.authorize('udp', 111, 111, authorized_address)
        master_group.authorize('tcp', 2049, 2049, authorized_address)
        master_group.authorize('udp', 2049, 2049, authorized_address)
        master_group.authorize('tcp', 4242, 4242, authorized_address)
        master_group.authorize('udp', 4242, 4242, authorized_address)
        # RM in YARN mode uses 8088
        master_group.authorize('tcp', 8088, 8088, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)
        #Kylix
        # NOTE(review): nesting of the Kylix rules inside the
        # "slave_group.rules == []" branch reconstructed from collapsed
        # formatting — confirm against upstream.
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=master_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=master_group)
        master_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        master_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        # NOTE(review): bare except reports any failure as a missing AMI.
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            # NOTE(review): string.letters is Python 2 only, although this
            # variant otherwise uses print() — confirm target interpreter.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        # Spread the slaves across the requested zones.
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            # Poll every 10s; the bare except also catches Ctrl-C so
            # outstanding requests get cancelled on abort.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)
        master_nodes = master_res.instances
        # NOTE(review): 'zone' here is the last zone from the slave loop,
        # not opts.zone where the master was actually placed — confirm.
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )
    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )
    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There 
are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.letters[i + 1] block_map[name] = dev # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data_content) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names for master in master_nodes: master.add_tag(key='Name', value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) for slave in slave_nodes: slave.add_tag(key='Name', value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") else: master_group = get_or_make_group(conn, opts.security_group_prefix + "-master") slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running with the cluster name 
existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] outstanding_request_ids = [] for i in my_req_ids: if i in id_to_req: if id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) else: outstanding_request_ids.append(i) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer for request ids including %s" % ( len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10]) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names # TODO: Add retry logic for tagging with name since it's used to identify a cluster. for master in master_nodes: name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id) for i in range(0, 5): try: master.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) for slave in slave_nodes: name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id) for i in range(0, 5): try: slave.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") else: master_group = get_or_make_group( conn, opts.security_group_prefix + "-master") slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves") authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, authorized_address) master_group.authorize('tcp', 8080, 8081, authorized_address) master_group.authorize('tcp', 18080, 18080, authorized_address) master_group.authorize('tcp', 19999, 19999, authorized_address) master_group.authorize('tcp', 50030, 50030, authorized_address) master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) slave_group.authorize('tcp', 8080, 8081, authorized_address) slave_group.authorize('tcp', 50060, 50060, authorized_address) slave_group.authorize('tcp', 50075, 50075, authorized_address) slave_group.authorize('tcp', 60060, 
60060, authorized_address) slave_group.authorize('tcp', 60075, 60075, authorized_address) # Check if instances are already running with the cluster name existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) additional_groups = [] if opts.additional_security_group: additional_groups = [ sg for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id) ] print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] outstanding_request_ids = [] for i in my_req_ids: if i in id_to_req: if id_to_req[i].state == "active": active_instance_ids.append( id_to_req[i].instance_id) else: outstanding_request_ids.append(i) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer for request ids including %s" % ( len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10]) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group] + additional_groups, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names # TODO: Add retry logic for tagging with name since it's used to identify a cluster. for master in master_nodes: name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id) for i in range(0, 5): try: master.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) for slave in slave_nodes: name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id) for i in range(0, 5): try: slave.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) # Return all the instances return (master_nodes, slave_nodes)
def create_nodes(self, reactor, names, distribution, metadata=None):
    """
    Create nodes with the given names.

    :param reactor: The reactor.
    :param names: The names of the nodes.
    :type names: list of str
    :param str distribution: The name of the distribution to
        install on the nodes.
    :param dict metadata: Metadata to associate with the nodes.
        Defaults to an empty dict; never mutated.
    :return: A list of ``Deferred``s each firing with an INode
        when the corresponding node is created.   The list has
        the same order as :param:`names`.
    """
    # BUG FIX: use None as the default instead of a shared mutable
    # default argument ({}), which is a classic Python pitfall.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 8

    action = start_action(
        action_type=u"flocker:provision:aws:create_nodes",
        instance_count=len(names),
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    )
    with action.context():
        # 8 GiB root EBS volume, destroyed together with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )

        instances = self._run_nodes(count=len(names), image_id=images[0].id,
                                    size=size, diskmap=diskmap)

        def make_node(ignored, name, instance):
            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )

        results = []
        # izip_longest pads with None when fewer instances than names
        # were launched; those positions become failed Deferreds.
        for name, instance in izip_longest(names, instances):
            if instance is None:
                results.append(fail(Exception("Could not run instance")))
            else:
                # Per-node copy so each node gets its own Name entry
                # without mutating the caller's dict.
                node_metadata = metadata.copy()
                node_metadata['Name'] = name
                d = self._async_get_node(reactor, instance, node_metadata)
                d = DeferredContext(d)
                d.addCallback(make_node, name, instance)
                results.append(d.result)
        action_completion = DeferredContext(DeferredList(results))
        action_completion.addActionFinish()
        # Individual results and errors should be consumed by the caller,
        # so we can leave action_completion alone now.
        return results
def launch_cluster(conn, opts, cluster_name):
    """Launch an EC2 cluster with master, slave and ZooKeeper groups.

    Creates/configures the three security groups, launches ``opts.slaves``
    slaves (spot or on-demand), ``opts.ft`` masters, and — when ``opts.ft > 1``
    — three ZooKeeper nodes.  Returns ``(master_nodes, slave_nodes, zoo_nodes)``
    as lists of boto instance objects.  Exits the process on fatal errors.
    """
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        # NOTE(review): this collects group *ids* but compares against group
        # *names* below — works only if boto's ``g.id`` holds the name here;
        # verify against the boto version in use.
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, ("ERROR: There are already instances running in " +
                                  "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price = opts.spot_price,
            image_id = opts.ami,
            launch_group = "launch-group-%s" % cluster_name,
            placement = opts.zone,
            count = opts.slaves,
            key_name = opts.key_pair,
            security_groups = [slave_group],
            instance_type = opts.instance_type,
            block_device_map = block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until every spot request is active.
        # NOTE(review): ``id_to_req[i]`` is unguarded — it raises KeyError if
        # AWS has not yet listed a request; later variants of this script add
        # an ``i in id_to_req`` check.  There is also no cancellation of the
        # outstanding requests on interrupt.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = opts.zone,
                              min_count = opts.slaves,
                              max_count = opts.slaves,
                              block_device_map = block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id
    # Launch masters; opts.ft of them (presumably the fault-tolerance
    # count — TODO confirm against the option parser).
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = opts.ft,
                           max_count = opts.ft,
                           block_device_map = block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id
    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = image.run(key_name = opts.key_pair,
                            security_groups = [zoo_group],
                            instance_type = opts.instance_type,
                            placement = opts.zone,
                            min_count = 3,
                            max_count = 3,
                            block_device_map = block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch an EC2 cluster with master, slave and ZooKeeper groups.

    GraphLab variant: ``opts.ami`` may be the literal strings "std" or "hpc",
    which are resolved to concrete AMI ids by fetching STD_AMI_URL /
    HVM_AMI_URL.  Launches ``opts.slaves`` slaves (spot or on-demand) and one
    master; no ZooKeeper nodes are launched (``zoo_nodes`` is always empty).
    Returns ``(master_nodes, slave_nodes, zoo_nodes)``.  Exits the process
    on fatal errors.
    """
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    # master_group = get_or_make_group(conn, cluster_name)
    # slave_group = get_or_make_group(conn, cluster_name)
    # zoo_group = get_or_make_group(conn, cluster_name)
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        # NOTE(review): collects group *ids* but compares against group
        # *names* below — works only if boto's ``g.id`` holds the name here;
        # verify against the boto version in use.
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    # Resolve the "std"/"hpc" AMI aliases to real AMI ids over HTTP.
    # NOTE(review): on fetch failure only a warning is printed and opts.ami
    # stays "std"/"hpc", so the get_all_images lookup below will then fail.
    if opts.ami == "std":
        try:
            opts.ami = urllib2.urlopen(STD_AMI_URL).read().strip()
            print "GraphLab AMI for Standard Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + STD_AMI_URL
    elif opts.ami == "hpc":
        try:
            opts.ami = urllib2.urlopen(HVM_AMI_URL).read().strip()
            print "GraphLab AMI for HPC Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + HVM_AMI_URL
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until every spot request is active.
        # NOTE(review): ``id_to_req[i]`` is unguarded — raises KeyError if AWS
        # has not yet listed a request — and there is no cancellation of the
        # outstanding requests on interrupt.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id
    # Launch a single master
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id
    # This variant never launches ZooKeeper nodes.
    zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, num_nodes, cluster_name):
    """Launch ``num_nodes`` slave instances for the named cluster.

    Creates/configures the ``<cluster>-slaves`` security group, launches
    either spot or on-demand instances (spread across the zones returned by
    ``get_zones``), tags them, and returns the list of boto instance objects.

    Fix: the "All %d spot instances granted" message previously printed
    ``num_nodes + 1`` even though the loop breaks at exactly ``num_nodes``
    granted requests, overstating the count by one.
    """
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=slave_group)
        else:
            # In a VPC, intra-group trust must be granted per protocol.
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                           die_on_error=False)
    if existing_slaves:
        print("ERROR: There are already instances running in group %s" %
              slave_group.name, file=stderr)
        sys.exit(1)

    if opts.ami is None:
        print("ERROR: AMI is not set, exit")
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (num_nodes, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == num_nodes:
                    # Was "% (num_nodes + 1)" — off by one versus the
                    # condition above, which requires exactly num_nodes.
                    print("All %d spot instances granted" % num_nodes)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slave spot instances granted, waiting longer" % (
                        len(active_instance_ids), num_nodes))
        except:
            # Bare except is deliberate: also catches KeyboardInterrupt so a
            # Ctrl-C during the wait cancels the outstanding spot requests.
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            slave_nodes = get_existing_cluster(conn, opts, cluster_name,
                                               die_on_error=False)
            running = len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        print("WARNING: --spot-price was not set; consider launch slaves as spot instances to save money")
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )
    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return slave_nodes
def update_ami(aws_svc, encrypted_ami, updater_ami, encrypted_ami_name,
               subnet_id=None, security_group_ids=None,
               enc_svc_class=encryptor_service.EncryptorService,
               guest_instance_type='m3.medium',
               updater_instance_type='m3.medium',
               instance_config=None,
               status_port=encryptor_service.ENCRYPTOR_STATUS_PORT):
    """Build a new encrypted AMI whose metavisor volumes come from a fresh
    updater instance while preserving the guest AMI's root volume.

    High-level flow (Steps 1-7 below): launch the existing encrypted guest
    in 'updater' solo mode plus a metavisor updater instance in the same
    zone; wait for the updater service to finish; stop both instances; swap
    the guest's old metavisor disks for snapshots/volumes taken from the
    updater; register the result as a new AMI and tag its snapshots.

    :param aws_svc: service wrapper providing run_instance/create_tags/etc.
    :param encrypted_ami: ID of the existing encrypted guest AMI.
    :param updater_ami: ID of the metavisor updater AMI.
    :param encrypted_ami_name: name for the newly created AMI.
    :param subnet_id: optional subnet to launch into.
    :param security_group_ids: optional security groups; when omitted a
        temporary group opened on ``status_port`` is created and cleaned up.
    :param enc_svc_class: factory for the encryptor status-polling client.
    :param instance_config: optional InstanceConfig; a default is created.
    :return: the ID of the newly created AMI.

    Any instances/volumes/temporary security group created here are passed
    to ``clean_up`` in the ``finally`` block, whether or not we succeed.
    """
    # Resources to tear down in the finally block.
    encrypted_guest = None
    updater = None
    mv_root_id = None
    temp_sg_id = None
    if instance_config is None:
        instance_config = InstanceConfig()
    try:
        guest_image = aws_svc.get_image(encrypted_ami)

        # Step 1. Launch encrypted guest AMI
        # Use 'updater' mode to avoid chain loading the guest
        # automatically. We just want this AMI/instance up as the
        # base to create a new AMI and preserve license
        # information embedded in the guest AMI
        log.info("Launching encrypted guest/updater")
        instance_config.brkt_config['solo_mode'] = 'updater'
        instance_config.brkt_config['status_port'] = status_port
        encrypted_guest = aws_svc.run_instance(
            encrypted_ami,
            instance_type=guest_instance_type,
            ebs_optimized=False,
            subnet_id=subnet_id,
            user_data=json.dumps(instance_config.brkt_config))
        aws_svc.create_tags(
            encrypted_guest.id,
            name=NAME_GUEST_CREATOR,
            description=DESCRIPTION_GUEST_CREATOR % {'image_id': encrypted_ami}
        )
        # Run updater in same zone as guest so we can swap volumes
        user_data = instance_config.make_userdata()
        compressed_user_data = gzip_user_data(user_data)

        # If the user didn't specify a security group, create a temporary
        # security group that allows brkt-cli to get status from the updater.
        run_instance = aws_svc.run_instance
        if not security_group_ids:
            vpc_id = None
            if subnet_id:
                subnet = aws_svc.get_subnet(subnet_id)
                vpc_id = subnet.vpc_id
            temp_sg_id = create_encryptor_security_group(
                aws_svc, vpc_id=vpc_id, status_port=status_port).id
            security_group_ids = [temp_sg_id]

            # Wrap with a retry, to handle eventual consistency issues with
            # the newly-created group.
            run_instance = aws_svc.retry(
                aws_svc.run_instance,
                error_code_regexp='InvalidGroup\.NotFound'
            )
        updater = run_instance(
            updater_ami,
            instance_type=updater_instance_type,
            user_data=compressed_user_data,
            ebs_optimized=False,
            subnet_id=subnet_id,
            placement=encrypted_guest.placement,
            security_group_ids=security_group_ids)
        aws_svc.create_tags(
            updater.id,
            name=NAME_METAVISOR_UPDATER,
            description=DESCRIPTION_METAVISOR_UPDATER,
        )
        wait_for_instance(aws_svc, encrypted_guest.id, state="running")
        log.info("Launched guest: %s Updater: %s" %
                 (encrypted_guest.id, updater.id)
        )

        # Step 2. Wait for the updater to finish and stop the instances
        # The guest is stopped immediately; only the updater needs to run
        # while we poll the encryptor status service.
        aws_svc.stop_instance(encrypted_guest.id)
        updater = wait_for_instance(aws_svc, updater.id, state="running")
        host_ips = []
        if updater.ip_address:
            host_ips.append(updater.ip_address)
        if updater.private_ip_address:
            host_ips.append(updater.private_ip_address)
            log.info('Adding %s to NO_PROXY environment variable' %
                     updater.private_ip_address)
            # Status polling of the private address must bypass any proxy.
            if os.environ.get('NO_PROXY'):
                os.environ['NO_PROXY'] += "," + \
                    updater.private_ip_address
            else:
                os.environ['NO_PROXY'] = updater.private_ip_address
        enc_svc = enc_svc_class(host_ips, port=status_port)
        log.info('Waiting for updater service on %s (port %s on %s)',
                 updater.id, enc_svc.port, ', '.join(host_ips))
        wait_for_encryptor_up(enc_svc, Deadline(600))
        try:
            wait_for_encryption(enc_svc)
        except Exception as e:
            # Stop the updater instance, to make the console log available.
            encrypt_ami.stop_and_wait(aws_svc, updater.id)
            log_exception_console(aws_svc, e, updater.id)
            raise
        aws_svc.stop_instance(updater.id)
        # Both instances must be fully stopped before we detach volumes.
        encrypted_guest = wait_for_instance(
            aws_svc, encrypted_guest.id, state="stopped")
        updater = wait_for_instance(aws_svc, updater.id, state="stopped")
        guest_bdm = encrypted_guest.block_device_mapping
        updater_bdm = updater.block_device_mapping

        # Step 3. Detach old BSD drive(s) and delete from encrypted guest
        # Paravirtual guests carry three metavisor partitions; HVM guests
        # only the root device.
        if guest_image.virtualization_type == 'paravirtual':
            d_list = ['/dev/sda1', '/dev/sda2', '/dev/sda3']
        else:
            d_list = [encrypted_guest.root_device_name]
        for d in d_list:
            log.info("Detaching old metavisor disk: %s from %s" %
                     (guest_bdm[d].volume_id, encrypted_guest.id))
            aws_svc.detach_volume(guest_bdm[d].volume_id,
                                  instance_id=encrypted_guest.id,
                                  force=True
            )
            aws_svc.delete_volume(guest_bdm[d].volume_id)

        # Step 4. Snapshot MV volume(s)
        log.info("Creating snapshots")
        if guest_image.virtualization_type == 'paravirtual':
            description = DESCRIPTION_SNAPSHOT % {'image_id': updater.id}
            snap_root = aws_svc.create_snapshot(
                updater_bdm['/dev/sda2'].volume_id,
                name=NAME_METAVISOR_ROOT_SNAPSHOT,
                description=description
            )
            snap_log = aws_svc.create_snapshot(
                updater_bdm['/dev/sda3'].volume_id,
                name=NAME_METAVISOR_LOG_SNAPSHOT,
                description=description
            )
            wait_for_snapshots(aws_svc, snap_root.id, snap_log.id)
            dev_root = EBSBlockDeviceType(volume_type='gp2',
                                          snapshot_id=snap_root.id,
                                          delete_on_termination=True)
            dev_log = EBSBlockDeviceType(volume_type='gp2',
                                         snapshot_id=snap_log.id,
                                         delete_on_termination=True)
            guest_bdm['/dev/sda2'] = dev_root
            guest_bdm['/dev/sda3'] = dev_log
            # Use updater as base instance for create_image
            boot_snap_name = NAME_METAVISOR_GRUB_SNAPSHOT
            root_device_name = updater.root_device_name
            guest_root = '/dev/sda5'
            d_list.append(guest_root)
        else:
            # Use guest_instance as base instance for create_image
            boot_snap_name = NAME_METAVISOR_ROOT_SNAPSHOT
            root_device_name = guest_image.root_device_name
            guest_root = '/dev/sdf'
            d_list.append(guest_root)

        # Preserve volume type for any additional attached volumes
        for d in guest_bdm.keys():
            if d not in d_list:
                log.debug("Preserving volume type for disk %s", d)
                vol_id = guest_bdm[d].volume_id
                vol = aws_svc.get_volume(vol_id)
                guest_bdm[d].volume_type = vol.type

        # Step 5. Move new MV boot disk to base instance
        log.info("Detach boot volume from %s" % (updater.id,))
        mv_root_id = updater_bdm['/dev/sda1'].volume_id
        aws_svc.detach_volume(mv_root_id,
                              instance_id=updater.id,
                              force=True
        )

        # Step 6. Attach new boot disk to guest instance
        log.info("Attaching new metavisor boot disk: %s to %s" %
                 (mv_root_id, encrypted_guest.id)
        )
        aws_svc.attach_volume(mv_root_id, encrypted_guest.id, root_device_name)
        encrypted_guest = encrypt_ami.wait_for_volume_attached(
            aws_svc, encrypted_guest.id, root_device_name)
        guest_bdm[root_device_name] = \
            encrypted_guest.block_device_mapping[root_device_name]
        guest_bdm[root_device_name].delete_on_termination = True
        guest_bdm[root_device_name].volume_type = 'gp2'
        guest_root_vol_id = guest_bdm[guest_root].volume_id
        guest_root_vol = aws_svc.get_volume(guest_root_vol_id)
        guest_bdm[guest_root].volume_type = guest_root_vol.type

        # Step 7. Create new AMI. Preserve billing/license info
        log.info("Creating new AMI")
        ami = aws_svc.create_image(
            encrypted_guest.id,
            encrypted_ami_name,
            description=guest_image.description,
            no_reboot=True,
            block_device_mapping=guest_bdm
        )
        wait_for_image(aws_svc, ami)
        image = aws_svc.get_image(ami, retry=True)
        aws_svc.create_tags(
            image.block_device_mapping[root_device_name].snapshot_id,
            name=boot_snap_name,
        )
        aws_svc.create_tags(
            image.block_device_mapping[guest_root].snapshot_id,
            name=NAME_ENCRYPTED_ROOT_SNAPSHOT,
        )
        aws_svc.create_tags(ami)
        return ami
    finally:
        # Best-effort cleanup of everything we may have created above.
        instance_ids = set()
        volume_ids = set()
        sg_ids = set()
        if encrypted_guest:
            instance_ids.add(encrypted_guest.id)
        if updater:
            instance_ids.add(updater.id)
        if mv_root_id:
            volume_ids.add(mv_root_id)
        if temp_sg_id:
            sg_ids.add(temp_sg_id)
        clean_up(aws_svc, instance_ids=instance_ids,
                 volume_ids=volume_ids,
                 security_group_ids=sg_ids)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, instance_profile_name="spark-node", min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, instance_profile_name="spark-node", block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def create_node(self, name, distribution, size=None, disk_size=8,
                metadata=None):
    """Provision a single EC2 instance and wait until it is running.

    :param name: Value used for the instance's ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param size: EC2 instance type; defaults to ``self._default_size``.
    :param disk_size: Root EBS volume size (units as interpreted by EC2).
    :param metadata: Optional dict of extra tags for the instance.
    :return: An ``AWSNode`` wrapping the launched instance.
    """
    if size is None:
        size = self._default_size
    # Fix: the previous signature used the mutable default ``metadata={}``,
    # which is shared across calls. Callers passing nothing see no change.
    if metadata is None:
        metadata = {}

    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        metadata = metadata.copy()
        metadata['Name'] = name

        # Root volume is deleted with the instance so nodes don't leak EBS
        # volumes.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )

        with start_action(
            action_type=u"flocker:provision:aws:create_node:run_instances",
        ) as context:
            reservation = self._connection.run_instances(
                images[0].id,
                key_name=self._keyname,
                instance_type=size,
                security_groups=self._security_groups,
                block_device_map=diskmap,
                placement=self._zone,
                # On some operating systems, a tty is requried for sudo.
                # Since AWS systems have a non-root user as the login,
                # disable this, so we can use sudo with conch.
                user_data=dedent("""\
                    #!/bin/sh
                    sed -i '/Defaults *requiretty/d' /etc/sudoers
                    """),
            )
            instance = reservation.instances[0]
            context.add_success_fields(instance_id=instance.id)

        self._connection.create_tags([instance.id], metadata)

        # Display state as instance starts up, to keep user informed that
        # things are happening.
        _wait_until_running(instance)

        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution,
        )
def create_node(self, name, distribution, size=None, disk_size=8,
                metadata=None):
    """Provision a single EC2 instance and wait until it is running.

    :param name: Value used for the instance's ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param size: EC2 instance type; defaults to ``self._default_size``.
    :param disk_size: Root EBS volume size (units as interpreted by EC2).
    :param metadata: Optional dict of extra tags for the instance.
    :return: An ``AWSNode`` wrapping the launched instance.
    """
    if size is None:
        size = self._default_size
    # Fix: the previous signature used the mutable default ``metadata={}``,
    # which is shared across calls. Callers passing nothing see no change.
    if metadata is None:
        metadata = {}

    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        metadata = metadata.copy()
        metadata['Name'] = name

        # Root volume is deleted with the instance so nodes don't leak EBS
        # volumes.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )

        with start_action(
            action_type=u"flocker:provision:aws:create_node:run_instances",
        ) as context:
            reservation = self._connection.run_instances(
                images[0].id,
                key_name=self._keyname,
                instance_type=size,
                security_groups=self._security_groups,
                block_device_map=diskmap,
                placement=self._zone,
                # On some operating systems, a tty is requried for sudo.
                # Since AWS systems have a non-root user as the login,
                # disable this, so we can use sudo with conch.
                user_data=dedent("""\
                    #!/bin/sh
                    sed -i '/Defaults *requiretty/d' /etc/sudoers
                    """),
            )
            instance = reservation.instances[0]
            context.add_success_fields(instance_id=instance.id)

        self._connection.create_tags([instance.id], metadata)

        # Display state as instance starts up, to keep user informed that
        # things are happening.
        _wait_until_running(instance)

        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution,
        )
def create_cluster(conn, args):
    """Launch ``args.node_count`` instances for a Cassandra-style cluster.

    Creates/reuses the cluster security group, reuses any already-running
    nodes (re-tagging them), otherwise launches fresh instances with an
    appropriate block-device mapping. Returns the list of instances.

    Fixes versus the previous revision:
    - ``image.run`` was called with ``instance_type=""`` which ignored
      ``args.instance_type``; it now honours the requested type.
    - A stray ``if __name__ == '__main__':`` guard inside the security-group
      setup silently skipped rule creation when this module was imported;
      the guard is removed so rules are always applied to a new group.
    """
    if args.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if args.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    # make or get the security group.
    security_group = get_or_make_group(conn, args.name, args.vpc_id)

    # set the inbound permission rules
    if len(security_group.rules) == 0:
        if args.vpc_id is None:
            security_group.authorize(src_group=security_group)
        else:
            security_group.authorize('tcp', 22, 22, args.authorized_address)
            security_group.authorize('tcp', 8888, 8888, args.authorized_address)
            security_group.authorize('tcp', 7000, 7000, args.authorized_address)
            security_group.authorize('tcp', 7001, 7001, args.authorized_address)
            security_group.authorize('tcp', 7199, 7199, args.authorized_address)
            security_group.authorize('tcp', 9042, 9042, args.authorized_address)
            security_group.authorize('tcp', 9160, 9160, args.authorized_address)
    else:
        print("Security group already exists, skipping creation.")

    instances = cluster_nodes(conn, args.name)
    if any(instances):
        # Reuse running nodes; just (re-)apply the Name tag.
        additional_tags = {}
        for i in instances:
            i.add_tags(
                dict(additional_tags,
                     Name="{cn}-node-{iid}".format(cn=args.name, iid=i.id)))
        return instances
    else:
        print("Launching {m} instances for cluster...".format(m=args.node_count))
        try:
            image = conn.get_all_images(image_ids=args.ami)[0]

            block_map = BlockDeviceMapping()
            if args.ebs_vol_size > 0:
                if args.instance_type.startswith('m3.'):
                    # m3 instance store: expose the ephemeral disks instead
                    # of attaching EBS.
                    for i in range(get_num_disks(args.instance_type)):
                        device = BlockDeviceType()
                        device.ephemeral_name = "ephemeral%d" % i
                        name = "/dev/sd" + string.ascii_letters[i + 1]
                        block_map[name] = device
                else:
                    device = EBSBlockDeviceType()
                    device.size = args.ebs_vol_size
                    device.volume_type = args.ebs_vol_type
                    device.delete_on_termination = True
                    # NOTE(review): this always maps the single EBS volume at
                    # /dev/sdt ('s' + 1); confirm that is the intended device.
                    key = "/dev/sd" + chr(ord('s') + 1)
                    block_map[key] = device

            nodes = image.run(key_name=args.key_pair,
                              security_group_ids=[security_group.id],
                              # was "": ignored the requested instance type
                              instance_type=args.instance_type,
                              placement=args.zone,
                              min_count=args.node_count,
                              max_count=args.node_count,
                              block_device_map=block_map,
                              subnet_id=None,
                              placement_group=None,
                              user_data=None,
                              instance_initiated_shutdown_behavior="stop",
                              instance_profile_name=None)

            print("Waiting for AWS to propagate instance metadata...")
            time.sleep(15)
            additional_tags = {}
            for node in nodes.instances:
                node.add_tags(
                    dict(additional_tags,
                         Name="{cn}-node-{iid}".format(cn=args.name, iid=node.id)))
            return nodes.instances
        except Exception as e:
            print("Caught exception: ", e)
            print("ERROR: Could not find AMI " + args.ami, file=stderr)
            sys.exit(1)
def _run_encryptor_instance(
        aws_svc, encryptor_image_id, snapshot, root_size, guest_image_id,
        security_group_ids=None, subnet_id=None, zone=None,
        instance_config=None,
        status_port=encryptor_service.ENCRYPTOR_STATUS_PORT):
    """Launch the encryptor instance that copies the guest root volume into
    a new encrypted volume.

    Attaches the unencrypted guest root (from ``snapshot``) and an empty
    target volume sized ``2 * root_size + 1`` at device names that depend on
    the encryptor image's virtualization type, launches the instance, waits
    for it, and tags its volumes.

    :param aws_svc: service wrapper providing run_instance/create_tags/etc.
    :param encryptor_image_id: AMI ID of the encryptor image.
    :param snapshot: snapshot ID of the unencrypted guest root volume.
    :param root_size: size of the guest root volume (units as used by EBS).
    :param guest_image_id: guest AMI ID, used only for tagging/description.
    :param security_group_ids: optional groups; when omitted a temporary
        group opened on ``status_port`` is created (and returned for cleanup).
    :return: tuple ``(instance, temp_sg_id)`` where ``temp_sg_id`` is the
        temporary security group's ID, or None if groups were supplied.

    On any launch/tag failure the instance and temporary group are cleaned
    up via ``clean_up`` and the exception is re-raised.
    """
    bdm = BlockDeviceMapping()
    if instance_config is None:
        instance_config = InstanceConfig()
    image = aws_svc.get_image(encryptor_image_id)
    virtualization_type = image.virtualization_type
    # Use gp2 for fast burst I/O copying root drive
    guest_unencrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        snapshot_id=snapshot,
        delete_on_termination=True)
    # Use gp2 for fast burst I/O copying root drive
    log.info('Launching encryptor instance with snapshot %s', snapshot)
    # They are creating an encrypted AMI instead of updating it
    # Use gp2 for fast burst I/O copying root drive
    guest_encrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        delete_on_termination=True)
    # Target volume is oversized relative to the source root volume.
    guest_encrypted_root.size = 2 * root_size + 1
    if virtualization_type == 'paravirtual':
        bdm['/dev/sda4'] = guest_unencrypted_root
        bdm['/dev/sda5'] = guest_encrypted_root
    else:
        # Use 'sd' names even though AWS maps these to 'xvd'
        # The AWS GUI only exposes 'sd' names, and won't allow
        # the user to attach to an existing 'sd' name in use, but
        # would allow conflicts if we used 'xvd' names here.
        bdm['/dev/sdf'] = guest_unencrypted_root
        bdm['/dev/sdg'] = guest_encrypted_root

    # If security groups were not specified, create a temporary security
    # group that allows us to poll the metavisor for encryption progress.
    temp_sg_id = None
    instance = None
    try:
        run_instance = aws_svc.run_instance
        if not security_group_ids:
            vpc_id = None
            if subnet_id:
                subnet = aws_svc.get_subnet(subnet_id)
                vpc_id = subnet.vpc_id
            temp_sg_id = create_encryptor_security_group(
                aws_svc, vpc_id=vpc_id, status_port=status_port).id
            security_group_ids = [temp_sg_id]

            # Wrap with a retry, to handle eventual consistency issues with
            # the newly-created group.
            run_instance = aws_svc.retry(
                aws_svc.run_instance,
                error_code_regexp='InvalidGroup\.NotFound'
            )
        user_data = instance_config.make_userdata()
        compressed_user_data = gzip_user_data(user_data)
        instance = run_instance(
            encryptor_image_id,
            security_group_ids=security_group_ids,
            user_data=compressed_user_data,
            placement=zone,
            block_device_map=bdm,
            subnet_id=subnet_id
        )
        aws_svc.create_tags(
            instance.id,
            name=NAME_ENCRYPTOR,
            description=DESCRIPTION_ENCRYPTOR % {'image_id': guest_image_id}
        )
        log.info('Launching encryptor instance %s', instance.id)
        instance = wait_for_instance(aws_svc, instance.id)
        # Tag volumes.
        # Re-read the mapping from the running instance so volume IDs are
        # populated.
        bdm = instance.block_device_mapping
        if virtualization_type == 'paravirtual':
            aws_svc.create_tags(
                bdm['/dev/sda5'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda2'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_GRUB_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda3'].volume_id, name=NAME_METAVISOR_LOG_VOLUME)
        else:
            aws_svc.create_tags(
                bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sdg'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
    except:
        # Clean up whatever was created before re-raising.
        cleanup_instance_ids = []
        cleanup_sg_ids = []
        if instance:
            cleanup_instance_ids = [instance.id]
        if temp_sg_id:
            cleanup_sg_ids = [temp_sg_id]
        clean_up(
            aws_svc,
            instance_ids=cleanup_instance_ids,
            security_group_ids=cleanup_sg_ids
        )
        raise
    return instance, temp_sg_id
def launch_cluster(conn, opts, cluster_name):
    """Launch a Mesos-style cluster: masters, slaves, and (for fault
    tolerance, ``opts.ft > 1``) a 3-node ZooKeeper ensemble.

    Sets up the -master/-slaves/-zoo security groups, aborts if instances
    already run in them, then launches the three tiers from the AMI in
    ``opts.ami``. Returns (master_nodes, slave_nodes, zoo_nodes).
    """
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        # Standard ZooKeeper client/peer/leader-election ports.
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        # NOTE(review): this collects ``g.id`` but compares against group
        # *names*; presumably old boto populated reservation group ids with
        # the group name (EC2-classic) — verify against the boto version in
        # use, otherwise this check never matches.
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, ("ERROR: There are already instances running in " +
                                  "group %s, %s or %s" % (master_group.name,
                                                          slave_group.name,
                                                          zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    slave_res = image.run(key_name=opts.key_pair,
                          security_groups=[slave_group],
                          instance_type=opts.instance_type,
                          placement=opts.zone,
                          min_count=opts.slaves,
                          max_count=opts.slaves,
                          block_device_map=block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    # opts.ft masters are launched for fault tolerance.
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=opts.ft,
                           max_count=opts.ft,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = image.run(key_name=opts.key_pair,
                            security_groups=[zoo_group],
                            instance_type=opts.instance_type,
                            placement=opts.zone,
                            min_count=3,
                            max_count=3,
                            block_device_map=block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) #print "Setting up security groups..." #master_group = get_or_make_group(conn, cluster_name + "-master") #slave_group = get_or_make_group(conn, cluster_name + "-slaves") #if master_group.rules == []: # Group was just now created # master_group.authorize(src_group=master_group) # master_group.authorize(src_group=slave_group) # master_group.authorize('tcp', 22, 22, '0.0.0.0/0') # master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') # master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') # master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') # master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') # master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') # master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') # if opts.ganglia: # master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') #if slave_group.rules == []: # Group was just now created # slave_group.authorize(src_group=master_group) # slave_group.authorize(src_group=slave_group) # slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') # slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') # slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') # slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') # slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') # slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) #if existing_slaves or (existing_masters and not opts.use_existing_master): # print >> stderr, ("ERROR: There are already instances running in " + # "group %s or %s" % (master_group.name, slave_group.name)) # sys.exit(1) # Figure out Spark AMI if 
opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = zone, count = num_slaves_this_zone, key_name = opts.key_pair, #security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name = opts.key_pair, security_group_ids = ["sg-87956be2","sg-1ac33f7f", "sg-1ec33f7b"], subnet_id = "subnet-4182b007", instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name = opts.key_pair, security_group_ids = ["sg-bd956bd8","sg-1ac33f7f", "sg-1ec33f7b"], subnet_id = "subnet-4182b007", instance_type = master_type, placement = opts.zone, min_count = 1, max_count = 1, block_device_map = block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") master_group.owner_id = os.getenv('EC2_USER_ID') slave_group = get_or_make_group(conn, cluster_name + "-slaves") slave_group.owner_id = os.getenv('EC2_USER_ID') zoo_group = get_or_make_group(conn, cluster_name + "-zoo") zoo_group.owner_id = os.getenv('EC2_USER_ID') if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 
if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves, existing_zoos = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ( "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to logging.debug("Calling boto BlockDeviceMapping()...") block_map = BlockDeviceMapping() logging.debug(" Printing block_map..") #print block_map if opts.ebs_vol_size > 0: logging.debug("Calling boto EBSBlockDeviceType()...") device = EBSBlockDeviceType() #print "device: ", device device.size = opts.ebs_vol_size device.delete_on_termination = True device.ephemeral_name = "ephemeral0" #block_map["/dev/sdv"] = device #block_map["/dev/sdv"] = device block_map["/dev/vdb"] = device if opts.user_data_file != None: user_data_file = open(opts.user_data_file) try: opts.user_data = user_data_file.read() #print "user data (encoded) = ", opts.user_data finally: user_data_file.close() # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res 
= image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=opts.user_data) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=opts.user_data) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Launch ZooKeeper nodes if required if int(opts.ft) > 1: print "Running " + opts.ft + " zookeepers" zoo_res = image.run(key_name=opts.key_pair, security_groups=[zoo_group], instance_type=opts.instance_type, placement=opts.zone, min_count=3, max_count=3, block_device_map=block_map, user_data=opts.user_data) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") zoo_group = get_or_make_group(conn, cluster_name + "-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') if opts.cluster_type == "mesos": master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups active_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if any(active_nodes): print >> stderr, ("ERROR: 
There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out the latest AMI from our static URL if opts.ami == "latest": try: opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip() print "Latest Spark AMI: " + opts.ami except: print >> stderr, "Could not read " + LATEST_AMI_URL sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = zone, count = num_slaves_this_zone, key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = 1, max_count = 1, block_device_map = 
block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def _create_server_ec2(connection, region, disk_name, disk_size, ami, key_pair, instance_type, tags={}, security_groups=None, delete_on_termination=True, log=False, wait_for_ssh_available=True): """ Creates EC2 Instance """ if log: log_green("Started...") log_yellow("...Creating EC2 instance...") ebs_volume = EBSBlockDeviceType() ebs_volume.size = disk_size bdm = BlockDeviceMapping() bdm[disk_name] = ebs_volume # get an ec2 ami image object with our choosen ami image = connection.get_all_images(ami)[0] # start a new instance reservation = image.run(1, 1, key_name=key_pair, security_groups=security_groups, block_device_map=bdm, instance_type=instance_type) # and get our instance_id instance = reservation.instances[0] # and loop and wait until ssh is available while instance.state == u'pending': if log: log_yellow("Instance state: %s" % instance.state) sleep(10) instance.update() if log: log_green("Instance state: %s" % instance.state) if wait_for_ssh_available: wait_for_ssh(instance.public_dns_name) # update the EBS volumes to be deleted on instance termination if delete_on_termination: for dev, bd in instance.block_device_mapping.items(): instance.modify_attribute('BlockDeviceMapping', ["%s=%d" % (dev, 1)]) # add a tag to our instance if tags: connection.create_tags([instance.id], tags) if log: log_green("Public dns: %s" % instance.public_dns_name) # returns our new instance return instance