def _register_image(self, snapshot_id):
    # Register a new AMI from an existing EBS snapshot, mirroring the
    # block-device layout of the instance this code is running on.
    # Returns the new AMI id from EC2's register_image call.
    conn = self.platform.new_ec2_conn()
    instance_id = self.platform.get_instance_id()
    # get_all_instances returns reservations; take the first instance of
    # the first reservation (we queried a single instance id).
    instance = conn.get_all_instances([instance_id])[0].instances[0]
    block_device_map = BlockDeviceMapping(conn)
    # Root volume comes from the provided snapshot and dies with the instance.
    root_vol = EBSBlockDeviceType(snapshot_id=snapshot_id)
    root_vol.delete_on_termination = True
    # Adding ephemeral devices
    for eph, device in EPH_STORAGE_MAPPING[linux.os['arch']].items():
        bdt = EBSBlockDeviceType(conn)
        bdt.ephemeral_name = eph
        block_device_map[device] = bdt
    # If the platform reports the root as a partition (name minus its
    # trailing partition digit), map the snapshot to that partition;
    # otherwise map it to the full root device name.
    root_partition = instance.root_device_name[:-1]
    if root_partition in self.platform.get_block_device_mapping().values():
        block_device_map[root_partition] = root_vol
    else:
        block_device_map[instance.root_device_name] = root_vol
    return conn.register_image(
        name=self.image_name,
        root_device_name=instance.root_device_name,
        block_device_map=block_device_map,
        kernel_id=instance.kernel,
        virtualization_type=instance.virtualization_type,
        ramdisk_id=self.platform.get_ramdisk_id(),
        architecture=instance.architecture)
def create_node(self, name, distribution, metadata=None):
    """Launch a new AWS EC2 node and wrap it in an ``AWSNode``.

    :param name: Value for the instance's ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param metadata: Optional dict of extra tags; never mutated.
    :return: ``AWSNode`` wrapping the launched instance.
    """
    # BUG FIX: the original used a mutable default argument
    # (metadata={}), which is shared across all calls; use None and
    # allocate a fresh dict per call instead.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 8
    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        # Copy so the caller's dict is not modified by the Name tag.
        metadata = metadata.copy()
        metadata["Name"] = name

        # Root volume: EBS-backed, removed together with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap["/dev/sda1"] = disk1

        images = self._connection.get_all_images(
            filters={"name": IMAGE_NAMES[distribution]})
        # Retry several times, no sleep between retries is needed.
        instance = poll_until(
            lambda: self._get_node(images[0].id, size, diskmap, metadata),
            repeat(0, 10),
            lambda x: None)
        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution)
def _register_image(self, snapshot_id):
    """Register an AMI whose root device is built from *snapshot_id*,
    copying the running instance's device layout, kernel and arch.

    Returns the id of the newly registered image.
    """
    ec2 = self.platform.new_ec2_conn()
    this_instance = ec2.get_all_instances(
        [self.platform.get_instance_id()])[0].instances[0]

    mapping = BlockDeviceMapping(ec2)

    # Carry over the ephemeral (instance-store) devices for this arch.
    for ephemeral, dev_name in EPH_STORAGE_MAPPING[linux.os['arch']].items():
        eph_dev = EBSBlockDeviceType(ec2)
        eph_dev.ephemeral_name = ephemeral
        mapping[dev_name] = eph_dev

    # Root volume from the snapshot; cleaned up with the instance.
    root = EBSBlockDeviceType(snapshot_id=snapshot_id)
    root.delete_on_termination = True
    partition = this_instance.root_device_name[:-1]
    known = self.platform.get_block_device_mapping().values()
    root_key = partition if partition in known else this_instance.root_device_name
    mapping[root_key] = root

    return ec2.register_image(
        name=self.image_name,
        root_device_name=this_instance.root_device_name,
        block_device_map=mapping,
        kernel_id=this_instance.kernel,
        virtualization_type=this_instance.virtualization_type,
        ramdisk_id=self.platform.get_ramdisk_id(),
        architecture=this_instance.architecture)
def register_ebs_ami(self, snapshot_id, arch='x86_64', default_ephem_map=True,
                     img_name=None, img_desc=None):
    """Register an EBS snapshot as a bootable pvgrub AMI.

    Returns the new AMI id as a string.  Raises Exception when no
    pvgrub hd00 AKI is known for the current region/arch combination.
    """
    # register against snapshot
    try:
        aki = PVGRUB_AKIS[self.region.name][arch]
    except KeyError:
        raise Exception("Unable to determine pvgrub hd00 AKI for region (%s) arch (%s)" % (self.region.name, arch))

    # These names need to be unique, hence the pseudo-uuid
    if not img_name:
        img_name = 'EBSHelper AMI - %s - uuid-%x' % (
            snapshot_id, random.randrange(2**32))
    if not img_desc:
        img_desc = 'Created directly from volume snapshot %s' % (snapshot_id)

    self.log.debug("Registering snapshot (%s) as new EBS AMI" % (snapshot_id))

    root = EBSBlockDeviceType()
    root.snapshot_id = snapshot_id
    root.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map['/dev/sda'] = root

    # The ephemeral mappings are automatic with S3 images
    # For EBS images we need to make them explicit
    # These settings are required to make the same fstab work on both S3 and EBS images
    if default_ephem_map:
        for dev_node, eph_name in (('/dev/sdb', 'ephemeral0'),
                                   ('/dev/sdc', 'ephemeral1')):
            eph = EBSBlockDeviceType()
            eph.ephemeral_name = eph_name
            block_map[dev_node] = eph

    ami_id = self.conn.register_image(name=img_name,
                                      description=img_desc,
                                      architecture=arch,
                                      kernel_id=aki,
                                      root_device_name='/dev/sda',
                                      block_device_map=block_map)
    return str(ami_id)
def create_node(self, name, distribution, metadata=None):
    """Launch a new AWS EC2 node and wrap it in an ``AWSNode``.

    :param name: Value for the instance's ``Name`` tag.
    :param distribution: Key into ``IMAGE_NAMES`` selecting the AMI.
    :param metadata: Optional dict of extra tags; never mutated.
    :return: ``AWSNode`` wrapping the launched instance.
    """
    # BUG FIX: the original used a mutable default argument
    # (metadata={}), shared across every call of this method; use None
    # and allocate a fresh dict per call instead.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 10
    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        # Copy so the caller's dict is not modified by the Name tag.
        metadata = metadata.copy()
        metadata['Name'] = name

        # Root volume: EBS-backed, removed together with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )
        # Retry several times, no sleep between retries is needed.
        instance = poll_until(
            lambda: self._get_node(images[0].id, size, diskmap, metadata),
            repeat(0, 10),
            lambda x: None)
        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution,
        )
def launch_instance(self): if not self.verify_settings(): return is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'}) if is_instance_store: block_map = None else: block_map = BlockDeviceMapping() root_device = self.config['ec2_root_device'] block_map[root_device] = EBSBlockDeviceType() if self.config['ec2_size']: block_map[root_device].size = self.config['ec2_size'] block_map[root_device].delete_on_termination = True reservation = self.conn.run_instances( self.config['ec2_ami_id'], key_name=self.config['ec2_key_name'], security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']], instance_type=self.config['ec2_instance_type'], placement=self.config['ec2_zone'], placement_group=self.config['ec2_placement_group'], monitoring_enabled=self.config['ec2_monitoring_enabled'], block_device_map=block_map, user_data=self.user_data) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == 'running': time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance {0} failed after {1} seconds".format( self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config['hostname']: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config['ssh_port']) port = "-p {0} ".format(p) if p and not p == 22 else '' ## change user to 'root' for all non-Ubuntu systems user = self.config['sudouser'] if self.config['sudouser'] and self.config['ssh_import'] else 'ubuntu' #XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n".format( '-p {0} '.format(port) if port else '', user, self.instance.public_dns_name) msg3 = "To terminate: shaker-terminate {0}".format( self.instance.id) LOG.info(msg2) LOG.info(msg3) 
print msg2 print msg3
def launch_instance(self): if not self.verify_settings(): return block_map = BlockDeviceMapping() root_device = self.config['ec2_root_device'] block_map[root_device] = EBSBlockDeviceType() if self.config['ec2_size']: block_map[root_device].size = self.config['ec2_size'] block_map[root_device].delete_on_termination = True reservation = self.conn.run_instances( self.config['ec2_ami_id'], key_name=self.config['ec2_key_name'], security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']], instance_type=self.config['ec2_instance_type'], placement_group=self.config['ec2_placement_group'], monitoring_enabled=self.config['ec2_monitoring_enabled'], block_device_map=block_map, user_data=self.user_data) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == 'running': time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance {0} failed after {1} seconds".format( self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config['hostname']: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config['ssh_port']) port = "-p {0} ".format(p) if p and not p == 22 else '' ## change user to 'root' for all non-Ubuntu systems user = self.config['sudouser'] if self.config[ 'sudouser'] and self.config['ssh_import'] else 'ubuntu' #XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n".format( '-p {0} '.format(port) if port else '', user, self.instance.public_dns_name) msg3 = "To terminate: shaker-terminate {0}".format( self.instance.id) LOG.info(msg2) LOG.info(msg3) print msg2 print msg3
def launch_instance(self): if not self.verify_settings(): return block_map = BlockDeviceMapping() root_device = self.config["ec2_root_device"] block_map[root_device] = EBSBlockDeviceType() if self.config["ec2_size"]: block_map[root_device].size = self.config["ec2_size"] block_map[root_device].delete_on_termination = True for num, device_location in enumerate(self.config["ec2_ephemeral_devices"]): device = BlockDeviceType() device.ephemeral_name = "ephemeral%d" % num block_map[device_location] = device reservation = self.conn.run_instances( self.config["ec2_ami_id"], key_name=self.config["ec2_key_name"], security_groups=self.config["ec2_security_groups"] or [self.config["ec2_security_group"]], instance_type=self.config["ec2_instance_type"], placement=self.config["ec2_zone"], monitoring_enabled=self.config["ec2_monitoring_enabled"], block_device_map=block_map, user_data=self.user_data, ) self.instance = reservation.instances[0] secs = RUN_INSTANCE_TIMEOUT rest_interval = 5 while secs and not self.instance.state == "running": time.sleep(rest_interval) secs = secs - rest_interval try: self.instance.update() except boto.exception.EC2ResponseError: pass if secs <= 0: errmsg = "run instance %s failed after %d seconds" % (self.instance.id, RUN_INSTANCE_TIMEOUT) LOG.error(errmsg) else: if self.config["hostname"]: self.assign_name_tag() msg1 = "Started Instance: {0}\n".format(self.instance.id) LOG.info(msg1) print msg1 p = int(self.config["ssh_port"]) port = "-p {0} ".format(p) if p and not p == 22 else "" ## change user to 'root' for all non-Ubuntu systems user = self.config["sudouser"] if self.config["sudouser"] and self.config["ssh_import"] else "ubuntu" # XXX - TODO: replace public dns with fqdn, where appropriate msg2 = "To access: ssh {0}{1}@{2}\n" "To terminate: shaker-terminate {3}".format( port, user, self.instance.public_dns_name, self.instance.id ) LOG.info(msg2) print msg2
def launch_instance(self):
    # Launch the configured AMI, optionally inside a VPC subnet, and wait
    # until it reaches 'running'.  The started instance is stored on
    # self.instance; a timeout is logged as an error.
    if not self.verify_settings():
        return
    # Instance-store AMIs cannot carry an EBS root mapping.
    is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'],
        filters={'root-device-type': 'instance-store'})
    if is_instance_store:
        block_map = None
    else:
        # EBS root device, sized from config, deleted with the instance.
        block_map = BlockDeviceMapping()
        root_device = self.config['ec2_root_device']
        block_map[root_device] = EBSBlockDeviceType()
        if self.config['ec2_size']:
            block_map[root_device].size = self.config['ec2_size']
        block_map[root_device].delete_on_termination = True
    opts = {
        'key_name': self.config['ec2_key_name'],
        'security_groups': self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
        'instance_type': self.config['ec2_instance_type'],
        'placement': self.config['ec2_zone'],
        'placement_group': self.config['ec2_placement_group'],
        'monitoring_enabled': self.config['ec2_monitoring_enabled'],
        'block_device_map': block_map,
        'user_data': self.user_data
    }
    if self.config.get('ec2_subnet_id',False):
        # when providing subnet_id, must use security_group_ids and not
        # named security_groups or API call will fail.
        opts.pop('security_groups',None)
        opts['security_group_ids'] = self.config['ec2_security_group_ids'] or [self.config['ec2_security_group_id']]
        if not opts['security_group_ids']:
            raise AssertionError('Must specify ec2_security_group_id or ec2_security_group_ids with subnet_id')
        opts['subnet_id'] = self.config['ec2_subnet_id']
    reservation = self.conn.run_instances(self.config['ec2_ami_id'], **opts)
    self.instance = reservation.instances[0]
    # Poll the instance state until running or the timeout budget is spent.
    secs = RUN_INSTANCE_TIMEOUT
    rest_interval = 5
    while secs and not self.instance.state == 'running':
        time.sleep(rest_interval)
        secs = secs - rest_interval
        try:
            self.instance.update()
        # Transient describe failures right after launch are ignored;
        # the next poll retries.
        except boto.exception.EC2ResponseError:
            pass
    if secs <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def get_block_device(instance_type, ebs_vol_size):
    """Build the boto block-device mapping for a new instance.

    Adds an EBS volume on /dev/sdv when ebs_vol_size > 0 and maps every
    ephemeral disk of the instance type starting at /dev/sdb.
    """
    mapping = BlockDeviceMapping()

    if ebs_vol_size > 0:
        ebs = EBSBlockDeviceType()
        ebs.size = ebs_vol_size
        ebs.delete_on_termination = True
        mapping['/dev/sdv'] = ebs

    # The first ephemeral drive is /dev/sdb.
    for idx in range(get_num_disks(instance_type)):
        ephemeral = BlockDeviceType()
        ephemeral.ephemeral_name = 'ephemeral%d' % idx
        mapping['/dev/sd' + string.ascii_letters[idx + 1]] = ephemeral

    return mapping
def get_block_device(instance_type, ebs_vol_size):
    """Return a BlockDeviceMapping with an optional EBS data volume and
    one entry per ephemeral disk of the given instance type.

    The EBS volume (present when ebs_vol_size > 0) lands on /dev/sdv and
    is deleted on termination; ephemeral disks start at /dev/sdb.
    """
    device_map = BlockDeviceMapping()

    if ebs_vol_size > 0:
        vol = EBSBlockDeviceType()
        vol.delete_on_termination = True
        vol.size = ebs_vol_size
        device_map["/dev/sdv"] = vol

    # The first ephemeral drive is /dev/sdb ('b' == ascii_letters[1]).
    disk_count = get_num_disks(instance_type)
    for n in range(disk_count):
        eph = BlockDeviceType()
        eph.ephemeral_name = 'ephemeral%d' % n
        device_name = '/dev/sd' + string.ascii_letters[n + 1]
        device_map[device_name] = eph

    return device_map
def register_ebs_ami(self, snapshot_id, arch="x86_64", default_ephem_map=True,
                     img_name=None, img_desc=None):
    # Register an EBS snapshot as a bootable pvgrub AMI, tag it, and
    # return the new AMI id as a string.
    # register against snapshot
    try:
        aki = PVGRUB_AKIS[self.region.name][arch]
    except KeyError:
        raise Exception("Unable to find pvgrub hd00 AKI for %s, arch (%s)" % (self.region.name, arch))
    if not img_name:
        rand_id = random.randrange(2 ** 32)
        # These names need to be unique, hence the pseudo-uuid
        img_name = "EBSHelper AMI - %s - uuid-%x" % (snapshot_id, rand_id)
    if not img_desc:
        img_desc = "Created directly from volume snapshot %s" % snapshot_id
    self.log.debug("Registering %s as new EBS AMI" % snapshot_id)
    # Side effect: also creates a VNC/SSH security group with a random
    # suffix; presumably used by later helper launches — confirm.
    self.create_sgroup("ec2helper-vnc-ssh-%x" % random.randrange(2 ** 32), allow_vnc=True)
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snapshot_id
    ebs.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map["/dev/sda"] = ebs
    # The ephemeral mappings are automatic with S3 images
    # For EBS images we need to make them explicit
    # These settings are required to make the same fstab work on both S3
    # and EBS images
    if default_ephem_map:
        e0 = EBSBlockDeviceType()
        e0.ephemeral_name = "ephemeral0"
        e1 = EBSBlockDeviceType()
        e1.ephemeral_name = "ephemeral1"
        block_map["/dev/sdb"] = e0
        block_map["/dev/sdc"] = e1
    result = self.conn.register_image(
        name=img_name,
        description=img_desc,
        architecture=arch,
        kernel_id=aki,
        root_device_name="/dev/sda",
        block_device_map=block_map,
    )
    # Give EC2 a moment to make the new image describable before tagging.
    sleep(10)
    new_amis = self.conn.get_all_images([result])
    # NOTE(review): resource_tag is not defined in this function —
    # presumably a module-level constant; verify it exists at import time.
    new_amis[0].add_tag("Name", resource_tag)
    return str(result)
def launch_instance(self):
    """Boot the configured AMI and block until it reaches 'running'.

    Instance-store AMIs get no block-device mapping; EBS-backed AMIs
    get a root device sized from config with delete-on-termination set.
    The started instance is stored on self.instance; a timeout is
    logged as an error.
    """
    if not self.verify_settings():
        return

    # An instance-store root cannot take an EBS mapping.
    instance_store_images = self.conn.get_all_images(
        self.config['ec2_ami_id'],
        filters={'root-device-type': 'instance-store'})
    block_map = None
    if not instance_store_images:
        root = self.config['ec2_root_device']
        block_map = BlockDeviceMapping()
        block_map[root] = EBSBlockDeviceType()
        if self.config['ec2_size']:
            block_map[root].size = self.config['ec2_size']
        block_map[root].delete_on_termination = True

    groups = (self.config['ec2_security_groups']
              or [self.config['ec2_security_group']])
    reservation = self.conn.run_instances(
        self.config['ec2_ami_id'],
        key_name=self.config['ec2_key_name'],
        security_groups=groups,
        instance_type=self.config['ec2_instance_type'],
        placement=self.config['ec2_zone'],
        placement_group=self.config['ec2_placement_group'],
        monitoring_enabled=self.config['ec2_monitoring_enabled'],
        block_device_map=block_map,
        user_data=self.user_data)
    self.instance = reservation.instances[0]

    # Poll until running or the timeout budget is spent.
    poll_every = 5
    remaining = RUN_INSTANCE_TIMEOUT
    while remaining and self.instance.state != 'running':
        time.sleep(poll_every)
        remaining -= poll_every
        try:
            self.instance.update()
        except boto.exception.EC2ResponseError:
            # Transient describe failure right after launch; retry.
            pass

    if remaining <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def parse_block_device_args(self, block_device_maps_args):
    """Turn CLI-style block-device arguments into a BlockDeviceMapping.

    Each argument has the shape
        <device>=<snap-id | ephemeralN>[:<size>[:<true>]]
    Arguments without '=' are silently skipped.
    """
    mapping = BlockDeviceMapping()
    for raw_arg in block_device_maps_args:
        pieces = raw_arg.split('=')
        if len(pieces) <= 1:
            continue  # no '=': nothing to map
        device = pieces[0]
        fields = pieces[1].split(':')

        bdt = EBSBlockDeviceType()
        first = fields[0]
        if first.startswith('snap'):
            bdt.snapshot_id = first
        elif first.startswith('ephemeral'):
            bdt.ephemeral_name = first
        if len(fields) > 1:
            bdt.size = int(fields[1])
        if len(fields) > 2 and fields[2] == 'true':
            bdt.delete_on_termination = True

        mapping[device] = bdt
    return mapping
def launch_instance(self):
    # Launch the configured AMI and wait until it reaches 'running'.
    # The started instance is stored on self.instance; a timeout is
    # logged as an error.
    if not self.verify_settings():
        return
    # Instance-store AMIs cannot carry an EBS root mapping.
    is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'],
        filters={'root-device-type': 'instance-store'})
    if is_instance_store:
        block_map = None
    else:
        # EBS root device, sized from config, deleted with the instance.
        block_map = BlockDeviceMapping()
        root_device = self.config['ec2_root_device']
        block_map[root_device] = EBSBlockDeviceType()
        if self.config['ec2_size']:
            block_map[root_device].size = self.config['ec2_size']
        block_map[root_device].delete_on_termination = True
    reservation = self.conn.run_instances(
        self.config['ec2_ami_id'],
        key_name=self.config['ec2_key_name'],
        security_groups=self.config['ec2_security_groups'] or
            [self.config['ec2_security_group']],
        instance_type=self.config['ec2_instance_type'],
        placement=self.config['ec2_zone'],
        placement_group=self.config['ec2_placement_group'],
        monitoring_enabled=self.config['ec2_monitoring_enabled'],
        block_device_map=block_map,
        user_data=self.user_data)
    self.instance = reservation.instances[0]
    # Poll the instance state until running or the timeout budget is spent.
    secs = RUN_INSTANCE_TIMEOUT
    rest_interval = 5
    while secs and not self.instance.state == 'running':
        time.sleep(rest_interval)
        secs = secs - rest_interval
        try:
            self.instance.update()
        # Transient describe failures right after launch are ignored;
        # the next poll retries.
        except boto.exception.EC2ResponseError:
            pass
    if secs <= 0:
        errmsg = "run instance {0} failed after {1} seconds".format(
            self.instance.id, RUN_INSTANCE_TIMEOUT)
        LOG.error(errmsg)
    else:
        if self.config['hostname']:
            self.assign_name_tag()
def build(hosts, cred, dry, inventory='hosts'):
    # Provision (or adopt) one EC2 instance per host name, then tag it,
    # attach any extra disks, and optionally associate an elastic IP.
    # Returns a dict keyed by host with 'instance' and IP addresses.
    hret = {}
    old_state = {}
    con = None
    # Pass 1: launch any host that does not already have a live instance.
    for h in hosts:
        logger.info(" Run action on host [%s]" % (h))
        hret[h] = {}
        hv = {}
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        # Lazily open one connection; all hosts share the first region seen.
        if con is None:
            con = _connect(hvars['region'], cred)
        reservations = con.get_all_reservations(filters={"tag:Name": h})
        old_state[h] = "absent"
        for reservation in reservations:
            instance = reservation.instances[0]
            if instance.state != 'terminated':
                hret[h]['instance'] = instance
                old_state[h] = "present"
                logger.info(" Server [%s] is already present" % (h))
        if old_state[h] == 'present':
            continue
        # Optional custom-sized root volume on /dev/sda1.
        bdm = None
        if 'disk_size' in hvars:
            try:
                dev_sda1 = EBSBlockDeviceType()
                dev_sda1.size = hvars['disk_size']
                dev_sda1.delete_on_termination = True
                bdm = BlockDeviceMapping()
                bdm['/dev/sda1'] = dev_sda1
            except Exception as e:
                logger.error("Error building block device for server: %s" % (e))
                exit(1)
        try:
            reservation = con.run_instances(
                hvars['ami'],
                key_name=hvars['key'],
                instance_type=hvars['vmtype'],
                security_group_ids=[hvars['security']],
                subnet_id=hvars['subnet'],
                block_device_map=bdm,
                dry_run=dry
            )
            hret[h]['instance'] = reservation.instances[0]
        except Exception as e:
            logger.error("Error building server: %s" % (e))
            exit(1)
    # Pass 2: wait for boot, tag, attach extra volumes, assign elastic IP.
    for h in hosts:
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        instance = hret[h]['instance']
        status = instance.update()
        if old_state[h] == 'absent':
            logger.info(" Waiting for [%s] to be launched..."
                        % (h))
            while status == 'pending':
                time.sleep(5)
                status = instance.update()
        if old_state[h] == 'present':
            logger.info(" State is running with IP [%s]" % (instance.private_ip_address))
        elif status == 'running':
            logger.info(" State changed to running with IP [%s]" % (instance.private_ip_address))
        else:
            logger.error(" Status of [%s] is [%s]" % (h, status))
        instance.add_tag("Name", "%s" % (h))
        for cur_tag in hvars['tags']:
            instance.add_tag(cur_tag, hvars['tags'][cur_tag])
        # Extra data volumes are only created for newly launched hosts.
        if 'extra_disks' in hvars and old_state[h] == 'absent':
            try:
                for cur_disk in hvars['extra_disks']:
                    cur_vol = con.create_volume(cur_disk['size'], instance.placement)
                    status = cur_vol.status
                    while status != 'available':
                        logger.info(" Waiting for volume [%s] to be launched..." % (cur_vol))
                        time.sleep(10)
                        status = cur_vol.update()
                    con.attach_volume(cur_vol.id, instance.id, '/dev/' + cur_disk['device'])
            except Exception as e:
                logger.error("Error Attaching new disks: %s" % (e))
                exit(1)
        # Tag every attached volume as <host>_diskN.
        instance_volumes = con.get_all_volumes(filters={'attachment.instance-id': instance.id})
        for counter, cur_vol in enumerate(instance_volumes):
            cur_vol.add_tag("Name", "%s_disk%d" % (h.split('.')[0], counter))
        hret[h]['private_ip_address'] = instance.private_ip_address
        # If requested assosiate an new elastic IP for the host and create a security group to whitelist external IPs
        if 'assosiate_eip' in hvars and hvars['assosiate_eip'] is True:
            if instance.ip_address is None:
                eip = con.allocate_address()
                con.associate_address(instance.id, eip.public_ip)
                logger.info(" Adding public IP [%s]" % (eip.public_ip))
                hret[h]['public_ip_address'] = eip.public_ip
            if 'whitelisted_ips' in hvars:
                logger.info(" Whitelisting IPs [%s]" % (hvars['whitelisted_ips']))
                ips = hvars['whitelisted_ips'].split(',')
                project = hvars['tags']['Project']
                security = hvars['security']
                _create_security_group(con, instance, project, ips, security)
    return hret
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
                 avz=def_default_avz, rt=def_default_requesttype,
                 group_name='oggmssh', ssh_port=22, cidr='0.0.0.0/0'):
    """
    Request and prepare single instance

    Launches one spot or on-demand instance named <cn>_node<idn> in
    availability zone avz, sets up the SSH security group, waits for
    boot and SSH reachability, and mounts an optional user EBS volume.
    """
    # FSO---connect
    # avz[:-1] strips the zone letter to get the region name.
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()
    # Check if ssh keypair exists
    key_name = get_keypair_name()
    check_keypair(cloud, key_name)
    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1
    dev_sdf_vol = get_user_persist_ebs(cloud, avz)
    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(group_name, 'A group that allows SSH access')
        else:
            raise
    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise
    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id, instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)
    if rt == 'spot':
        # Spot path: bid def_price, then block until the request is filled.
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(def_price,
                                                def_ami[avz[:-1]],
                                                count=1,
                                                type='one-time',
                                                security_groups=[group_name],
                                                key_name=key_name,
                                                placement=avz,
                                                instance_type=instance_infos[inst_type_idx]['type'],
                                                block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        # On-demand path.
        print("placing node in ", avz)
        reservation = cloud.run_instances(image_id=def_ami[avz[:-1]],
                                          key_name=key_name,
                                          placement=avz,
                                          security_groups=[group_name],
                                          instance_type=instance_infos[inst_type_idx]['type'],
                                          block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))
    # Wait for the instance to report 'running'.
    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)
    log_with_ts("booted node " + str(idn))
    # Attach the persistent user volume, if one was found for this zone.
    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")
    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)
    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})
    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))
    update_key_filename(node.region.name)
    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)
    log_with_ts("finished node " + str(idn))
def launch_cluster(conn, opts, cluster_name):
    # Set up the master/slave/zoo security groups, resolve the AMI, and
    # launch the slave fleet (spot or on-demand) plus one master.
    # Returns (master_nodes, slave_nodes, zoo_nodes); zoo_nodes is always
    # empty here.
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    # master_group = get_or_make_group(conn, cluster_name)
    # slave_group = get_or_make_group(conn, cluster_name)
    # zoo_group = get_or_make_group(conn, cluster_name)
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    # Resolve the "std"/"hpc" AMI aliases from their published URLs;
    # failure to fetch leaves opts.ami as the alias string.
    if opts.ami == "std":
        try:
            opts.ami = urllib2.urlopen(STD_AMI_URL).read().strip()
            print "GraphLab AMI for Standard Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + STD_AMI_URL
    elif opts.ami == "hpc":
        try:
            opts.ami = urllib2.urlopen(HVM_AMI_URL).read().strip()
            print "GraphLab AMI for HPC Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + HVM_AMI_URL
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until every spot request is active, then collect
        # the granted instances.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id
    # # Launch masters
    # Master always launches on-demand, optionally with its own type.
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id
    zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, "strata-master") slave_group = get_or_make_group(conn, "strata-slaves") zoo_group = get_or_make_group(conn, "strata-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') if opts.cluster_type == "mesos": master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 80, 80, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups active_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if any(active_nodes): print >> stderr, ( "ERROR: There are already instances 
running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out the latest AMI from our static URL if opts.ami == "latest": try: opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip() print "Latest Spark AMI: " + opts.ami except: print >> stderr, "Could not read " + LATEST_AMI_URL sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map) master_nodes = 
master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Create the right tags tags = {} tags['cluster'] = cluster_name tags['type'] = 'slave' for node in slave_nodes: conn.create_tags([node.id], tags) tags['type'] = 'master' for node in master_nodes: conn.create_tags([node.id], tags) zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There 
are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342). if opts.instance_type.startswith('m3.'): for i in range(get_num_disks(opts.instance_type)): dev = BlockDeviceType() dev.ephemeral_name = 'ephemeral%d' % i # The first ephemeral drive is /dev/sdb. name = '/dev/sd' + string.letters[i + 1] block_map[name] = dev # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data_content) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names for master in master_nodes: master.add_tag(key='Name', value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) for slave in slave_nodes: slave.add_tag(key='Name', value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) # Return all the instances return (master_nodes, slave_nodes)
def create_node(self, name, distribution, size=None, disk_size=8, metadata={}): if size is None: size = self._default_size with start_action( action_type=u"flocker:provision:aws:create_node", name=name, distribution=distribution, image_size=size, disk_size=disk_size, metadata=metadata, ): metadata = metadata.copy() metadata['Name'] = name disk1 = EBSBlockDeviceType() disk1.size = disk_size disk1.delete_on_termination = True diskmap = BlockDeviceMapping() diskmap['/dev/sda1'] = disk1 images = self._connection.get_all_images( filters={'name': IMAGE_NAMES[distribution]}, ) with start_action( action_type= u"flocker:provision:aws:create_node:run_instances", ) as context: reservation = self._connection.run_instances( images[0].id, key_name=self._keyname, instance_type=size, security_groups=self._security_groups, block_device_map=diskmap, placement=self._zone, # On some operating systems, a tty is requried for sudo. # Since AWS systems have a non-root user as the login, # disable this, so we can use sudo with conch. user_data=dedent("""\ #!/bin/sh sed -i '/Defaults *requiretty/d' /etc/sudoers """), ) instance = reservation.instances[0] context.add_success_fields(instance_id=instance.id) self._connection.create_tags([instance.id], metadata) # Display state as instance starts up, to keep user informed that # things are happening. _wait_until_running(instance) return AWSNode( name=name, _provisioner=self, _instance=instance, distribution=distribution, )
def launch_cluster(conn, opts, num_nodes, cluster_name):
    """Launch ``num_nodes`` slave instances for the named cluster.

    Sets up the per-cluster slave security group, builds an optional EBS
    block-device mapping (/dev/sds, /dev/sdt, ... one per --ebs-vol-num),
    launches the instances as spot or on-demand across the requested
    availability zones, and tags each with a descriptive Name plus any
    --additional-tags.

    Returns:
        list of boto Instance objects for the launched slaves.

    Exits the process on fatal errors (missing -i/-k options, cluster
    already running, AMI unset or not found).
    """
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=slave_group)
        else:
            # In a VPC, self-referencing "all traffic" rules must be added
            # per protocol.
            slave_group.authorize(ip_protocol='icmp', from_port=-1,
                                  to_port=-1, src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0,
                                  to_port=65535, src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0,
                                  to_port=65535, src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                           die_on_error=False)
    if existing_slaves:
        print("ERROR: There are already instances running in group %s" %
              slave_group.name, file=stderr)
        sys.exit(1)

    if opts.ami is None:
        print("ERROR: AMI is not set, exit")
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in
                                (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:  # boto raises EC2ResponseError for unknown AMIs; exit with message
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (num_nodes, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        # Bare except is deliberate: on ANY interruption (including Ctrl-C)
        # we must cancel outstanding spot requests before exiting.
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == num_nodes:
                    # BUG FIX: the message previously reported
                    # ``num_nodes + 1`` instances; exactly num_nodes were
                    # granted.
                    print("All %d spot instances granted" % num_nodes)
                    reservations = conn.get_all_reservations(
                        active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slave spot instances granted, waiting longer" % (
                        len(active_instance_ids), num_nodes))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            slave_nodes = get_existing_cluster(conn, opts, cluster_name,
                                               die_on_error=False)
            running = len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        print ("WARNING: --spot-price was not set; consider launch slaves as spot instances to save money")
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1))
            for tag in opts.additional_tags.split(',')
        )
    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                iid=slave.id))
        )

    # Return all the instances
    return slave_nodes
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
                 avz=def_default_avz, rt=def_default_requesttype,
                 group_name='oggmssh',
                 ssh_port=22,
                 cidr='0.0.0.0/0'):
    """Request and prepare a single EC2 instance for the named cluster.

    Connects to the region derived from *avz* (the zone string minus its
    final letter), ensures the SSH keypair and security group exist,
    launches one spot or on-demand instance (per *rt*) with an enlarged
    root volume, waits for it to boot and become SSH-reachable, tags it,
    and attaches/mounts an optional persistent user volume.

    Parameters
    ----------
    cn : cluster name used in node tags ('<cn>_node<idn>').
    inst_type_idx : index into instance_infos selecting the instance type.
    idn : node number within the cluster.
    avz : availability zone, e.g. 'us-east-1a'; avz[:-1] is the region.
    rt : request type; 'spot' launches a spot instance, anything else
        launches on-demand.
    group_name : security group to use or create.
    ssh_port, cidr : SSH ingress rule added to the group.

    Exits the process if a node with the same name already exists.
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    vpcconn = VPCConnection(region=cloud.region)

    # Best-effort VPC lookup: if this zone has no configured subnet (or the
    # lookup fails for any reason) fall back to EC2-classic behavior.
    # NOTE(review): bare except is deliberate here — missing def_subnet
    # entries raise KeyError; treat all failures as "no VPC".
    try:
        vpc_id, subnet_id = def_subnet[avz]
        vpc = vpcconn.get_all_vpcs(vpc_ids=[vpc_id])[0]
    except:
        vpc_id = None
        subnet_id = None
        vpc = None

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name(avz[:-1])
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    # Optional persistent EBS volume for user data (may be None).
    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(group_name,
                                                'A group that allows SSH access')
        else:
            raise

    # Authorize all Intra-VPC traffic
    if vpc is not None:
        try:
            group.authorize('-1', -1, -1, vpc.cidr_block)
        except cloud.ResponseError as e:
            # Rule may already exist from a previous run; that's fine.
            if e.code != 'InvalidPermission.Duplicate':
                raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        # Spot path: place a one-time request and block until fulfilled.
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(def_price,
                                                def_ami[avz[:-1]],
                                                count=1,
                                                type='one-time',
                                                security_group_ids=[group.id],
                                                key_name=key_name,
                                                placement=avz,
                                                subnet_id=subnet_id,
                                                ebs_optimized=True,
                                                instance_type=instance_infos[inst_type_idx]['type'],
                                                block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        # On-demand path: run the instance directly.
        print("placing node in ", avz)
        reservation = cloud.run_instances(image_id=def_ami[avz[:-1]],
                                          key_name=key_name,
                                          placement=avz,
                                          subnet_id=subnet_id,
                                          security_group_ids=[group.id],
                                          ebs_optimized=True,
                                          instance_type=instance_infos[inst_type_idx]['type'],
                                          block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))

    # Poll until the instance reports the 'running' state.
    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)
    log_with_ts("booted node " + str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node " + str(idn))
def create_ami(region, snap_id, force=None, root_dev='/dev/sda1', zone_name=None,
               default_arch=None, default_type='t1.micro', security_groups=''):
    """
    Creates AMI image from given snapshot.

    Force option removes prompt request and creates new instance from
    created ami image.

    region, snap_id
        specify snapshot to be processed. Snapshot description in json
        format will be used to restore instance with same parameters. Will
        automaticaly process snapshots for same instance with near time
        (10 minutes or shorter), but for other devices (/dev/sdb, /dev/sdc,
        etc);
    force
        Run instance from ami after creation without confirmation. To
        enable set value to "RUN";
    default_arch
        architecture to use if not mentioned in snapshot description;
    default_type
        instance type to use if not mentioned in snapshot description.
        Used only if ``force`` is "RUN";
    security_groups
        list of AWS Security Groups names formatted as string separated
        with semicolon ';'. Used only if ``force`` is "RUN".

    Returns the (image, new_instance) pair; new_instance is None unless
    the image was launched.
    """
    conn = get_region_conn(region)
    snap = conn.get_all_snapshots(snapshot_ids=[snap_id, ])[0]
    instance_id = get_snap_instance(snap)
    _device = get_snap_device(snap)

    # Collect sibling snapshots: same instance, different device, taken
    # within 10 minutes of the root snapshot (i.e. the same backup run).
    snaps = conn.get_all_snapshots(owner='self')
    snapshots = [snp for snp in snaps if
                 get_snap_instance(snp) == instance_id and
                 get_snap_device(snp) != _device and
                 abs(get_snap_time(snap) - get_snap_time(snp)) <=
                 timedelta(minutes=10)]
    snapshot = sorted(snapshots, key=get_snap_time,
                      reverse=True) if snapshots else None

    # setup for building an EBS boot snapshot
    default_arch = default_arch or config.get('DEFAULT', 'ARCHITECTURE')
    arch = get_descr_attr(snap, 'Arch') or default_arch
    kernel = config.get(conn.region.name, 'KERNEL' + arch.upper())
    # NOTE(review): a root device of exactly '/dev/sda' (no partition
    # suffix) selects the KERNEL_ENCR_* kernel — presumably the marker for
    # an encrypted instance; confirm against the config documentation.
    dev = re.match(r'^/dev/sda$', _device)  # if our instance encrypted
    if dev:
        kernel = config.get(conn.region.name, 'KERNEL_ENCR_' + arch.upper())

    # Root volume restored from the snapshot, discarded on termination.
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snap_id
    ebs.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map[_device] = ebs
    # First ephemeral (instance-store) drive mapped at /dev/sdb.
    sdb = BlockDeviceType()
    sdb.ephemeral_name = 'ephemeral0'
    block_map['/dev/sdb'] = sdb

    # Map each sibling snapshot back onto its original device.
    if snapshot:
        for s in snapshot:
            s_dev = get_snap_device(s)
            s_ebs = EBSBlockDeviceType()
            s_ebs.delete_on_termination = True
            s_ebs.snapshot_id = s.id
            block_map[s_dev] = s_ebs

    name = 'Created {0} using access key {1}'.format(timestamp(),
                                                     conn.access_key)
    # AMI names may not contain ':' or ' '.
    name = name.replace(":", ".").replace(" ", "_")

    # create the new AMI all options from snap JSON description:
    wait_for(snap, '100%', limit=SNAP_TIME)
    result = conn.register_image(
        name=name,
        description=snap.description,
        architecture=get_descr_attr(snap, 'Arch') or default_arch,
        root_device_name=get_descr_attr(snap, 'Root_dev_name') or root_dev,
        block_device_map=block_map, kernel_id=kernel)
    # Brief pause so the freshly registered image is visible to the API.
    sleep(2)
    image = conn.get_all_images(image_ids=[result, ])[0]
    wait_for(image, 'available', limit=10 * 60)
    add_tags(image, snap.tags)

    logger.info('The new AMI ID = {0}'.format(result))

    info = ('\nEnter RUN if you want to launch instance using '
            'just created {0}: '.format(image))
    new_instance = None
    # Launch immediately when forced, otherwise ask the operator.
    if force == 'RUN' or raw_input(info).strip() == 'RUN':
        instance_type = get_descr_attr(snap, 'Type') or default_type
        new_instance = launch_instance_from_ami(
            region, image.id, inst_type=instance_type,
            security_groups=security_groups, zone_name=zone_name)
    return image, new_instance
def launch_cluster(conn, opts, cluster_name):
    """Launch a cluster (master + slaves + discovery security group).

    Creates/authorizes the three security groups, verifies no cluster with
    this name is already running, builds the EBS/ephemeral block device
    mapping, launches slaves (spot or on-demand), launches or resumes the
    master, and tags every instance.

    Returns ``(master_nodes, slave_nodes)`` — lists of boto Instance objects.
    Exits the process on user error or when instances already exist.
    """
    template_vars = {
        'cluster_name': cluster_name,
        'master_security_group': cluster_name + "-master",
        'slave_security_group': cluster_name + "-slaves",
        'discovery_security_group': cluster_name + "-discovery"
    }
    # Optionally forward AWS credentials (deploy-specific ones preferred).
    if opts.copy_aws_credentials:
        if opts.deploy_aws_key_id:
            template_vars['aws_key'] = opts.deploy_aws_key_id
        else:
            template_vars['aws_key'] = opts.aws_access_key_id
        if opts.deploy_aws_key_secret:
            template_vars['aws_secret'] = opts.deploy_aws_key_secret
        else:
            template_vars['aws_secret'] = opts.aws_secret_access_key

    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, template_vars['master_security_group'], opts.vpc_id)
    slave_group = get_or_make_group(conn, template_vars['slave_security_group'], opts.vpc_id)
    discovery_group = get_or_make_group(conn, template_vars['discovery_security_group'], opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize(src_group=discovery_group)
        else:
            # VPC groups cannot use bare src_group authorization; open
            # icmp/tcp/udp from the discovery group explicitly.
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize(src_group=discovery_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
    if discovery_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            discovery_group.authorize(src_group=master_group)
            discovery_group.authorize(src_group=slave_group)
            discovery_group.authorize(src_group=discovery_group)
        else:
            discovery_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                      src_group=discovery_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        all_groups = conn.get_all_security_groups()
        additional_group_ids = []
        for group in opts.additional_security_group.split(','):
            additional_group_ids += [sg.id for sg in all_groups
                                     if group in (sg.name, sg.id)]
    template_vars['security_groups'] = template_vars['discovery_security_group']
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        ebs_devices = []
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device_id = "/dev/sd" + chr(ord('s') + i)
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map[device_id] = device
            # FIX: was `ebs_devices += device_id`, which extends the list
            # with the string's individual characters.
            ebs_devices.append(device_id)
        template_vars['ebs_devices'] = ' '.join(ebs_devices)

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        local_devices = []
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev
            # FIX: was `local_devices += name` (char-by-char extension).
            local_devices.append(name)
        template_vars['local_devices'] = ' '.join(local_devices)

    master_user_data_content = get_user_data(opts.master_user_data, template_vars)
    slave_user_data_content = get_user_data(opts.slave_user_data, template_vars)

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f"
              % (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=slave_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            # Interrupted (e.g. Ctrl-C): cancel outstanding requests and
            # warn about anything that already launched.
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=slave_user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price is not None:
            # Launch spot instance with the requested price
            print("Requesting master as spot instance with price $%.3f"
                  % (opts.spot_price))
            master_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                key_name=opts.key_pair,
                launch_group="master-group-%s" % cluster_name,
                security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            master_req_id = master_reqs[0].id
            print("Waiting for spot instances to be granted...")
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests()
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    master_instance_ids = []
                    if master_req_id in id_to_req and id_to_req[master_req_id].state == "active":
                        master_instance_ids.append(id_to_req[master_req_id].instance_id)
                        print("Master granted")
                        reservations = conn.get_all_reservations(master_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print("Master not granted yet, waiting longer")
            except:
                print("Canceling spot instance request for master")
                conn.cancel_spot_instance_requests([master_req_id])
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                instance_profile_name=opts.instance_profile_name)
            master_nodes = master_res.instances
            # FIX: previously reported the last *slave* zone; the master is
            # placed in opts.zone.
            print("Launched master in %s, regid = %s" % (opts.zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") else: master_group = get_or_make_group( conn, opts.security_group_prefix + "-master") slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves") authorized_address = opts.authorized_address if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, authorized_address) master_group.authorize('tcp', 8080, 8081, authorized_address) master_group.authorize('tcp', 18080, 18080, authorized_address) master_group.authorize('tcp', 19999, 19999, authorized_address) master_group.authorize('tcp', 50030, 50030, authorized_address) master_group.authorize('tcp', 50070, 50070, authorized_address) master_group.authorize('tcp', 60070, 60070, authorized_address) master_group.authorize('tcp', 4040, 4045, authorized_address) if opts.ganglia: master_group.authorize('tcp', 5080, 5080, authorized_address) if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, authorized_address) slave_group.authorize('tcp', 8080, 8081, authorized_address) slave_group.authorize('tcp', 50060, 50060, authorized_address) slave_group.authorize('tcp', 50075, 50075, authorized_address) slave_group.authorize('tcp', 60060, 
60060, authorized_address) slave_group.authorize('tcp', 60075, 60075, authorized_address) # Check if instances are already running with the cluster name existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) additional_groups = [] if opts.additional_security_group: additional_groups = [ sg for sg in conn.get_all_security_groups() if opts.additional_security_group in (sg.name, sg.id) ] print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] outstanding_request_ids = [] for i in my_req_ids: if i in id_to_req: if id_to_req[i].state == "active": active_instance_ids.append( id_to_req[i].instance_id) else: outstanding_request_ids.append(i) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer for request ids including %s" % ( len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10]) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group] + additional_groups, instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group] + additional_groups, instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names # TODO: Add retry logic for tagging with name since it's used to identify a cluster. for master in master_nodes: name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id) for i in range(0, 5): try: master.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) for slave in slave_nodes: name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id) for i in range(0, 5): try: slave.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Spark cluster: security groups, slaves (spot or on-demand),
    a new or resumed master, then per-instance Name tags.

    Returns ``(master_nodes, slave_nodes)``; exits the process on user
    error or if instances are already running for this cluster.
    """
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    # Optional user-data file, passed verbatim to every launched instance.
    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        # SSH, Spark UIs, Hadoop web UIs open only to authorized_address.
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    additional_groups = []
    if opts.additional_security_group:
        # Matches the extra group by either name or id.
        additional_groups = [sg for sg in conn.get_all_security_groups()
                             if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            # NOTE: string.letters is Python 2 only (ascii_letters in Py 3).
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        # Spread the requested slave count across the chosen zones.
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            # Poll until every request is active; a KeyboardInterrupt (or any
            # error) falls through to the cancellation path below.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] + additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] + additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        # NOTE(review): reports `zone` (last slave zone), though the master is
        # placed in opts.zone — confirm intent.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a cluster in classic EC2 or a VPC.

    Creates/authorizes the master and slave security groups (explicit
    icmp/tcp/udp rules in the VPC case), launches slaves (spot or
    on-demand) and a master, and tags every instance.

    Returns ``(master_nodes, slave_nodes)`` — lists of boto Instance
    objects. Exits the process on user error or when instances already
    exist for this cluster.
    """
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)
    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
        else:
            # VPC groups need explicit per-protocol rules.
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        # Rstudio (GUI for R) needs port 8787 for web access
        master_group.authorize('tcp', 8787, 8787, authorized_address)
        # HDFS NFS gateway requires 111,2049,4242 for tcp & udp
        master_group.authorize('tcp', 111, 111, authorized_address)
        master_group.authorize('udp', 111, 111, authorized_address)
        master_group.authorize('tcp', 2049, 2049, authorized_address)
        master_group.authorize('udp', 2049, 2049, authorized_address)
        master_group.authorize('tcp', 4242, 4242, authorized_address)
        master_group.authorize('udp', 4242, 4242, authorized_address)
        # RM in YARN mode uses 8088
        master_group.authorize('tcp', 8088, 8088, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)
        # Kylix
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=master_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=master_group)
        master_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        master_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            # FIX: was string.letters, which does not exist in Python 3;
            # ascii_letters is identical here and works on both 2 and 3.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f"
              % (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)
        master_nodes = master_res.instances
        # FIX: previously reported the last *slave* zone; the master is
        # placed in opts.zone.
        print("Launched master in %s, regid = %s" % (opts.zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") zoo_group = get_or_make_group(conn, cluster_name + "-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups print "Checking for running cluster..." 
reservations = conn.get_all_instances() for res in reservations: group_names = [g.id for g in res.groups] if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names: active = [i for i in res.instances if is_active(i)] if len(active) > 0: print >> stderr, ( "ERROR: There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=opts.zone, min_count=opts.slaves, max_count=opts.slaves, block_device_map=block_map) slave_nodes = slave_res.instances print "Launched slaves, regid = " + slave_res.id # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=opts.ft, max_count=opts.ft, block_device_map=block_map) master_nodes = master_res.instances print "Launched master, regid = " + master_res.id # Launch ZooKeeper nodes if required if opts.ft > 1: zoo_res = image.run(key_name=opts.key_pair, security_groups=[zoo_group], instance_type=opts.instance_type, placement=opts.zone, min_count=3, max_count=3, block_device_map=block_map) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) user_data_content = None if opts.user_data: with open(opts.user_data) as user_data_file: user_data_content = user_data_file.read() print "Setting up security groups..." if opts.security_group_prefix is None: master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") else: master_group = get_or_make_group(conn, opts.security_group_prefix + "-master") slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running with the cluster name 
existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data_content) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] outstanding_request_ids = [] for i in my_req_ids: if i in id_to_req: if id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) else: outstanding_request_ids.append(i) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer for request ids including %s" % ( len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10]) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data_content) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names # TODO: Add retry logic for tagging with name since it's used to identify a cluster. for master in master_nodes: name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id) for i in range(0, 5): try: master.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) for slave in slave_nodes: name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id) for i in range(0, 5): try: slave.add_tag(key='Name', value=name) except: print "Failed attempt %i of 5 to tag %s" % ((i + 1), name) if (i == 5): raise "Error - failed max attempts to add name tag" time.sleep(5) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") master_group.owner_id = os.getenv('EC2_USER_ID') slave_group = get_or_make_group(conn, cluster_name + "-slaves") slave_group.owner_id = os.getenv('EC2_USER_ID') zoo_group = get_or_make_group(conn, cluster_name + "-zoo") zoo_group.owner_id = os.getenv('EC2_USER_ID') if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') 
if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves, existing_zoos = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ( "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to logging.debug("Calling boto BlockDeviceMapping()...") block_map = BlockDeviceMapping() logging.debug(" Printing block_map..") #print block_map if opts.ebs_vol_size > 0: logging.debug("Calling boto EBSBlockDeviceType()...") device = EBSBlockDeviceType() #print "device: ", device device.size = opts.ebs_vol_size device.delete_on_termination = True device.ephemeral_name = "ephemeral0" #block_map["/dev/sdv"] = device #block_map["/dev/sdv"] = device block_map["/dev/vdb"] = device if opts.user_data_file != None: user_data_file = open(opts.user_data_file) try: opts.user_data = user_data_file.read() #print "user data (encoded) = ", opts.user_data finally: user_data_file.close() # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res 
= image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=opts.user_data) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=opts.user_data) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Launch ZooKeeper nodes if required if int(opts.ft) > 1: print "Running " + opts.ft + " zookeepers" zoo_res = image.run(key_name=opts.key_pair, security_groups=[zoo_group], instance_type=opts.instance_type, placement=opts.zone, min_count=3, max_count=3, block_device_map=block_map, user_data=opts.user_data) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def create_nodes(self, reactor, names, distribution, metadata=None):
    """
    Create nodes with the given names.

    :param reactor: The reactor.
    :param names: The names of the nodes.
    :type names: list of str
    :param str distribution: The name of the distribution to
        install on the nodes.
    :param dict metadata: Metadata to associate with the nodes.
    :return: A list of ``Deferred``s each firing with an INode
        when the corresponding node is created.   The list has
        the same order as :param:`names`.
    """
    # FIX: a mutable default argument ({}) is shared across calls;
    # default to None and build a fresh dict per call instead.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 8

    action = start_action(
        action_type=u"flocker:provision:aws:create_nodes",
        instance_count=len(names),
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    )
    with action.context():
        # Root EBS volume, deleted when the instance terminates.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )
        instances = self._run_nodes(
            count=len(names),
            image_id=images[0].id,
            size=size,
            diskmap=diskmap
        )

        def make_node(ignored, name, instance):
            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )

        results = []
        # izip_longest pads with None when fewer instances launched than
        # names were requested; those names become failed Deferreds.
        for name, instance in izip_longest(names, instances):
            if instance is None:
                results.append(fail(Exception("Could not run instance")))
            else:
                node_metadata = metadata.copy()
                node_metadata['Name'] = name
                d = self._async_get_node(reactor, instance, node_metadata)
                d = DeferredContext(d)
                d.addCallback(make_node, name, instance)
                results.append(d.result)
        action_completion = DeferredContext(DeferredList(results))
        action_completion.addActionFinish()
        # Individual results and errors should be consumed by the caller,
        # so we can leave action_completion alone now.
        return results
def _launch_wait_snapshot(
    self,
    ami,
    user_data,
    img_size=10,
    inst_type="m1.small",
    img_name=None,
    img_desc=None,
    remote_access_command=None,
):
    """Launch *ami*, wait for it to run and then stop itself, snapshot it
    into a new EBS-backed image, and return the new AMI id.

    :param ami: id of the AMI to launch.
    :param user_data: user-data script handed to the instance.
    :param img_size: root EBS volume size in GB.
    :param inst_type: EC2 instance type to launch.
    :param img_name: name for the resulting image.
    :param img_desc: description for the resulting image.
    :param remote_access_command: unused here; kept for interface
        compatibility with callers.
    :raises Exception: if the instance cannot be started, image creation
        fails, or the image never becomes available.
    """
    # Root volume for the launched instance.
    ebs_root = EBSBlockDeviceType()
    ebs_root.size = img_size
    ebs_root.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map["/dev/sda"] = ebs_root

    # Throwaway security group allowing ssh access to the instance.
    sgroup_name = "ec2helper-ssh-%x" % random.randrange(2 ** 32)
    self.create_sgroup(sgroup_name)

    # Now launch it
    self.log.debug("Starting %s in %s with as %s" % (ami, self.region.name, inst_type))
    reservation = self.conn.run_instances(
        ami,
        max_count=1,
        instance_type=inst_type,
        user_data=user_data,
        security_groups=[sgroup_name],
        block_device_map=block_map,
    )
    if len(reservation.instances) == 0:
        raise Exception("Attempt to start instance failed")
    self.instance = reservation.instances[0]

    wait_for_ec2_instance_state(self.instance, self.log, final_state="running", timeout=300)
    self.instance.add_tag("Name", resource_tag)
    self.log.debug("Instance (%s) is now running" % self.instance.id)
    self.log.debug("Public DNS will be: %s" % self.instance.public_dns_name)
    # The instance is expected to power itself off when its first-boot
    # work completes, so wait for "stopped" rather than ssh out.
    self.log.debug("Now waiting up to 30 minutes for instance to stop")
    wait_for_ec2_instance_state(self.instance, self.log, final_state="stopped", timeout=1800)

    # Snapshot
    self.log.debug("Creating a new EBS image from completed/stopped EBS instance")
    new_ami_id = self.conn.create_image(self.instance.id, img_name, img_desc)
    self.log.debug("boto creat_image call returned AMI ID: %s" % new_ami_id)
    self.log.debug("Waiting for newly generated AMI to become available")
    # As with launching an instance we have seen occasional issues when
    # trying to query this AMI right away - give it a moment to settle
    sleep(10)
    new_amis = self.conn.get_all_images([new_ami_id])
    new_ami = new_amis[0]
    timeout = 120
    interval = 10
    for i in range(timeout):
        new_ami.update()
        if new_ami.state == "available":
            new_ami.add_tag("Name", resource_tag)
            break
        elif new_ami.state == "failed":
            raise Exception("Amazon reports EBS image creation failed")
        self.log.debug(
            "AMI status (%s) is not 'available' - [%d of %d seconds]"
            % (new_ami.state, i * interval, timeout * interval)
        )
        sleep(interval)

    self.log.debug("Terminating/deleting instance")
    # FIX: "instance" was an unbound name here (NameError); the launched
    # instance is stored on self.instance.
    safe_call(self.instance.terminate, (), self.log)
    sleep(5)

    if new_ami.state != "available":
        raise Exception("Failed to produce an AMI ID")
    self.log.debug("SUCCESS: %s is now available for launch" % new_ami_id)
    return new_ami_id
def create_nodes(self, reactor, names, distribution, metadata=None):
    """
    Create nodes with the given names.

    :param reactor: The reactor.
    :param names: The names of the nodes.
    :type names: list of str
    :param str distribution: The name of the distribution to
        install on the nodes.
    :param dict metadata: Metadata to associate with the nodes.
    :return: A list of ``Deferred``s each firing with an INode
        when the corresponding node is created.   The list has
        the same order as :param:`names`.
    """
    # FIX: a mutable default argument ({}) is shared across calls;
    # default to None and build a fresh dict per call instead.
    if metadata is None:
        metadata = {}
    size = self._default_size
    disk_size = 8

    action = start_action(
        action_type=u"flocker:provision:aws:create_nodes",
        instance_count=len(names),
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    )
    with action.context():
        # Root EBS volume, deleted when the instance terminates.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )
        instances = self._run_nodes(count=len(names), image_id=images[0].id,
                                    size=size, diskmap=diskmap)

        def make_node(ignored, name, instance):
            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )

        results = []
        # izip_longest pads with None when fewer instances launched than
        # names were requested; those names become failed Deferreds.
        for name, instance in izip_longest(names, instances):
            if instance is None:
                results.append(fail(Exception("Could not run instance")))
            else:
                node_metadata = metadata.copy()
                node_metadata['Name'] = name
                d = self._async_get_node(reactor, instance, node_metadata)
                d = DeferredContext(d)
                d.addCallback(make_node, name, instance)
                results.append(d.result)
        action_completion = DeferredContext(DeferredList(results))
        action_completion.addActionFinish()
        # Individual results and errors should be consumed by the caller,
        # so we can leave action_completion alone now.
        return results
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) if opts.vpc_id is None: print "Setting up EC2-Classic security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize("tcp", 22, 22, "0.0.0.0/0") master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0") master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0") master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0") master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0") master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0") if opts.ganglia: master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0") if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize("tcp", 22, 22, "0.0.0.0/0") slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0") slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0") slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0") slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0") else: print "Setting up VPC security groups..." 
master_group = get_or_make_group(conn, cluster_name + "-master", vpc_id=opts.vpc_id) slave_group = get_or_make_group(conn, cluster_name + "-slaves", vpc_id=opts.vpc_id) if master_group.rules == []: # Group was just now created master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group) master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group) master_group.authorize("tcp", 22, 22, "0.0.0.0/0") master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0") master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0") master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0") master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0") master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0") if opts.ganglia: master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0") if slave_group.rules == []: # Group was just now created slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group) slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group) slave_group.authorize("tcp", 22, 22, "0.0.0.0/0") slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0") slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0") slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0") slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0") # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ( "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name) ) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Hack to set VPC private hostname //fix later user_data = """#!/bin/bash hostname $(curl http://169.254.169.254/latest/meta-data/local-hostname) """ # Launch slaves if opts.spot_price is not None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if opts.vpc_id is None: slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map, placement_group=opts.placement_group, ) if opts.vpc_id is not None: interface = ec2.networkinterface.NetworkInterfaceSpecification( device_index=0, subnet_id=opts.subnet_id, groups=[slave_group.id], associate_public_ip_address=True ) interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, count=num_slaves_this_zone, key_name=opts.key_pair, instance_type=opts.instance_type, block_device_map=block_map, network_interfaces=interfaces, user_data=user_data, placement_group=opts.placement_group, ) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: if opts.vpc_id is None: slave_res = image.run( key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, placement_group=opts.placement_group, ) if opts.vpc_id is not None: interface = ec2.networkinterface.NetworkInterfaceSpecification( device_index=0, subnet_id=opts.subnet_id, groups=[slave_group.id], associate_public_ip_address=True, ) interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface) slave_res = conn.run_instances( image_id=opts.ami, key_name=opts.key_pair, instance_type=opts.instance_type, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, 
network_interfaces=interfaces, user_data=user_data, placement_group=opts.placement_group, ) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == "all": opts.zone = random.choice(conn.get_all_zones()).name if opts.vpc_id is None: master_res = image.run( key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, ) if opts.vpc_id is not None: interface = ec2.networkinterface.NetworkInterfaceSpecification( device_index=0, subnet_id=opts.subnet_id, groups=[master_group.id], associate_public_ip_address=True ) interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface) master_res = conn.run_instances( image_id=opts.ami, key_name=opts.key_pair, instance_type=master_type, min_count=1, max_count=1, block_device_map=block_map, network_interfaces=interfaces, user_data=user_data, ) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Give the instances descriptive names for master in master_nodes: master.add_tag(key="Name", value="spark-{cn}-master-{iid}".format(cn=cluster_name, iid=master.id)) for slave in slave_nodes: slave.add_tag(key="Name", value="spark-{cn}-slave-{iid}".format(cn=cluster_name, iid=slave.id)) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Mesos/Hadoop/Hama cluster: set up per-role security groups,
    then launch slave, master and (optionally) ZooKeeper instances.

    Returns a tuple (master_nodes, slave_nodes, zoo_nodes) of boto Instance
    objects.  Exits the process on any unrecoverable setup error.
    """
    # Fail fast on the two credentials we cannot do without.
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    print "Setting up security groups..."
    # One security group per role; get_or_make_group reuses an existing group
    # of the same name if present.
    master_group = get_or_make_group(conn, cluster_name + "-master")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    slave_group.owner_id = os.getenv('EC2_USER_ID')
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    zoo_group.owner_id = os.getenv('EC2_USER_ID')
    # An empty rule list is taken to mean the group was just created, so it
    # still needs its ingress rules.
    if master_group.rules == []:  # Group was just now created
        # Unrestricted traffic between the cluster's own groups.
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama
        master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
        slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    existing_masters, existing_slaves, existing_zoos = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    # Refuse to launch over live slaves, or over a live master unless the
    # caller explicitly asked to reuse it.
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.emi])[0]
    except:
        print >> stderr, "Could not find emi " + opts.emi
        sys.exit(1)
    try:
        image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
    except:
        print >> stderr, "Could not find emi " + opts.emi_master
        sys.exit(1)
    # NOTE(review): image_zoo is only bound when opts.emi_zoo is non-empty,
    # yet it is used below whenever int(opts.ft) > 1 -- a NameError if the
    # two options disagree.  Confirm callers always pass an emi_zoo when ft>1.
    if (opts.emi_zoo != ""):
        try:
            image_zoo = conn.get_all_images(image_ids=[opts.emi_zoo])[0]
        except:
            print >> stderr, "Could not find emi " + opts.emi_zoo
            sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    logging.debug( "Calling boto BlockDeviceMapping()...")
    block_map = BlockDeviceMapping()
    logging.debug(" Printing block_map..")
    if opts.ebs_vol_size > 0:
        logging.debug("Calling boto EBSBlockDeviceType()...")
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        # Volume is destroyed together with the instance.
        device.delete_on_termination = True
        device.ephemeral_name = "ephemeral0"
        block_map["/dev/vdb"] = device
    # Optional user data, read once here and passed verbatim to every
    # launched instance.
    if opts.user_data_file != None:
        user_data_file = open(opts.user_data_file)
        try:
            opts.user_data = user_data_file.read()
        finally:
            user_data_file.close()
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    # Spread the requested slave count across the availability zones.
    for zone in zones:
        num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
        if num_slaves_this_zone > 0:
            slave_res = image.run(key_name = opts.key_pair,
                                  security_groups = [slave_group],
                                  instance_type = opts.instance_type,
                                  placement = zone,
                                  min_count = num_slaves_this_zone,
                                  max_count = num_slaves_this_zone,
                                  block_device_map = block_map,
                                  user_data = opts.user_data)
            slave_nodes += slave_res.instances
            print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
        i += 1
    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image_master.run(key_name = opts.key_pair,
                                      security_groups = [master_group],
                                      instance_type = master_type,
                                      placement = opts.zone,
                                      min_count = 1,
                                      max_count = 1,
                                      block_device_map = block_map,
                                      user_data = opts.user_data)
        master_nodes = master_res.instances
        # NOTE(review): 'zone' here is the loop variable left over from the
        # slave launch above, not opts.zone where the master was placed.
        print "Launched master in %s, regid = %s" % (zone, master_res.id)
    # Launch ZooKeeper nodes if required
    if int(opts.ft) > 1:
        print "Running " + opts.ft + " zookeepers"
        zoo_res = image_zoo.run(key_name = opts.key_pair,
                                security_groups = [zoo_group],
                                instance_type = opts.instance_type,
                                placement = opts.zone,
                                min_count = 3,
                                max_count = 3,
                                block_device_map = block_map,
                                user_data = opts.user_data)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) print "Setting up security groups..." if opts.one_security_group: master_group = get_or_make_group(conn, cluster_name + "-group") master_group.owner_id = os.getenv('EC2_USER_ID') slave_group = master_group zoo_group = master_group else: master_group = get_or_make_group(conn, cluster_name + "-master") master_group.owner_id = os.getenv('EC2_USER_ID') slave_group = get_or_make_group(conn, cluster_name + "-slaves") slave_group.owner_id = os.getenv('EC2_USER_ID') zoo_group = get_or_make_group(conn, cluster_name + "-zoo") zoo_group.owner_id = os.getenv('EC2_USER_ID') if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50031, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama master_group.authorize('tcp', 8020, 8020, '0.0.0.0/0') #hdfs HA nameservice master_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes master_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA master_group.authorize('tcp', 8021, 8021, '0.0.0.0/0') #jt HA master_group.authorize('tcp', 8018, 
8019, '0.0.0.0/0') #zkfc master_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui #If cohosted with zookeeper open necessary ports if opts.cohost: print "Opening additional ports for zookeeper... " master_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') master_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') master_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 80, 80, '0.0.0.0/0') #Also needed 8649 and 8651 but check if only for master if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI slave_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui slave_group.authorize('tcp', 31000, 32000, '0.0.0.0/0') #task tracker web ui if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') zoo_group.authorize('tcp', 8018, 8020, '0.0.0.0/0') #hdfs HA nameservic zoo_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes zoo_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA zoo_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui # Check if instances are already running in our groups 
# Grouped instances are instances that run on the same security group in order to allow communication # using private IPs and without DNS resolving existing_masters, existing_slaves, existing_zoos, existing_grouped = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master) or existing_grouped: print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.emi])[0] except: print >> stderr, "Could not find emi " + opts.emi sys.exit(1) try: image_master = conn.get_all_images(image_ids=[opts.emi_master])[0] except: print >> stderr, "Could not find emi " + opts.emi_master sys.exit(1) # Launch additional ZooKeeper nodes if required - ex: if mesos masters specified are 2 and the zoo_num=3 (default) if int(opts.ft) > 1: if(opts.cohost): zoo_num = str(int(opts.zoo_num) - int(opts.ft)) #extra zoo instances needed else: zoo_num = opts.zoo_num else: zoo_num = opts.zoo_num if (zoo_num > 0): if opts.emi_zoo == "": emi_zoo = opts.emi_master else: emi_zoo = opts.emi_zoo try: image_zoo = conn.get_all_images(image_ids=[emi_zoo])[0] except: print >> stderr, "Could not find emi " + emi_zoo sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to logging.debug( "Calling boto BlockDeviceMapping()...") block_map = BlockDeviceMapping() logging.debug(" Printing block_map..") #print block_map if opts.ebs_vol_size > 0: logging.debug("Calling boto EBSBlockDeviceType()...") device = EBSBlockDeviceType() #print "device: ", device device.size = opts.ebs_vol_size device.delete_on_termination = True device.ephemeral_name = "ephemeral0" #block_map["/dev/sdv"] = device #block_map["/dev/sdv"] = device block_map["/dev/vdb"] = device if opts.user_data_file != None: user_data_file = open(opts.user_data_file) try: 
opts.user_data = user_data_file.read() #print "user data (encoded) = ", opts.user_data finally: user_data_file.close() # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map, user_data = opts.user_data) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name print "Running " + opts.ft + " masters" master_res = image_master.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = opts.ft, max_count = opts.ft, block_device_map = block_map, user_data = opts.user_data) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) if(zoo_num > 0): print "Running additional " + zoo_num + " zookeepers" zoo_res = image_zoo.run(key_name = opts.key_pair, security_groups = [zoo_group], instance_type = opts.instance_type, placement = opts.zone, min_count = zoo_num, max_count = zoo_num, block_device_map = block_map, user_data = opts.user_data) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] if (opts.cohost): print "Zookeepers are co-hosted on mesos instances..." 
# Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch a Mesos cluster inside a dedicated VPC.

    Creates (or reuses) the VPC, subnet, internet gateway, route table and
    per-role security groups, then launches slaves (spot or on-demand),
    ft masters and, when ft > 1, three ZooKeeper nodes.

    Returns (master_nodes, slave_nodes, zoo_nodes); exits on setup errors.
    """
    # Wrap the plain EC2 connection with a VPC connection: VPC calls go
    # through conn.vpc and plain EC2 calls through conn.ec2 from here on.
    conn = AWSConnection(conn, VPCConnection(region=conn.region))
    print "Setting up VPC..."
    vpc = get_or_make_vpc(conn, cluster_name, 'mesos-vpc')
    print "Using vpc: %s" % (vpc.id)
    print "Setting up subnet..."
    subnet = get_or_make_subnet(conn, vpc.id, opts.zone, cluster_name, 'mesos-subnet')
    print "Using subnet: %s" % (subnet.id)
    # Add internet gateway to VPC.
    print "Creating internet gateway"
    ig = get_or_make_ig(conn, vpc.id, cluster_name, 'mesos-vpc')
    print "Using internet gateway: %s" % (ig.id)
    # Add a default route through the gateway so instances can reach the
    # internet.
    rt = get_or_make_rt(conn, vpc.id, cluster_name, 'mesos-rt')
    conn.vpc.create_route(rt.id, '0.0.0.0/0', gateway_id=ig.id)
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-masters")
    slave_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-slaves")
    zoo_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-zoo")
    # An empty rule list means the group was freshly created; populate it.
    if master_group.rules == []:  # Group was just now created
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.ec2.get_all_instances()
    for res in reservations:
        group_names = [g.name for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, ("ERROR: There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    if opts.ami == "latest":
        # Figure out the latest AMI from our static URL
        try:
            opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
        except:
            # Best effort only: on failure we fall through and let the image
            # lookup below report the unusable AMI id.
            print >> stderr, "Could not read " + LATEST_AMI_URL
    try:
        image = conn.ec2.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)
    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        # Volume is destroyed together with the instance.
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device
    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price))
        slave_reqs = conn.ec2.request_spot_instances(
            price = opts.spot_price,
            image_id = opts.ami,
            launch_group = "launch-group-%s" % cluster_name,
            placement = opts.zone,
            count = opts.slaves,
            key_name = opts.key_pair,
            security_groups = [slave_group],
            instance_type = opts.instance_type,
            block_device_map = block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        # Poll every 10s until every request in my_req_ids is active.
        # NOTE(review): the next two calls go through the wrapper (conn.*)
        # while the rest of this function uses conn.ec2.* -- presumably
        # AWSConnection delegates unknown attributes to EC2; verify.
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                # NOTE(review): no "i in id_to_req" guard here (sibling
                # variants have one); a request missing from the listing
                # would raise KeyError.
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = conn.ec2.run_instances(opts.ami,
                                           key_name = opts.key_pair,
                                           subnet_id = subnet.id,
                                           security_group_ids = [slave_group.id],
                                           instance_type = opts.instance_type,
                                           placement = opts.zone,
                                           min_count = opts.slaves,
                                           max_count = opts.slaves,
                                           block_device_map = block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id
    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    # Masters get an explicit network interface so they receive a public IP.
    interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnet.id, groups=[master_group.id], associate_public_ip_address=True)
    interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
    master_res = conn.ec2.run_instances(opts.ami,
                                        key_name = opts.key_pair,
                                        instance_type = master_type,
                                        placement = opts.zone,
                                        network_interfaces = interfaces,
                                        min_count = opts.ft,
                                        max_count = opts.ft,
                                        block_device_map = block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id
    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = conn.ec2.run_instances(opts.ami,
                                         key_name = opts.key_pair,
                                         subnet_id = subnet.id,
                                         security_group_ids = [zoo_group.id],
                                         instance_type = opts.instance_type,
                                         placement = opts.zone,
                                         min_count = 3,
                                         max_count = 3,
                                         block_device_map = block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []
    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): # Remove known hosts to avoid "Offending key for IP ..." errors. known_hosts = os.environ["HOME"] + "/.ssh/known_hosts" if os.path.isfile(known_hosts): os.remove(known_hosts) if opts.key_pair is None: opts.key_pair = keypair() if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) if opts.profile is None: opts.profile = profile() if opts.profile is None: print >> stderr, "ERROR: No profile found in current host. It be provided with -p option." sys.exit(1) public_key = pub_key() user_data = Template( """#!/bin/bash set -e -x echo '$public_key' >> ~root/.ssh/authorized_keys echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""" ).substitute(public_key=public_key) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=sparknotebook_group) master_group.authorize("tcp", 22, 22, "0.0.0.0/0") master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") master_group.authorize("tcp", 18080, 18080, "0.0.0.0/0") master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0") master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0") master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0") master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0") master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0") master_group.authorize("tcp", 7077, 7077, "0.0.0.0/0") if opts.ganglia: master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0") if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) 
slave_group.authorize(src_group=sparknotebook_group) slave_group.authorize("tcp", 22, 22, "0.0.0.0/0") slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0") slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0") slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0") slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0") slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0") if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group) if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ( "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name) ) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: best_price = find_best_price(conn, opts.instance_type, zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ( "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" % (opts.slaves, best_price, opts.slaves * best_price) ) num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True ) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) slave_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces, ) my_req_ids += [req.id for req in slave_reqs] i += 1 print >> stderr, "Waiting for spot instances to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print >> stderr, "All %d slaves granted" % opts.slaves reservations = 
conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run( key_name=opts.key_pair, security_group_ids=[slave_group.id], instance_type=opts.instance_type, subnet_id=subnetId(), placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, ) slave_nodes += slave_res.instances print >> stderr, "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == "all": opts.zone = random.choice(conn.get_all_zones()).name if opts.spot_price != None: best_price = find_best_price(conn, master_type, opts.zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ("Requesting master as spot instances with price $%.3f/hour" % (best_price)) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True ) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface) master_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=opts.zone, count=1, key_name=opts.key_pair, instance_type=master_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces, ) my_req_ids = [r.id for r in master_reqs] print >> stderr, "Waiting for spot instance to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids) id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == 1: print >> stderr, "Master granted" reservations = conn.get_all_instances(active_instance_ids) master_nodes = [] for r in reservations: master_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d masters granted, waiting longer" % (len(active_instance_ids), 1) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: 
(master_nodes, master_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(master_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: master_res = image.run( key_name=opts.key_pair, security_group_ids=[master_group.id], instance_type=master_type, subnet_id=subnetId(), placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, ) master_nodes = master_res.instances print >> stderr, "Launched master in %s, regid = %s" % (zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") zoo_group = get_or_make_group(conn, cluster_name + "-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups print "Checking for running cluster..." 
reservations = conn.get_all_instances() for res in reservations: group_names = [g.id for g in res.groups] if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names: active = [i for i in res.instances if is_active(i)] if len(active) > 0: print >> stderr, ("ERROR: There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = opts.zone, count = opts.slaves, key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids = [req.id for req in slave_reqs] print "Waiting for spot instances to be granted..." 
while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active = 0 instance_ids = [] for i in my_req_ids: if id_to_req[i].state == "active": active += 1 instance_ids.append(id_to_req[i].instance_id) if active == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % (active, opts.slaves) else: # Launch non-spot instances slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = opts.zone, min_count = opts.slaves, max_count = opts.slaves, block_device_map = block_map) slave_nodes = slave_res.instances print "Launched slaves, regid = " + slave_res.id # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type master_res = image.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = opts.ft, max_count = opts.ft, block_device_map = block_map) master_nodes = master_res.instances print "Launched master, regid = " + master_res.id # Launch ZooKeeper nodes if required if opts.ft > 1: zoo_res = image.run(key_name = opts.key_pair, security_groups = [zoo_group], instance_type = opts.instance_type, placement = opts.zone, min_count = 3, max_count = 3, block_device_map = block_map) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): #Remove known hosts to avoid "Offending key for IP ..." errors. known_hosts = os.environ['HOME'] + "/.ssh/known_hosts" if os.path.isfile(known_hosts): os.remove(known_hosts) if opts.key_pair is None: opts.key_pair = keypair() if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) if opts.profile is None: opts.profile = profile() if opts.profile is None: print >> stderr, "ERROR: No profile found in current host. It be provided with -p option." sys.exit(1) public_key = pub_key() user_data = Template("""#!/bin/bash set -e -x echo '$public_key' >> ~root/.ssh/authorized_keys echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute( public_key=public_key) print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=sparknotebook_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0') master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=sparknotebook_group) 
slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group) if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id): sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group) sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group) # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: best_price = find_best_price(conn, opts.instance_type, zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ( "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)" % (opts.slaves, best_price, opts.slaves * best_price)) num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection( interface) slave_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, instance_type=opts.instance_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces) my_req_ids += [req.id for req in slave_reqs] i += 1 print >> stderr, "Waiting for spot instances to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print >> stderr, "All %d slaves granted" % opts.slaves reservations = 
conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_group_ids=[slave_group.id], instance_type=opts.instance_type, subnet_id=subnetId(), placement=zone, min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile) slave_nodes += slave_res.instances print >> stderr, "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name if opts.spot_price != None: best_price = find_best_price(conn, master_type, opts.zone, opts.spot_price) # Launch spot instances with the requested price print >> stderr, ( "Requesting master as spot instances with price $%.3f/hour" % (best_price)) interface = boto.ec2.networkinterface.NetworkInterfaceSpecification( subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True) interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection( interface) master_reqs = conn.request_spot_instances( price=best_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=opts.zone, count=1, key_name=opts.key_pair, instance_type=master_type, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile, network_interfaces=interfaces) my_req_ids = [r.id for r in master_reqs] print >> stderr, "Waiting for spot instance to be granted" try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests( request_ids=my_req_ids) id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append( id_to_req[i].instance_id) if len(active_instance_ids) == 1: print >> stderr, "Master granted" reservations = conn.get_all_instances( active_instance_ids) master_nodes = [] for r in reservations: master_nodes += r.instances break else: # print >> stderr, ".", print "%d of %d masters granted, waiting longer" % ( len(active_instance_ids), 1) except: print >> stderr, "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched 
instances: (master_nodes, master_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(master_nodes) if running: print >> stderr, ( "WARNING: %d instances are still running" % running) sys.exit(0) else: master_res = image.run(key_name=opts.key_pair, security_group_ids=[master_group.id], instance_type=master_type, subnet_id=subnetId(), placement=opts.zone, min_count=1, max_count=1, block_device_map=block_map, user_data=user_data, instance_profile_arn=opts.profile) master_nodes = master_res.instances print >> stderr, "Launched master in %s, regid = %s" % ( zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name):
    """Launch (or resume) a cluster of EC2 instances on default Ubuntu AMIs.

    Sets up the master/slave security groups (SSH only), launches the slave
    fleet (spot or on-demand, spread over the requested zones), launches or
    restarts the master, waits for AWS metadata propagation, and tags every
    instance with a descriptive ``Name`` plus any ``opts.additional_tags``.

    :param conn:         boto EC2 connection.
    :param opts:         parsed options (identity_file, key_pair, vpc_id,
                         region, ami, slaves, spot_price, zone, ...).
    :param cluster_name: prefix for security-group names and instance tags.
    :return: tuple ``(master_nodes, slave_nodes)`` of boto Instance lists.

    Fix vs. previous revision: the "Launched master" message printed the
    stale slave-loop variable ``zone``; the master is placed in ``opts.zone``.
    """
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)
    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    user_data_content = None

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Use the default Ubuntu AMI for the selected region.
    if opts.ami is None:
        if opts.region == "us-east-1":
            opts.ami = "ami-2d39803a"
        elif opts.region == "us-west-1":
            opts.ami = "ami-06116566"
        elif opts.region == "us-west-2":
            opts.ami = "ami-9abea4fb"
        elif opts.region == "eu-west-1":
            opts.ami = "ami-f95ef58a"
        elif opts.region == "eu-central-1":
            opts.ami = "ami-87564feb"
        elif opts.region == "ap-northeast-1":
            opts.ami = "ami-a21529cc"
        elif opts.region == "ap-northeast-2":
            opts.ami = "ami-09dc1267"
        elif opts.region == "ap-southeast-1":
            opts.ami = "ami-25c00c46"
        elif opts.region == "ap-southeast-2":
            opts.ami = "ami-6c14310f"
        elif opts.region == "ap-south-1":
            opts.ami = "ami-4a90fa25"
        elif opts.region == "sa-east-1":
            opts.ami = "ami-0fb83963"
        else:
            raise Exception("The specified region is unknown.")

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [
            sg.id for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]

    print("Launching instances...")
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            # Poll until all requests are active; Ctrl-C (or any error) drops
            # into the except branch, which cancels the outstanding requests.
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(
                        active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.
                    instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                    s=num_slaves_this_zone,
                    plural_s=('' if num_slaves_this_zone == 1 else 's'),
                    z=zone,
                    r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.
            instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)
        master_nodes = master_res.instances
        # BUGFIX: report the master's actual placement (opts.zone) instead of
        # the leftover slave-loop variable `zone`.
        print("Launched master in %s, regid = %s" % (opts.zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1))
            for tag in opts.additional_tags.split(','))
    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name,
                                                 iid=master.id)))
    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                iid=slave.id)))

    # Return all the instances
    return (master_nodes, slave_nodes)
def _launch_wait_snapshot(self, ami, user_data, img_size = 10, img_name = None, img_desc = None, remote_access_command = None):
    """Launch *ami* as an EBS-backed instance, wait for it to run and then
    stop on its own, snapshot it into a new AMI, and return the new AMI id.

    The instance is expected to do its work via *user_data* and shut itself
    down; this method blocks until that happens (up to 30 minutes), then
    calls ``create_image`` on the stopped instance and waits for the
    resulting AMI to become "available".

    :param ami:       id of the source AMI to launch.
    :param user_data: cloud-init / user-data payload passed to the instance.
    :param img_size:  root EBS volume size in GB (default 10).
    :param img_name:  name for the image created from the stopped instance.
    :param img_desc:  description for the created image.
    :param remote_access_command: unused here — TODO confirm whether callers
        rely on it (possibly consumed by a subclass or a later revision).
    :raises Exception: if no instance starts, if Amazon reports the image
        build failed, or if the AMI never reaches "available".
    """
    rand_id = random.randrange(2**32)
    # Modified from code taken from Image Factory
    # Create security group
    # NOTE(review): this temporary group (and self.security_group /
    # self.instance stored on the object) is not cleaned up in this method —
    # presumably a caller or cleanup hook deletes it; verify.
    security_group_name = "ebs-helper-vnc-tmp-%x" % (rand_id)
    security_group_desc = "Temporary security group with SSH access generated by EBSHelper python object"
    self.log.debug("Creating temporary security group (%s)" % (security_group_name))
    self.security_group = self.conn.create_security_group(security_group_name, security_group_desc)
    # Open SSH plus a VNC port range to the world for remote access/debugging.
    self.security_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    self.security_group.authorize('tcp', 5900, 5950, '0.0.0.0/0')

    # Root volume: EBS-backed so the stopped instance can be imaged, and
    # deleted automatically when the instance terminates.
    ebs_root = EBSBlockDeviceType()
    ebs_root.size=img_size
    ebs_root.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map['/dev/sda'] = ebs_root

    # Now launch it
    instance_type="m1.small"
    self.log.debug("Starting ami %s in region %s with instance_type %s" % (ami, self.region.name, instance_type))
    reservation = self.conn.run_instances(ami, max_count=1, instance_type=instance_type, user_data = user_data, security_groups = [ security_group_name ], block_device_map = block_map)

    # I used to have a check for more than one instance here -- but that would be a profound bug in boto
    if len(reservation.instances) == 0:
        raise Exception("Attempt to start instance failed")

    self.instance = reservation.instances[0]
    wait_for_ec2_instance_state(self.instance, self.log, final_state='running', timeout=300)
    self.log.debug("Instance (%s) is now running" % self.instance.id)
    self.log.debug("Public DNS will be: %s" % self.instance.public_dns_name)

    # The instance is expected to stop itself when its user-data script
    # finishes; wait for that transition (up to 30 minutes).
    self.log.debug("Now waiting up to 30 minutes for instance to stop")
    wait_for_ec2_instance_state(self.instance, self.log, final_state='stopped', timeout=1800)

    # Snapshot
    self.log.debug("Creating a new EBS backed image from completed/stopped EBS instance")
    new_ami_id = self.conn.create_image(self.instance.id, img_name, img_desc)
    self.log.debug("boto creat_image call returned AMI ID: %s" % (new_ami_id))
    self.log.debug("Waiting for newly generated AMI to become available")
    # As with launching an instance we have seen occasional issues when trying to query this AMI right
    # away - give it a moment to settle
    sleep(10)
    new_amis = self.conn.get_all_images([ new_ami_id ])
    new_ami = new_amis[0]
    timeout = 120
    interval = 10
    # NOTE(review): this loops `timeout` (120) times sleeping `interval`
    # (10 s) each, i.e. a worst case of timeout*interval = 1200 s, which is
    # what the log message's denominator reports — confirm the 120/10 split
    # is intentional and not meant to be range(timeout // interval).
    for i in range(timeout):
        new_ami.update()
        if new_ami.state == "available":
            break
        elif new_ami.state == "failed":
            raise Exception("Amazon reports EBS image creation failed")
        self.log.debug("AMI status (%s) - waiting for 'available' - [%d of %d seconds elapsed]" % (new_ami.state, i * interval, timeout * interval))
        sleep(interval)

    # The source instance is no longer needed regardless of the outcome.
    self.log.debug("Terminating/deleting instance")
    terminate_instance(self.instance)

    if new_ami.state != "available":
        raise Exception("Failed to produce an AMI ID")

    self.log.debug("SUCCESS: %s is now available for launch" % (new_ami_id))
    return new_ami_id
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) if existing_slaves or (existing_masters and not opts.use_existing_master): print >> stderr, ("ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)) sys.exit(1) # Figure out Spark AMI if opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price=opts.spot_price, image_id=opts.ami, launch_group="launch-group-%s" % cluster_name, placement=zone, count=num_slaves_this_zone, key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, block_device_map=block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name=opts.key_pair, security_groups=[slave_group], instance_type=opts.instance_type, placement=zone, instance_profile_name="spark-node", min_count=num_slaves_this_zone, max_count=num_slaves_this_zone, block_device_map=block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % ( num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name=opts.key_pair, security_groups=[master_group], instance_type=master_type, placement=opts.zone, min_count=1, max_count=1, instance_profile_name="spark-node", block_device_map=block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." input_group = get_or_make_group(conn, cluster_name + "-input") compute_group = get_or_make_group(conn, cluster_name + "-compute") if input_group.rules == []: # Group was just now created input_group.authorize(src_group=input_group) input_group.authorize(src_group=compute_group) input_group.authorize('tcp', 22, 22, '0.0.0.0/0') input_group.authorize('tcp', 4000, 4000, '0.0.0.0/0') input_group.authorize('tcp', 4001, 4001, '0.0.0.0/0') if compute_group.rules == []: # Group was just now created compute_group.authorize(src_group=input_group) compute_group.authorize(src_group=compute_group) compute_group.authorize('tcp', 22, 22, '0.0.0.0/0') compute_group.authorize('tcp', 4000, 4000, '0.0.0.0/0') compute_group.authorize('tcp', 4001, 4001, '0.0.0.0/0') compute_group.authorize('tcp', 5001, 5001, '0.0.0.0/0') # Check if instances are already running in our groups active_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if any(active_nodes): print >> stderr, ("ERROR: There are already instances running in " + "group %s, %s or %s" % (input_group.name, compute_group.name)) sys.exit(1) # CHANGE THIS IF CHANGING REGIONS opts.ami = 'ami-d76605be' print "Launching instances..." 
try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device launch_groups = opts.compute_groups + 1 # Launch compute nodes if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d compute nodes as spot instances with price $%.3f" % (launch_groups * opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(launch_groups * opts.slaves, num_zones, i) compute_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = zone, count = num_slaves_this_zone, key_name = opts.key_pair, security_groups = [compute_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids += [req.id for req in compute_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves * launch_groups: print "All %d compute nodes granted" %(opts.slaves * launch_groups) reservations = conn.get_all_instances(active_instance_ids) compute_nodes = [] for r in reservations: compute_nodes += r.instances break else: print "%d of %d compute nodes granted, waiting longer" % ( len(active_instance_ids), opts.slaves * launch_groups) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (input_nodes, compute_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(input_nodes) + len(compute_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 compute_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves * launch_groups, num_zones, i) if num_slaves_this_zone > 0: compute_res = image.run(key_name = opts.key_pair, security_groups = [compute_group], instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map) compute_nodes += compute_res.instances print "Launched %d compute nodes in %s, regid = %s" % (num_slaves_this_zone, zone, compute_res.id) i += 1 # Launch input nodes input_type = opts.instance_type if input_type == "": input_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name input_res = image.run(key_name = opts.key_pair, security_groups = [input_group], instance_type = input_type, placement = 
opts.zone, min_count = 1, max_count = 1, block_device_map = block_map) input_nodes = input_res.instances print "Launched input in %s, regid = %s" % (zone, input_res.id) # Return all the instances return (input_nodes, compute_nodes)
def create_node(self, name, distribution, size=None, disk_size=8,
                metadata=None):
    """Provision a single EC2 instance and wrap it as an ``AWSNode``.

    Runs an instance of the AMI registered for *distribution* with an
    8 GB (by default) delete-on-termination root EBS volume, tags it with
    *metadata* plus a ``Name`` tag, and blocks until it is running.

    :param name:         value for the instance's ``Name`` tag.
    :param distribution: key into ``IMAGE_NAMES`` selecting the AMI.
    :param size:         EC2 instance type; defaults to
                         ``self._default_size`` when ``None``.
    :param disk_size:    root volume size in GB.
    :param metadata:     extra tags to apply; not mutated (``None``, the
                         idiomatic replacement for a mutable ``{}`` default,
                         means "no extra tags").
    :return: an ``AWSNode`` wrapping the running instance.
    """
    if size is None:
        size = self._default_size
    if metadata is None:
        metadata = {}

    with start_action(
        action_type=u"flocker:provision:aws:create_node",
        name=name,
        distribution=distribution,
        image_size=size,
        disk_size=disk_size,
        metadata=metadata,
    ):
        # Copy before adding Name so the caller's dict is never mutated.
        metadata = metadata.copy()
        metadata['Name'] = name

        # Root volume: sized per request, cleaned up with the instance.
        disk1 = EBSBlockDeviceType()
        disk1.size = disk_size
        disk1.delete_on_termination = True
        diskmap = BlockDeviceMapping()
        diskmap['/dev/sda1'] = disk1

        images = self._connection.get_all_images(
            filters={'name': IMAGE_NAMES[distribution]},
        )

        with start_action(
            action_type=u"flocker:provision:aws:create_node:run_instances",
        ) as context:
            reservation = self._connection.run_instances(
                images[0].id,
                key_name=self._keyname,
                instance_type=size,
                security_groups=self._security_groups,
                block_device_map=diskmap,
                placement=self._zone,
                # On some operating systems, a tty is requried for sudo.
                # Since AWS systems have a non-root user as the login,
                # disable this, so we can use sudo with conch.
                user_data=dedent("""\
                    #!/bin/sh
                    sed -i '/Defaults *requiretty/d' /etc/sudoers
                    """),
            )
            instance = reservation.instances[0]
            context.add_success_fields(instance_id=instance.id)

        self._connection.create_tags([instance.id], metadata)

        # Display state as instance starts up, to keep user informed that
        # things are happening.
        _wait_until_running(instance)

        return AWSNode(
            name=name,
            _provisioner=self,
            _instance=instance,
            distribution=distribution,
        )
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") zoo_group = get_or_make_group(conn, cluster_name + "-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups print "Checking for running cluster..." 
reservations = conn.get_all_instances() for res in reservations: group_names = [g.id for g in res.groups] if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names: active = [i for i in res.instances if is_active(i)] if len(active) > 0: print >> stderr, ("ERROR: There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = opts.zone, min_count = opts.slaves, max_count = opts.slaves, block_device_map = block_map) slave_nodes = slave_res.instances print "Launched slaves, regid = " + slave_res.id # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type master_res = image.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = opts.ft, max_count = opts.ft, block_device_map = block_map) master_nodes = master_res.instances print "Launched master, regid = " + master_res.id # Launch ZooKeeper nodes if required if opts.ft > 1: zoo_res = image.run(key_name = opts.key_pair, security_groups = [zoo_group], instance_type = opts.instance_type, placement = opts.zone, min_count = 3, max_count = 3, block_device_map = block_map) zoo_nodes = zoo_res.instances print "Launched zoo, regid = " + zoo_res.id else: zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, 
zoo_nodes)
def launch_cluster(conn, opts, cluster_name): if opts.identity_file is None: print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections." sys.exit(1) if opts.key_pair is None: print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances." sys.exit(1) #print "Setting up security groups..." #master_group = get_or_make_group(conn, cluster_name + "-master") #slave_group = get_or_make_group(conn, cluster_name + "-slaves") #if master_group.rules == []: # Group was just now created # master_group.authorize(src_group=master_group) # master_group.authorize(src_group=slave_group) # master_group.authorize('tcp', 22, 22, '0.0.0.0/0') # master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') # master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0') # master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') # master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') # master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') # master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0') # if opts.ganglia: # master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') #if slave_group.rules == []: # Group was just now created # slave_group.authorize(src_group=master_group) # slave_group.authorize(src_group=slave_group) # slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') # slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') # slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') # slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') # slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') # slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # Check if instances are already running in our groups existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) #if existing_slaves or (existing_masters and not opts.use_existing_master): # print >> stderr, ("ERROR: There are already instances running in " + # "group %s or %s" % (master_group.name, slave_group.name)) # sys.exit(1) # Figure out Spark AMI if 
opts.ami is None: opts.ami = get_spark_ami(opts) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = zone, count = num_slaves_this_zone, key_name = opts.key_pair, #security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name = opts.key_pair, security_group_ids = ["sg-87956be2","sg-1ac33f7f", "sg-1ec33f7b"], subnet_id = "subnet-4182b007", instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch or resume masters if existing_masters: print "Starting master..." 
for inst in existing_masters: if inst.state not in ["shutting-down", "terminated"]: inst.start() master_nodes = existing_masters else: master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name = opts.key_pair, security_group_ids = ["sg-bd956bd8","sg-1ac33f7f", "sg-1ec33f7b"], subnet_id = "subnet-4182b007", instance_type = master_type, placement = opts.zone, min_count = 1, max_count = 1, block_device_map = block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) # Return all the instances return (master_nodes, slave_nodes)
def create_ami(region, snap_id, force=None, root_dev='/dev/sda1', zone_name=None,
               default_arch=None, default_type='t1.micro', security_groups=''):
    """Register a new AMI from the snapshot *snap_id* in *region*.

    The snapshot's JSON description supplies the original instance's
    architecture and root device name. Sibling snapshots of the same
    instance taken within 10 minutes (but for other devices, e.g.
    /dev/sdb) are attached to the image's block device mapping as well.

    region, snap_id
        identify the snapshot to process;
    force
        pass "RUN" to also launch an instance from the new AMI without
        prompting;
    default_arch
        architecture to assume when the snapshot description lacks one;
    default_type
        instance type used when ``force`` is "RUN" and the description
        lacks one;
    security_groups
        semicolon-separated AWS Security Group names, used only when
        ``force`` is "RUN".

    Returns ``(image, new_instance)`` where ``new_instance`` is ``None``
    unless ``force == 'RUN'``.
    """
    ec2 = get_region_conn(region)
    boot_snap = ec2.get_all_snapshots(snapshot_ids=[snap_id, ])[0]
    src_instance = get_snap_instance(boot_snap)
    boot_device = get_snap_device(boot_snap)

    # Collect this instance's other-device snapshots taken within a
    # 10-minute window of the boot snapshot, newest first.
    owned = ec2.get_all_snapshots(owner='self')
    siblings = [cand for cand in owned
                if get_snap_instance(cand) == src_instance
                and get_snap_device(cand) != boot_device
                and abs(get_snap_time(boot_snap) -
                        get_snap_time(cand)) <= timedelta(minutes=10)]
    extra_snaps = (sorted(siblings, key=get_snap_time, reverse=True)
                   if siblings else None)

    # Setup for building an EBS boot snapshot.
    default_arch = default_arch or config.get('DEFAULT', 'ARCHITECTURE')
    arch = get_descr_attr(boot_snap, 'Arch') or default_arch
    kernel = config.get(ec2.region.name, 'KERNEL' + arch.upper())
    # A bare /dev/sda root (no partition suffix) marks an encrypted
    # instance, which needs the matching encryption-aware kernel.
    if re.match(r'^/dev/sda$', boot_device):
        kernel = config.get(ec2.region.name, 'KERNEL_ENCR_' + arch.upper())

    # Root volume restored from the snapshot; cleaned up on termination.
    root_bdt = EBSBlockDeviceType()
    root_bdt.snapshot_id = snap_id
    root_bdt.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map[boot_device] = root_bdt
    # Ephemeral (instance-store) disk on /dev/sdb.
    eph_bdt = BlockDeviceType()
    eph_bdt.ephemeral_name = 'ephemeral0'
    block_map['/dev/sdb'] = eph_bdt
    if extra_snaps:
        for extra in extra_snaps:
            extra_bdt = EBSBlockDeviceType()
            extra_bdt.delete_on_termination = True
            extra_bdt.snapshot_id = extra.id
            block_map[get_snap_device(extra)] = extra_bdt

    ami_name = 'Created {0} using access key {1}'.format(timestamp(),
                                                         ec2.access_key)
    ami_name = ami_name.replace(":", ".").replace(" ", "_")

    # Create the new AMI, all options from the snapshot JSON description.
    wait_for(boot_snap, '100%', limit=SNAP_TIME)
    ami_id = ec2.register_image(
        name=ami_name,
        description=boot_snap.description,
        architecture=get_descr_attr(boot_snap, 'Arch') or default_arch,
        root_device_name=get_descr_attr(boot_snap, 'Root_dev_name') or root_dev,
        block_device_map=block_map,
        kernel_id=kernel)
    sleep(2)
    image = ec2.get_all_images(image_ids=[ami_id, ])[0]
    wait_for(image, 'available', limit=10 * 60)
    add_tags(image, boot_snap.tags)
    logger.info('The new AMI ID = {0}'.format(ami_id))

    new_instance = None
    if force == 'RUN':
        inst_type = get_descr_attr(boot_snap, 'Type') or default_type
        new_instance = launch_instance_from_ami(
            region, image.id, inst_type=inst_type,
            security_groups=security_groups, zone_name=zone_name)
    return image, new_instance
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, cluster_name + "-master") slave_group = get_or_make_group(conn, cluster_name + "-slaves") zoo_group = get_or_make_group(conn, cluster_name + "-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') if opts.cluster_type == "mesos": master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if opts.ganglia: master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups active_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False) if any(active_nodes): print >> stderr, ("ERROR: 
There are already instances running in " + "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name)) sys.exit(1) # Figure out the latest AMI from our static URL if opts.ami == "latest": try: opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip() print "Latest Spark AMI: " + opts.ami except: print >> stderr, "Could not read " + LATEST_AMI_URL sys.exit(1) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) zones = get_zones(conn, opts) num_zones = len(zones) i = 0 my_req_ids = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = zone, count = num_slaves_this_zone, key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids += [req.id for req in slave_reqs] i += 1 print "Waiting for spot instances to be granted..." 
try: while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active_instance_ids = [] for i in my_req_ids: if i in id_to_req and id_to_req[i].state == "active": active_instance_ids.append(id_to_req[i].instance_id) if len(active_instance_ids) == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(active_instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % ( len(active_instance_ids), opts.slaves) except: print "Canceling spot instance requests" conn.cancel_spot_instance_requests(my_req_ids) # Log a warning if any of these requests actually launched instances: (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster( conn, opts, cluster_name, die_on_error=False) running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes) if running: print >> stderr, ("WARNING: %d instances are still running" % running) sys.exit(0) else: # Launch non-spot instances zones = get_zones(conn, opts) num_zones = len(zones) i = 0 slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) if num_slaves_this_zone > 0: slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = zone, min_count = num_slaves_this_zone, max_count = num_slaves_this_zone, block_device_map = block_map) slave_nodes += slave_res.instances print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id) i += 1 # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type if opts.zone == 'all': opts.zone = random.choice(conn.get_all_zones()).name master_res = image.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = 1, max_count = 1, block_device_map = 
block_map) master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)
def launch_cluster(conn, opts, cluster_name): print "Setting up security groups..." master_group = get_or_make_group(conn, "shark-exp-master") slave_group = get_or_make_group(conn, "shark-exp-slaves") zoo_group = get_or_make_group(conn, "ampcamp-zoo") if master_group.rules == []: # Group was just now created master_group.authorize(src_group=master_group) master_group.authorize(src_group=slave_group) master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') if opts.cluster_type == "mesos": master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') # hbase master_group.authorize('tcp', 60010, 60010, '0.0.0.0/0') master_group.authorize('tcp', 60050, 60050, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') if opts.cluster_type == "mesos": slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') # hbase slave_group.authorize('tcp', 60050, 60050, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) zoo_group.authorize(src_group=zoo_group) zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0') zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0') zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0') zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') # Check if instances are already running in our groups print 
"Checking for running cluster..." reservations = conn.get_all_instances() for res in reservations: for instance in res.instances: if 'tags' in instance.__dict__ and 'cluster' in instance.tags: if instance.tags['cluster'] == cluster_name and is_active(instance): print >> stderr, ("ERROR: Instances %s is already running in cluster %s" % (instance.id, cluster_name)) sys.exit(1) if opts.ami in ["latest", "standalone"]: opts.ami = get_ami(opts.ami) print "Launching instances..." try: image = conn.get_all_images(image_ids=[opts.ami])[0] except: print >> stderr, "Could not find AMI " + opts.ami sys.exit(1) # Create block device mapping so that we can add an EBS volume if asked to block_map = BlockDeviceMapping() if opts.ebs_vol_size > 0: device = EBSBlockDeviceType() device.size = opts.ebs_vol_size device.delete_on_termination = True block_map["/dev/sdv"] = device # Launch slaves if opts.spot_price != None: # Launch spot instances with the requested price print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price)) slave_reqs = conn.request_spot_instances( price = opts.spot_price, image_id = opts.ami, launch_group = "launch-group-%s" % cluster_name, placement = opts.zone, count = opts.slaves, key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, block_device_map = block_map) my_req_ids = [req.id for req in slave_reqs] print "Waiting for spot instances to be granted..." 
while True: time.sleep(10) reqs = conn.get_all_spot_instance_requests() id_to_req = {} for r in reqs: id_to_req[r.id] = r active = 0 instance_ids = [] for i in my_req_ids: if id_to_req[i].state == "active": active += 1 instance_ids.append(id_to_req[i].instance_id) if active == opts.slaves: print "All %d slaves granted" % opts.slaves reservations = conn.get_all_instances(instance_ids) slave_nodes = [] for r in reservations: slave_nodes += r.instances break else: print "%d of %d slaves granted, waiting longer" % (active, opts.slaves) else: # Launch non-spot instances slave_res = image.run(key_name = opts.key_pair, security_groups = [slave_group], instance_type = opts.instance_type, placement = opts.zone, min_count = opts.slaves, max_count = opts.slaves, block_device_map = block_map) slave_nodes = slave_res.instances print "Launched slaves, regid = " + slave_res.id # Launch masters master_type = opts.master_instance_type if master_type == "": master_type = opts.instance_type master_res = image.run(key_name = opts.key_pair, security_groups = [master_group], instance_type = master_type, placement = opts.zone, min_count = 1, max_count = 1, block_device_map = block_map) master_nodes = master_res.instances print "Launched master, regid = " + master_res.id # Create the right tags tags = {} tags['cluster'] = cluster_name tags['type'] = 'slave' for node in slave_nodes: conn.create_tags([node.id], tags) tags['type'] = 'master' for node in master_nodes: conn.create_tags([node.id], tags) zoo_nodes = [] # Return all the instances return (master_nodes, slave_nodes, zoo_nodes)