Example #1
    def _register_image(self, snapshot_id):
        conn = self.platform.new_ec2_conn()

        instance_id = self.platform.get_instance_id()
        instance = conn.get_all_instances([instance_id])[0].instances[0]

        block_device_map = BlockDeviceMapping(conn)

        root_vol = EBSBlockDeviceType(snapshot_id=snapshot_id)
        root_vol.delete_on_termination = True
        # Adding ephemeral devices
        for eph, device in EPH_STORAGE_MAPPING[linux.os['arch']].items():
            bdt = EBSBlockDeviceType(conn)
            bdt.ephemeral_name = eph
            block_device_map[device] = bdt

        root_partition = instance.root_device_name[:-1]
        if root_partition in self.platform.get_block_device_mapping().values():
            block_device_map[root_partition] = root_vol
        else:
            block_device_map[instance.root_device_name] = root_vol

        return conn.register_image(
            name=self.image_name,
            root_device_name=instance.root_device_name,
            block_device_map=block_device_map,
            kernel_id=instance.kernel,
            virtualization_type=instance.virtualization_type,
            ramdisk_id=self.platform.get_ramdisk_id(),
            architecture=instance.architecture)
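A minimal, self-contained sketch (boto 2.x assumed) of the kind of mapping the method above builds: an EBS root restored from a snapshot plus one ephemeral device. ec2_build_list_params is the hook boto's register_image()/run_instances() use internally to serialize the mapping into EC2 query parameters; the printed parameter names are illustrative.

from boto.ec2.blockdevicemapping import (
    BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType)

bdm = BlockDeviceMapping()

root = EBSBlockDeviceType(snapshot_id='snap-12345678')  # placeholder id
root.delete_on_termination = True
bdm['/dev/sda1'] = root

eph = BlockDeviceType()  # in boto, EBSBlockDeviceType is an alias of this class
eph.ephemeral_name = 'ephemeral0'
bdm['/dev/sdb'] = eph

params = {}
bdm.ec2_build_list_params(params)
for key in sorted(params):
    print key, '=', params[key]  # e.g. BlockDeviceMapping.1.DeviceName = /dev/sda1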
Example #2
File: _aws.py  Project: WUMUXIAN/flocker
    def create_node(self, name, distribution, metadata={}):
        size = self._default_size
        disk_size = 8

        with start_action(
            action_type=u"flocker:provision:aws:create_node",
            name=name,
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata["Name"] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap["/dev/sda1"] = disk1

            images = self._connection.get_all_images(filters={"name": IMAGE_NAMES[distribution]})
            # Retry several times; no sleep is needed between retries.
            instance = poll_until(
                lambda: self._get_node(images[0].id, size, diskmap, metadata), repeat(0, 10), lambda x: None
            )
            return AWSNode(name=name, _provisioner=self, _instance=instance, distribution=distribution)
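poll_until and _get_node are Flocker helpers that are not part of the snippet. A stand-in that matches how poll_until is called here (a predicate, an iterable of sleep delays, and a sleep function) might look like the following; Flocker's real implementation differs in details such as the exception raised.

import time

def poll_until(predicate, steps, sleep=time.sleep):
    # Call predicate until it returns a truthy value, sleeping for each
    # delay in `steps` between attempts; give up when steps is exhausted.
    for step in steps:
        result = predicate()
        if result:
            return result
        sleep(step)
    raise Exception('predicate never returned a truthy value')

In the call above, repeat(0, 10) supplies ten zero-second delays and lambda x: None disables sleeping altogether, so the node lookup is simply retried up to ten times back to back.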
Example #3
File: ec2.py  Project: chenleji/scalarizr
    def _register_image(self, snapshot_id):
        conn = self.platform.new_ec2_conn()
    
        instance_id = self.platform.get_instance_id()
        instance = conn.get_all_instances([instance_id])[0].instances[0]

        block_device_map = BlockDeviceMapping(conn)

        root_vol = EBSBlockDeviceType(snapshot_id=snapshot_id)
        root_vol.delete_on_termination = True
        # Adding ephemeral devices
        for eph, device in EPH_STORAGE_MAPPING[linux.os['arch']].items():
            bdt = EBSBlockDeviceType(conn)
            bdt.ephemeral_name = eph
            block_device_map[device] = bdt

        root_partition = instance.root_device_name[:-1]
        if root_partition in self.platform.get_block_device_mapping().values():
            block_device_map[root_partition] = root_vol
        else:
            block_device_map[instance.root_device_name] = root_vol

        return conn.register_image(
            name=self.image_name,
            root_device_name=instance.root_device_name,
            block_device_map=block_device_map,
            kernel_id=instance.kernel,
            virtualization_type=instance.virtualization_type,
            ramdisk_id=self.platform.get_ramdisk_id(),
            architecture=instance.architecture)
Example #4
    def register_ebs_ami(self, snapshot_id, arch='x86_64', default_ephem_map=True,
                         img_name=None, img_desc=None):
        # register against snapshot
        try:
            aki=PVGRUB_AKIS[self.region.name][arch]
        except KeyError:
            raise Exception("Unable to determine pvgrub hd00 AKI for region (%s) arch (%s)" % (self.region.name, arch))

        if not img_name:
            rand_id = random.randrange(2**32)
            # These names need to be unique, hence the pseudo-uuid
            img_name='EBSHelper AMI - %s - uuid-%x' % (snapshot_id, rand_id)
        if not img_desc:
            img_desc='Created directly from volume snapshot %s' % (snapshot_id)

        self.log.debug("Registering snapshot (%s) as new EBS AMI" % (snapshot_id))
        ebs = EBSBlockDeviceType()
        ebs.snapshot_id = snapshot_id
        ebs.delete_on_termination = True
        block_map = BlockDeviceMapping()
        block_map['/dev/sda'] = ebs
        # The ephemeral mappings are automatic with S3 images
        # For EBS images we need to make them explicit
        # These settings are required to make the same fstab work on both S3 and EBS images
        if default_ephem_map:
            e0 = EBSBlockDeviceType()
            e0.ephemeral_name = 'ephemeral0'
            e1 = EBSBlockDeviceType()
            e1.ephemeral_name = 'ephemeral1'
            block_map['/dev/sdb'] = e0
            block_map['/dev/sdc'] = e1
        result = self.conn.register_image(name=img_name, description=img_desc,
                           architecture=arch,  kernel_id=aki,
                           root_device_name='/dev/sda', block_device_map=block_map)
        return str(result)
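PVGRUB_AKIS is not shown in this excerpt; from the lookup it is evidently a region-then-architecture table of pvgrub hd00 kernel image IDs. A hypothetical sketch of its shape (the aki- values below are placeholders, not real AKIs):

PVGRUB_AKIS = {
    'us-east-1': {'i386': 'aki-00000001', 'x86_64': 'aki-00000002'},
    'eu-west-1': {'i386': 'aki-00000003', 'x86_64': 'aki-00000004'},
}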
Example #5
File: _aws.py  Project: zendad/flocker
    def create_node(self, name, distribution, metadata={}):
        size = self._default_size
        disk_size = 10

        with start_action(
                action_type=u"flocker:provision:aws:create_node",
                name=name,
                distribution=distribution,
                image_size=size,
                disk_size=disk_size,
                metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]}, )
            # Retry several times; no sleep is needed between retries.
            instance = poll_until(
                lambda: self._get_node(images[0].id, size, diskmap, metadata),
                repeat(0, 10), lambda x: None)
            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
Example #6
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config['ssh_port'])
         port = "-p {0} ".format(p) if p and not p == 22 else ''
         ## change user to 'root' for all non-Ubuntu systems
         user = self.config['sudouser'] if self.config['sudouser'] and self.config['ssh_import'] else 'ubuntu'
         #XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n".format(
             '-p {0} '.format(port) if port else '',
             user,
             self.instance.public_dns_name)
         msg3 = "To terminate: shaker-terminate {0}".format(
                    self.instance.id)
         LOG.info(msg2)
         LOG.info(msg3)
         print msg2
         print msg3
Example #7
 def launch_instance(self):
     if not self.verify_settings():
         return
     block_map = BlockDeviceMapping()
     root_device = self.config['ec2_root_device']
     block_map[root_device] = EBSBlockDeviceType()
     if self.config['ec2_size']:
         block_map[root_device].size = self.config['ec2_size']
     block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups']
         or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config['ssh_port'])
         port = "-p {0} ".format(p) if p and not p == 22 else ''
         ## change user to 'root' for all non-Ubuntu systems
         user = self.config['sudouser'] if self.config[
             'sudouser'] and self.config['ssh_import'] else 'ubuntu'
         #XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n".format(
             '-p {0} '.format(port) if port else '', user,
             self.instance.public_dns_name)
         msg3 = "To terminate: shaker-terminate {0}".format(
             self.instance.id)
         LOG.info(msg2)
         LOG.info(msg3)
         print msg2
         print msg3
Example #8
 def launch_instance(self):
     if not self.verify_settings():
         return
     block_map = BlockDeviceMapping()
     root_device = self.config["ec2_root_device"]
     block_map[root_device] = EBSBlockDeviceType()
     if self.config["ec2_size"]:
         block_map[root_device].size = self.config["ec2_size"]
     block_map[root_device].delete_on_termination = True
     for num, device_location in enumerate(self.config["ec2_ephemeral_devices"]):
         device = BlockDeviceType()
         device.ephemeral_name = "ephemeral%d" % num
         block_map[device_location] = device
     reservation = self.conn.run_instances(
         self.config["ec2_ami_id"],
         key_name=self.config["ec2_key_name"],
         security_groups=self.config["ec2_security_groups"] or [self.config["ec2_security_group"]],
         instance_type=self.config["ec2_instance_type"],
         placement=self.config["ec2_zone"],
         monitoring_enabled=self.config["ec2_monitoring_enabled"],
         block_device_map=block_map,
         user_data=self.user_data,
     )
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == "running":
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance %s failed after %d seconds" % (self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config["hostname"]:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config["ssh_port"])
         port = "-p {0} ".format(p) if p and not p == 22 else ""
         ## change user to 'root' for all non-Ubuntu systems
         user = self.config["sudouser"] if self.config["sudouser"] and self.config["ssh_import"] else "ubuntu"
         # XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n" "To terminate: shaker-terminate {3}".format(
             port, user, self.instance.public_dns_name, self.instance.id
         )
         LOG.info(msg2)
         print msg2
Example #9
File: __init__.py  Project: tmcclure/shaker
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     opts = {
         'key_name': self.config['ec2_key_name'],
         'security_groups': self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         'instance_type': self.config['ec2_instance_type'],
         'placement': self.config['ec2_zone'],
         'placement_group': self.config['ec2_placement_group'],
         'monitoring_enabled': self.config['ec2_monitoring_enabled'],
         'block_device_map': block_map,
         'user_data': self.user_data
     }
      if self.config.get('ec2_subnet_id', False):
         # when providing subnet_id, must use security_group_ids and not
         # named security_groups or API call will fail.
          opts.pop('security_groups', None)
         opts['security_group_ids'] = self.config['ec2_security_group_ids'] or [self.config['ec2_security_group_id']]
         if not opts['security_group_ids']:
             raise AssertionError('Must specify ec2_security_group_id or ec2_security_group_ids with subnet_id')
         opts['subnet_id'] = self.config['ec2_subnet_id']
     reservation = self.conn.run_instances(self.config['ec2_ami_id'], **opts)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #10
def get_block_device(instance_type, ebs_vol_size):
    block_map = BlockDeviceMapping()

    if ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = ebs_vol_size
        device.delete_on_termination = True
        block_map['/dev/sdv'] = device

    for i in range(get_num_disks(instance_type)):
        dev = BlockDeviceType()
        dev.ephemeral_name = 'ephemeral%d' % i
        # The first ephemeral drive is /dev/sdb.
        name = '/dev/sd' + string.ascii_letters[i + 1]
        block_map[name] = dev

    return block_map
Example #11
def get_block_device(instance_type, ebs_vol_size):
    block_map = BlockDeviceMapping()

    if ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    for i in range(get_num_disks(instance_type)):
        dev = BlockDeviceType()
        dev.ephemeral_name = 'ephemeral%d' % i
        # The first ephemeral drive is /dev/sdb.
        name = '/dev/sd' + string.ascii_letters[i + 1]
        block_map[name] = dev

    return block_map
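A hedged usage sketch for get_block_device. get_num_disks is the surrounding project's helper, so a toy stand-in with illustrative disk counts is included here to make the sketch self-contained.

import string
from boto.ec2.blockdevicemapping import (
    BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType)

def get_num_disks(instance_type):
    # Stand-in for the real helper; the counts below are illustrative only.
    return {'m1.small': 1, 'm1.large': 2, 'c1.xlarge': 4}.get(instance_type, 1)

block_map = get_block_device('m1.large', ebs_vol_size=50)
for name in sorted(block_map):
    dev = block_map[name]
    print name, dev.ephemeral_name or 'EBS %dGB' % dev.size
# /dev/sdb ephemeral0
# /dev/sdc ephemeral1
# /dev/sdv EBS 50GB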
Example #12
    def register_ebs_ami(self, snapshot_id, arch="x86_64", default_ephem_map=True, img_name=None, img_desc=None):
        # register against snapshot
        try:
            aki = PVGRUB_AKIS[self.region.name][arch]
        except KeyError:
            raise Exception("Unable to find pvgrub hd00 AKI for %s, arch (%s)" % (self.region.name, arch))
        if not img_name:
            rand_id = random.randrange(2 ** 32)
            # These names need to be unique, hence the pseudo-uuid
            img_name = "EBSHelper AMI - %s - uuid-%x" % (snapshot_id, rand_id)
        if not img_desc:
            img_desc = "Created directly from volume snapshot %s" % snapshot_id

        self.log.debug("Registering %s as new EBS AMI" % snapshot_id)
        self.create_sgroup("ec2helper-vnc-ssh-%x" % random.randrange(2 ** 32), allow_vnc=True)
        ebs = EBSBlockDeviceType()
        ebs.snapshot_id = snapshot_id
        ebs.delete_on_termination = True
        block_map = BlockDeviceMapping()
        block_map["/dev/sda"] = ebs
        # The ephemeral mappings are automatic with S3 images
        # For EBS images we need to make them explicit
        # These settings are required to make the same fstab work on both S3
        # and EBS images
        if default_ephem_map:
            e0 = EBSBlockDeviceType()
            e0.ephemeral_name = "ephemeral0"
            e1 = EBSBlockDeviceType()
            e1.ephemeral_name = "ephemeral1"
            block_map["/dev/sdb"] = e0
            block_map["/dev/sdc"] = e1
        result = self.conn.register_image(
            name=img_name,
            description=img_desc,
            architecture=arch,
            kernel_id=aki,
            root_device_name="/dev/sda",
            block_device_map=block_map,
        )
        sleep(10)
        new_amis = self.conn.get_all_images([result])
        new_amis[0].add_tag("Name", resource_tag)

        return str(result)
Example #13
File: __init__.py  Project: kvbik/shaker
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(
         self.config['ec2_ami_id'],
         filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups']
         or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #14
 def parse_block_device_args(self, block_device_maps_args):
     block_device_map = BlockDeviceMapping()
     for block_device_map_arg in block_device_maps_args:
         parts = block_device_map_arg.split('=')
         if len(parts) > 1:
             device_name = parts[0]
             block_dev_type = EBSBlockDeviceType()
             value_parts = parts[1].split(':')
             if value_parts[0].startswith('snap'):
                 block_dev_type.snapshot_id = value_parts[0]
             else:
                 if value_parts[0].startswith('ephemeral'):
                     block_dev_type.ephemeral_name = value_parts[0]
             if len(value_parts) > 1:
                 block_dev_type.size = int(value_parts[1])
             if len(value_parts) > 2:
                 if value_parts[2] == 'true':
                     block_dev_type.delete_on_termination = True
             block_device_map[device_name] = block_dev_type
     return block_device_map
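Judging from the parsing logic above, the accepted argument syntax is device=(snapshot-id|ephemeral-name)[:size[:delete-on-termination]]. A usage sketch with made-up values, called on the owning object:

args = ['/dev/sda1=snap-0a1b2c3d:80:true', '/dev/sdb=ephemeral0']
bdm = self.parse_block_device_args(args)
assert bdm['/dev/sda1'].snapshot_id == 'snap-0a1b2c3d'
assert bdm['/dev/sda1'].size == 80
assert bdm['/dev/sda1'].delete_on_termination is True
assert bdm['/dev/sdb'].ephemeral_name == 'ephemeral0'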
Example #15
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #16
def build(hosts, cred, dry, inventory='hosts'):
    hret = {}
    old_state = {}
    con = None
    for h in hosts:
        logger.info("    Run action on host [%s]" % (h))
        hret[h] = {}
        hv = {}
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        if con is None:
            con = _connect(hvars['region'], cred)
        reservations = con.get_all_reservations(filters={"tag:Name": h})
        old_state[h] = "absent"
        for reservation in reservations:
            instance = reservation.instances[0]
            if instance.state != 'terminated':
                hret[h]['instance'] = instance
                old_state[h] = "present"
                logger.info("      Server [%s] is already present" % (h))

        if old_state[h] == 'present':
            continue

        bdm = None
        if 'disk_size' in hvars:
            try:
                dev_sda1 = EBSBlockDeviceType()
                dev_sda1.size = hvars['disk_size']
                dev_sda1.delete_on_termination = True
                bdm = BlockDeviceMapping()
                bdm['/dev/sda1'] = dev_sda1
            except Exception as e:
                logger.error("Error building block device for server: %s" % (e))
                exit(1)

        try:
            reservation = con.run_instances(
                hvars['ami'],
                key_name=hvars['key'],
                instance_type=hvars['vmtype'],
                security_group_ids=[hvars['security']],
                subnet_id=hvars['subnet'],
                block_device_map=bdm,
                dry_run=dry
            )
            hret[h]['instance'] = reservation.instances[0]
        except Exception as e:
            logger.error("Error building server: %s" % (e))
            exit(1)

    for h in hosts:
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        instance = hret[h]['instance']
        status = instance.update()
        if old_state[h] == 'absent':
            logger.info("        Waiting for [%s] to be launched..." % (h))
            while status == 'pending':
                time.sleep(5)
                status = instance.update()

        if old_state[h] == 'present':
            logger.info("        State is running with IP [%s]" % (instance.private_ip_address))
        elif status == 'running':
            logger.info("        State changed to running with IP [%s]" % (instance.private_ip_address))
        else:
            logger.error("        Status of [%s] is [%s]" % (h, status))

        instance.add_tag("Name", "%s" % (h))
        for cur_tag in hvars['tags']:
            instance.add_tag(cur_tag, hvars['tags'][cur_tag])

        if 'extra_disks' in hvars and old_state[h] == 'absent':
            try:
                for cur_disk in hvars['extra_disks']:
                    cur_vol = con.create_volume(cur_disk['size'], instance.placement)
                    status = cur_vol.status
                    while status != 'available':
                        logger.info("          Waiting for volume [%s] to be launched..." % (cur_vol))
                        time.sleep(10)
                        status = cur_vol.update()
                    con.attach_volume(cur_vol.id, instance.id, '/dev/' + cur_disk['device'])
            except Exception as e:
                logger.error("Error Attaching new disks: %s" % (e))
                exit(1)

        instance_volumes = con.get_all_volumes(filters={'attachment.instance-id': instance.id})
        for counter, cur_vol in enumerate(instance_volumes):
            cur_vol.add_tag("Name", "%s_disk%d" % (h.split('.')[0], counter))

        hret[h]['private_ip_address'] = instance.private_ip_address
        # If requested, associate a new elastic IP with the host and create a security group to whitelist external IPs
        if 'assosiate_eip' in hvars and hvars['assosiate_eip'] is True:
            if instance.ip_address is None:
                eip = con.allocate_address()
                con.associate_address(instance.id, eip.public_ip)
                logger.info("          Adding public IP [%s]" % (eip.public_ip))
                hret[h]['public_ip_address'] = eip.public_ip
            if 'whitelisted_ips' in hvars:
                logger.info("          Whitelisting IPs [%s]" % (hvars['whitelisted_ips']))
                ips = hvars['whitelisted_ips'].split(',')
                project = hvars['tags']['Project']
                security = hvars['security']
                _create_security_group(con, instance, project, ips, security)
    return hret
Example #17
File: fabfile.py  Project: Enaith/oggm
def node_install(cn=def_cn,inst_type_idx=def_inst_type,idn=0,
        avz=def_default_avz,rt=def_default_requesttype,
        group_name='oggmssh',
        ssh_port=22,
        cidr='0.0.0.0/0'):
    """
    Request and prepare single instance
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1],profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name()
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(group_name, 'A group that allows SSH access')
        else:
            raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node "+str(idn))
    print('Reserving instance for node', aminfo.id, instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        print("placing node in ",avz)
        requests = cloud.request_spot_instances(def_price,
                      def_ami[avz[:-1]],
                      count=1,
                      type='one-time',
                      security_groups=[group_name],
                      key_name=key_name,
                      placement=avz,
                      instance_type=instance_infos[inst_type_idx]['type'],
                      block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud,req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node "+str(idn))
    else:
        print("placing node in ",avz)
        reservation = cloud.run_instances(image_id=def_ami[avz[:-1]],
                key_name=key_name,
                placement = avz,
                security_groups=[group_name],
                instance_type=instance_infos[inst_type_idx]['type'],
                block_device_map= bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node "+str(idn))

    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)

    log_with_ts("booted node "+str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn+'_node'+str(idn))
    node.add_tag('type', cn+'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', { '/dev/sda1' : True })

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node "+str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node "+str(idn))
Example #18
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    # master_group = get_or_make_group(conn, cluster_name)
    # slave_group = get_or_make_group(conn, cluster_name)
    # zoo_group = get_or_make_group(conn, cluster_name)

    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" %
                    (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)

    if opts.ami == "std":
        try:
            opts.ami = urllib2.urlopen(STD_AMI_URL).read().strip()
            print "GraphLab AMI for Standard Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + STD_AMI_URL
    elif opts.ami == "hpc":
        try:
            opts.ami = urllib2.urlopen(HVM_AMI_URL).read().strip()
            print "GraphLab AMI for HPC Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + HVM_AMI_URL

    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active,
                                                                   opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id

    # # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
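is_active and get_or_make_group are defined elsewhere in the script. A plausible stand-in for is_active, counting every state other than shutting-down/terminated as occupying the group:

def is_active(instance):
    # Pending, running, stopping and stopped instances all still hold
    # (or may reclaim) capacity in the security group.
    return instance.state in ('pending', 'running', 'stopping', 'stopped')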
Example #19
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, "strata-master")
    slave_group = get_or_make_group(conn, "strata-slaves")
    zoo_group = get_or_make_group(conn, "strata-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 80, 80, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    active_nodes = get_existing_cluster(conn,
                                        opts,
                                        cluster_name,
                                        die_on_error=False)
    if any(active_nodes):
        print >> stderr, (
            "ERROR: There are already instances running in " +
            "group %s, %s or %s" %
            (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)

    # Figure out the latest AMI from our static URL
    if opts.ami == "latest":
        try:
            opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
            print "Latest Spark AMI: " + opts.ami
        except:
            print >> stderr, "Could not read " + LATEST_AMI_URL
            sys.exit(1)

    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes,
             zoo_nodes) = get_existing_cluster(conn,
                                               opts,
                                               cluster_name,
                                               die_on_error=False)
            running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    if opts.zone == 'all':
        opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Create the right tags
    tags = {}
    tags['cluster'] = cluster_name

    tags['type'] = 'slave'
    for node in slave_nodes:
        conn.create_tags([node.id], tags)

    tags['type'] = 'master'
    for node in master_nodes:
        conn.create_tags([node.id], tags)

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
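get_zones and get_partition come from the surrounding script. A stand-in for get_partition that spreads opts.slaves across the zones the way the loop assumes (an even split, with earlier zones absorbing the remainder):

def get_partition(total, num_partitions, current_partition):
    # Even split; the first (total % num_partitions) partitions get one extra.
    num_slaves_this_zone = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_slaves_this_zone += 1
    return num_slaves_this_zone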
Example #20
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(key='Name',
                       value='{cn}-master-{iid}'.format(cn=cluster_name,
                                                        iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(key='Name',
                      value='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                      iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
Example #21
    def create_node(self,
                    name,
                    distribution,
                    size=None,
                    disk_size=8,
                    metadata={}):
        if size is None:
            size = self._default_size

        with start_action(
                action_type=u"flocker:provision:aws:create_node",
                name=name,
                distribution=distribution,
                image_size=size,
                disk_size=disk_size,
                metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]})

            with start_action(
                    action_type=u"flocker:provision:aws:create_node:run_instances",
            ) as context:
                reservation = self._connection.run_instances(
                    images[0].id,
                    key_name=self._keyname,
                    instance_type=size,
                    security_groups=self._security_groups,
                    block_device_map=diskmap,
                    placement=self._zone,
                    # On some operating systems, a tty is required for sudo.
                    # Since AWS systems have a non-root user as the login,
                    # disable this requirement so we can use sudo with conch.
                    user_data=dedent("""\
                        #!/bin/sh
                        sed -i '/Defaults *requiretty/d' /etc/sudoers
                        """),
                )

                instance = reservation.instances[0]
                context.add_success_fields(instance_id=instance.id)

            self._connection.create_tags([instance.id], metadata)

            # Display state as instance starts up, to keep user informed that
            # things are happening.
            _wait_until_running(instance)

            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
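The _wait_until_running helper called above is not included in this excerpt. A minimal sketch of what it plausibly does, assuming boto's Instance.update() polling API; this is an illustration, not the project's actual implementation:

import time

def _wait_until_running(instance, poll_interval=5):
    # Poll EC2 until the instance reports the 'running' state.
    # instance.update() refreshes the object and returns the state string.
    while instance.update() != 'running':
        print("Instance %s is %s; waiting..." % (instance.id, instance.state))
        time.sleep(poll_interval)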
Example #22
def launch_cluster(conn, opts, num_nodes, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    print("Setting up security groups...")

    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves:
        print("ERROR: There are already instances running in group %s" %
              slave_group.name, file=stderr)
        sys.exit(1)

    if opts.ami is None:
        print("ERROR: AMI is not set, exit")
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (num_nodes, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == num_nodes:
                    print("All %d spot instances granted" % (num_nodes + 1))
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slave spot instances granted, waiting longer" % (
                            len(active_instance_ids), num_nodes))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            slave_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        print ("WARNING: --spot-price was not set; consider launch slaves as spot instances to save money")
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1

    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return slave_nodes
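get_or_make_group is used throughout these launch scripts but never shown. A sketch of the usual look-up-or-create pattern with boto's security-group API; the description string is a placeholder, and the real helper may differ:

def get_or_make_group(conn, name, vpc_id=None):
    # Return the security group with the given name, creating it if absent.
    groups = [g for g in conn.get_all_security_groups() if g.name == name]
    if groups:
        return groups[0]
    print("Creating security group " + name)
    return conn.create_security_group(name, "Group " + name, vpc_id=vpc_id)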
Example #23
def node_install(cn=def_cn,
                 inst_type_idx=def_inst_type,
                 idn=0,
                 avz=def_default_avz,
                 rt=def_default_requesttype,
                 group_name='oggmssh',
                 ssh_port=22,
                 cidr='0.0.0.0/0'):
    """
    Request and prepare a single instance.
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    vpcconn = VPCConnection(region=cloud.region)

    try:
        vpc_id, subnet_id = def_subnet[avz]
        vpc = vpcconn.get_all_vpcs(vpc_ids=[vpc_id])[0]
    except:
        vpc_id = None
        subnet_id = None
        vpc = None

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name(avz[:-1])
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(
                group_name, 'A group that allows SSH access')
        else:
            raise

    # Authorize all Intra-VPC traffic
    if vpc is not None:
        try:
            group.authorize('-1', -1, -1, vpc.cidr_block)
        except cloud.ResponseError as e:
            if e.code != 'InvalidPermission.Duplicate':
                raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(
            def_price,
            def_ami[avz[:-1]],
            count=1,
            type='one-time',
            security_group_ids=[group.id],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        print("placing node in ", avz)
        reservation = cloud.run_instances(
            image_id=def_ami[avz[:-1]],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            security_group_ids=[group.id],
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))

    time.sleep(2)
    while node.update() != 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)

    log_with_ts("booted node " + str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node " + str(idn))
Example #24
def create_ami(region,
               snap_id,
               force=None,
               root_dev='/dev/sda1',
               zone_name=None,
               default_arch=None,
               default_type='t1.micro',
               security_groups=''):
    """
    Creates an AMI image from the given snapshot.

    The force option suppresses the confirmation prompt and launches a
    new instance from the created AMI image.

    region, snap_id
        specify the snapshot to be processed. The snapshot description,
        in JSON format, is used to restore an instance with the same
        parameters. Snapshots taken for the same instance at nearly the
        same time (within 10 minutes) but for other devices (/dev/sdb,
        /dev/sdc, etc.) are processed automatically;
    force
        run an instance from the AMI after creation without
        confirmation. To enable, set the value to "RUN";
    default_arch
        architecture to use if not mentioned in the snapshot
        description;
    default_type
        instance type to use if not mentioned in the snapshot
        description. Used only if ``force`` is "RUN";
    security_groups
        list of AWS Security Group names as a single string separated
        by semicolons ';'. Used only if ``force`` is "RUN".
    """
    conn = get_region_conn(region)
    snap = conn.get_all_snapshots(snapshot_ids=[snap_id])[0]
    instance_id = get_snap_instance(snap)
    _device = get_snap_device(snap)
    snaps = conn.get_all_snapshots(owner='self')
    snapshots = [
        snp for snp in snaps if get_snap_instance(snp) == instance_id
        and get_snap_device(snp) != _device and
        abs(get_snap_time(snap) - get_snap_time(snp)) <= timedelta(minutes=10)
    ]
    snapshot = sorted(snapshots, key=get_snap_time,
                      reverse=True) if snapshots else None
    # setup for building an EBS boot snapshot
    default_arch = default_arch or config.get('DEFAULT', 'ARCHITECTURE')
    arch = get_descr_attr(snap, 'Arch') or default_arch
    kernel = config.get(conn.region.name, 'KERNEL' + arch.upper())
    dev = re.match(r'^/dev/sda$', _device)  # /dev/sda (not sda1) means an encrypted instance
    if dev:
        kernel = config.get(conn.region.name, 'KERNEL_ENCR_' + arch.upper())
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snap_id
    ebs.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map[_device] = ebs
    sdb = BlockDeviceType()
    sdb.ephemeral_name = 'ephemeral0'
    block_map['/dev/sdb'] = sdb

    if snapshot:
        for s in snapshot:
            s_dev = get_snap_device(s)
            s_ebs = EBSBlockDeviceType()
            s_ebs.delete_on_termination = True
            s_ebs.snapshot_id = s.id
            block_map[s_dev] = s_ebs

    name = 'Created {0} using access key {1}'.format(timestamp(),
                                                     conn.access_key)
    name = name.replace(":", ".").replace(" ", "_")

    # create the new AMI with all options taken from the snapshot's JSON description:
    wait_for(snap, '100%', limit=SNAP_TIME)
    result = conn.register_image(
        name=name,
        description=snap.description,
        architecture=get_descr_attr(snap, 'Arch') or default_arch,
        root_device_name=get_descr_attr(snap, 'Root_dev_name') or root_dev,
        block_device_map=block_map,
        kernel_id=kernel)
    sleep(2)
    image = conn.get_all_images(image_ids=[result])[0]
    wait_for(image, 'available', limit=10 * 60)
    add_tags(image, snap.tags)

    logger.info('The new AMI ID = {0}'.format(result))

    info = ('\nEnter RUN if you want to launch instance using '
            'just created {0}: '.format(image))
    new_instance = None
    if force == 'RUN' or raw_input(info).strip() == 'RUN':
        instance_type = get_descr_attr(snap, 'Type') or default_type
        new_instance = launch_instance_from_ami(
            region,
            image.id,
            inst_type=instance_type,
            security_groups=security_groups,
            zone_name=zone_name)
    return image, new_instance
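A hypothetical invocation of the create_ami helper above; the region, snapshot id, and security group names are placeholders, and force='RUN' skips the interactive prompt:

# Hypothetical usage: rebuild an AMI from a snapshot and immediately
# launch an instance from it. All argument values are placeholders.
image, instance = create_ami(
    'us-east-1',
    'snap-0123456789abcdef0',
    force='RUN',
    default_type='t1.micro',
    security_groups='default;ssh-only')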
Example #25
def launch_cluster(conn, opts, cluster_name):
    template_vars = {
        'cluster_name': cluster_name,
        'master_security_group': cluster_name + "-master",
        'slave_security_group': cluster_name + "-slaves",
        'discovery_security_group': cluster_name + "-discovery"
    }

    if opts.copy_aws_credentials:
        if opts.deploy_aws_key_id:
            template_vars['aws_key'] = opts.deploy_aws_key_id
        else:
            template_vars['aws_key'] = opts.aws_access_key_id

        if opts.deploy_aws_key_secret:
            template_vars['aws_secret'] = opts.deploy_aws_key_secret
        else:
            template_vars['aws_secret'] = opts.aws_secret_access_key

    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, template_vars['master_security_group'], opts.vpc_id)
    slave_group = get_or_make_group(conn, template_vars['slave_security_group'], opts.vpc_id)
    discovery_group = get_or_make_group(conn, template_vars['discovery_security_group'], opts.vpc_id)
    authorized_address = opts.authorized_address

    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize(src_group=discovery_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
        master_group.authorize('tcp', 22, 22, authorized_address)

    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize(src_group=discovery_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    if discovery_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            discovery_group.authorize(src_group=master_group)
            discovery_group.authorize(src_group=slave_group)
            discovery_group.authorize(src_group=discovery_group)
        else:
            discovery_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                      src_group=discovery_group)
            discovery_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                      src_group=discovery_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        all_groups = conn.get_all_security_groups()
        additional_group_ids = []
        for group in opts.additional_security_group.split(','):
            additional_group_ids += [sg.id for sg in all_groups if group in (sg.name, sg.id)]

    template_vars['security_groups'] = template_vars['discovery_security_group']

    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        ebs_devices = []
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device_id = "/dev/sd" + chr(ord('s') + i)
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map[device_id] = device
            # list += string would splice in individual characters;
            # append keeps the whole device name intact
            ebs_devices.append(device_id)
        template_vars['ebs_devices'] = ' '.join(ebs_devices)

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        local_devices = []
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev
            # append, not +=, so the device name is added as one element
            local_devices.append(name)
        template_vars['local_devices'] = ' '.join(local_devices)

    master_user_data_content = get_user_data(opts.master_user_data, template_vars)
    slave_user_data_content = get_user_data(opts.slave_user_data, template_vars)

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=slave_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id, discovery_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=slave_user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name

        if opts.spot_price is not None:
            # Launch spot instance with the requested price
            print("Requesting master as spot instance with price $%.3f" % (opts.spot_price))
            master_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    key_name=opts.key_pair,
                    launch_group="master-group-%s" % cluster_name,
                    security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                    instance_type=master_type,
                    placement=opts.zone,
                    count=1,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=master_user_data_content,
                    instance_profile_name=opts.instance_profile_name)
            master_req_id = master_reqs[0].id

            print("Waiting for spot instances to be granted...")
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests()
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    master_instance_ids = []
                    if master_req_id in id_to_req and id_to_req[master_req_id].state == "active":
                        master_instance_ids.append(id_to_req[master_req_id].instance_id)
                        print("Master granted")
                        reservations = conn.get_all_reservations(master_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print("Master not granted yet, waiting longer")
            except:
                print("Canceling spot instance request for master")
                conn.cancel_spot_instance_requests([master_req_id])
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id, discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                instance_profile_name=opts.instance_profile_name)

            master_nodes = master_res.instances
            print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
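get_user_data(path, template_vars) above renders a user-data template with the cluster-specific variables collected in template_vars, but the helper itself is not shown. A sketch assuming a simple placeholder-substitution scheme; the real template syntax may differ:

def get_user_data(path, template_vars):
    # Read the user-data template and substitute {{name}} placeholders.
    # Returns an empty string when no template file was given.
    if not path:
        return ''
    with open(path) as f:
        content = f.read()
    for key, value in template_vars.items():
        content = content.replace('{{%s}}' % key, str(value))
    return content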
Example #26
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    else:
        master_group = get_or_make_group(
            conn, opts.security_group_prefix + "-master")
        slave_group = get_or_make_group(conn,
                                        opts.security_group_prefix + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running with the cluster name
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances for name: %s " %
                          cluster_name)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [
            sg for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                outstanding_request_ids = []
                for i in my_req_ids:
                    if i in id_to_req:
                        if id_to_req[i].state == "active":
                            active_instance_ids.append(
                                id_to_req[i].instance_id)
                        else:
                            outstanding_request_ids.append(i)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer for request ids including %s" % (
                        len(active_instance_ids), opts.slaves,
                        outstanding_request_ids[0:10])
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] +
                                      additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] +
                               additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    # Retry tagging with the Name key, since the tag is used to identify a cluster.
    for master in master_nodes:
        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
        for i in range(5):
            try:
                master.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                # the last attempt inside range(5) is i == 4
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    for slave in slave_nodes:
        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
        for i in range(5):
            try:
                slave.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    # Return all the instances
    return (master_nodes, slave_nodes)
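The two tagging loops above are near-duplicates; they could be factored into one helper. A sketch with the same behavior (five attempts, five-second pauses, raising after the final failure):

import time

def tag_with_retries(instance, name, attempts=5, delay=5):
    # Try to set the Name tag, retrying on transient EC2 errors.
    for i in range(attempts):
        try:
            instance.add_tag(key='Name', value=name)
            return
        except Exception:
            print("Failed attempt %i of %i to tag %s" % (i + 1, attempts, name))
            if i == attempts - 1:
                raise Exception("Error - failed max attempts to add name tag")
            time.sleep(delay)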
Example #27
File: spark_ec2.py Project: AI-Org/spark
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [sg
                             for sg in conn.get_all_security_groups()
                             if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] + additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] + additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
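get_zones(conn, opts) decides where slaves land in every example above. A sketch of its likely behavior, assuming the convention that --zone=all means one batch of slaves per availability zone; not necessarily the script's exact code:

def get_zones(conn, opts):
    # With --zone=all, spread slaves across every zone in the region;
    # otherwise launch everything in the single requested zone.
    if opts.zone == 'all':
        return [z.name for z in conn.get_all_zones()]
    return [opts.zone]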
Example #28
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        # Rstudio (GUI for R) needs port 8787 for web access
        master_group.authorize('tcp', 8787, 8787, authorized_address)
        # HDFS NFS gateway requires 111,2049,4242 for tcp & udp
        master_group.authorize('tcp', 111, 111, authorized_address)
        master_group.authorize('udp', 111, 111, authorized_address)
        master_group.authorize('tcp', 2049, 2049, authorized_address)
        master_group.authorize('udp', 2049, 2049, authorized_address)
        master_group.authorize('tcp', 4242, 4242, authorized_address)
        master_group.authorize('udp', 4242, 4242, authorized_address)
        # RM in YARN mode uses 8088
        master_group.authorize('tcp', 8088, 8088, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)
        # Kylix
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=slave_group)
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                              src_group=master_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                              src_group=master_group)
        master_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        master_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev
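            # e.g. for an instance type with two ephemeral disks (hypothetical),
            # this maps ephemeral0 -> /dev/sdb and ephemeral1 -> /dev/sdc.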

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)

        master_nodes = master_res.instances
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )
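    # Illustration (not in the original): opts.additional_tags = "owner:alice,env:ci"
    # parses to {'owner': 'alice', 'env': 'ci'}; the maxsplit of 1 keeps any
    # colons inside tag values intact.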

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
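The examples above and below lean on a few helpers that this listing does not
show. get_partition spreads opts.slaves across the available zones; a minimal
sketch consistent with how it is called (an even split, with the remainder
going to the first zones):

def get_partition(total, num_partitions, current_partition):
    # Base share per zone, plus one extra for the first (total % num_partitions) zones.
    num_slaves_this_zone = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_slaves_this_zone += 1
    return num_slaves_this_zone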
Example #29
File: mesos_ec2.py Project: Jay-Zeng/mesos
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" %
                    (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    slave_res = image.run(key_name=opts.key_pair,
                          security_groups=[slave_group],
                          instance_type=opts.instance_type,
                          placement=opts.zone,
                          min_count=opts.slaves,
                          max_count=opts.slaves,
                          block_device_map=block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=opts.ft,
                           max_count=opts.ft,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = image.run(key_name=opts.key_pair,
                            security_groups=[zoo_group],
                            instance_type=opts.instance_type,
                            placement=opts.zone,
                            min_count=3,
                            max_count=3,
                            block_device_map=block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
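get_or_make_group and is_active are likewise assumed by these examples; a
plausible boto 2 sketch (reuse a security group by name, else create it, and
treat anything not shut down as active):

def get_or_make_group(conn, name, vpc_id=None):
    groups = [g for g in conn.get_all_security_groups() if g.name == name]
    if groups:
        return groups[0]
    print "Creating security group " + name
    return conn.create_security_group(name, "Cluster group", vpc_id)

def is_active(instance):
    return instance.state in ['pending', 'running', 'stopping', 'stopped']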
Example #30
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    else:
        master_group = get_or_make_group(conn, opts.security_group_prefix + "-master")
        slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running with the cluster name
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                outstanding_request_ids = []
                for i in my_req_ids:
                    if i in id_to_req:
                        if id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                        else:
                            outstanding_request_ids.append(i)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer for request ids including %s" % (
                        len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10])
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    # TODO: Add retry logic for tagging with name since it's used to identify a cluster.
    for master in master_nodes:
        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
        for i in range(0, 5):
            try:
                master.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)


    for slave in slave_nodes:
        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
        for i in range(0, 5):
            try:
                slave.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    # Return all the instances
    return (master_nodes, slave_nodes)
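get_zones, used by every spot and non-spot launch loop here, is also not
shown; a minimal sketch matching its call sites (expand 'all' to every zone
in the region, otherwise honor the single requested zone):

def get_zones(conn, opts):
    if opts.zone == 'all':
        return [z.name for z in conn.get_all_zones()]
    return [opts.zone]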
Example #31
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    slave_group.owner_id = os.getenv('EC2_USER_ID')
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    zoo_group.owner_id = os.getenv('EC2_USER_ID')

    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves, existing_zoos = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " +
            "group %s, %s or %s" %
            (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    logging.debug("Calling boto BlockDeviceMapping()...")
    block_map = BlockDeviceMapping()
    logging.debug(" Printing block_map..")
    #print block_map
    if opts.ebs_vol_size > 0:
        logging.debug("Calling boto EBSBlockDeviceType()...")
        device = EBSBlockDeviceType()
        #print "device: ", device
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        device.ephemeral_name = "ephemeral0"
        #block_map["/dev/sdv"] = device
        #block_map["/dev/sdv"] = device
        block_map["/dev/vdb"] = device

    if opts.user_data_file != None:
        user_data_file = open(opts.user_data_file)
        try:
            opts.user_data = user_data_file.read()
            #print "user data (encoded) = ", opts.user_data
        finally:
            user_data_file.close()

    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
        num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
        if num_slaves_this_zone > 0:
            slave_res = image.run(key_name=opts.key_pair,
                                  security_groups=[slave_group],
                                  instance_type=opts.instance_type,
                                  placement=zone,
                                  min_count=num_slaves_this_zone,
                                  max_count=num_slaves_this_zone,
                                  block_device_map=block_map,
                                  user_data=opts.user_data)
            slave_nodes += slave_res.instances
            print "Launched %d slaves in %s, regid = %s" % (
                num_slaves_this_zone, zone, slave_res.id)
        i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=opts.user_data)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Launch ZooKeeper nodes if required
    if int(opts.ft) > 1:
        print "Running " + opts.ft + " zookeepers"
        zoo_res = image.run(key_name=opts.key_pair,
                            security_groups=[zoo_group],
                            instance_type=opts.instance_type,
                            placement=opts.zone,
                            min_count=3,
                            max_count=3,
                            block_device_map=block_map,
                            user_data=opts.user_data)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
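The spot-request wait loops repeated throughout these examples share one
shape: poll until every request id is active, and cancel everything if the
wait is interrupted. Distilled into a helper (a sketch, assuming a boto 2
connection and py2 prints like the surrounding code):

def wait_for_spot_fulfillment(conn, request_ids, expected_count):
    while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests(request_ids)
        active = [r.instance_id for r in reqs if r.state == "active"]
        if len(active) == expected_count:
            return active  # instance ids of the fulfilled requests
        print "%d of %d requests active, waiting longer" % (len(active), expected_count)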
Example #32
File: _aws.py Project: Andrew8305/flocker
    def create_nodes(self, reactor, names, distribution, metadata={}):
        """
        Create nodes with the given names.

        :param reactor: The reactor.
        :param name: The names of the nodes.
        :type name: list of str
        :param str distribution: The name of the distribution to
            install on the nodes.
        :param dict metadata: Metadata to associate with the nodes.

        :return: A list of ``Deferred``s each firing with an INode
            when the corresponding node is created.   The list has
            the same order as :param:`names`.
        """
        size = self._default_size
        disk_size = 8

        action = start_action(
            action_type=u"flocker:provision:aws:create_nodes",
            instance_count=len(names),
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        )
        with action.context():
            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]},
            )

            instances = self._run_nodes(
                count=len(names),
                image_id=images[0].id,
                size=size,
                diskmap=diskmap
            )

            def make_node(ignored, name, instance):
                return AWSNode(
                    name=name,
                    _provisioner=self,
                    _instance=instance,
                    distribution=distribution,
                )

            results = []
            for name, instance in izip_longest(names, instances):
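                # izip_longest pads with None when _run_nodes returned fewer
                # instances than names were requested, so every missing node
                # still yields a failed Deferred below.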
                if instance is None:
                    results.append(fail(Exception("Could not run instance")))
                else:
                    node_metadata = metadata.copy()
                    node_metadata['Name'] = name
                    d = self._async_get_node(reactor, instance, node_metadata)
                    d = DeferredContext(d)
                    d.addCallback(make_node, name, instance)
                    results.append(d.result)
            action_completion = DeferredContext(DeferredList(results))
            action_completion.addActionFinish()
            # Individual results and errors should be consumed by the caller,
            # so we can leave action_completion alone now.
            return results
Example #33
    def _launch_wait_snapshot(
        self,
        ami,
        user_data,
        img_size=10,
        inst_type="m1.small",
        img_name=None,
        img_desc=None,
        remote_access_command=None,
    ):
        ebs_root = EBSBlockDeviceType()
        ebs_root.size = img_size
        ebs_root.delete_on_termination = True
        block_map = BlockDeviceMapping()
        block_map["/dev/sda"] = ebs_root
        sgroup_name = "ec2helper-ssh-%x" % random.randrange(2 ** 32)
        self.create_sgroup(sgroup_name)

        # Now launch it
        self.log.debug("Starting %s in %s with as %s" % (ami, self.region.name, inst_type))
        reservation = self.conn.run_instances(
            ami,
            max_count=1,
            instance_type=inst_type,
            user_data=user_data,
            security_groups=[sgroup_name],
            block_device_map=block_map,
        )
        if len(reservation.instances) == 0:
            raise Exception("Attempt to start instance failed")
        self.instance = reservation.instances[0]
        wait_for_ec2_instance_state(self.instance, self.log, final_state="running", timeout=300)
        self.instance.add_tag("Name", resource_tag)
        self.log.debug("Instance (%s) is now running" % self.instance.id)
        self.log.debug("Public DNS will be: %s" % self.instance.public_dns_name)
        self.log.debug("Now waiting up to 30 minutes for instance to stop")

        wait_for_ec2_instance_state(self.instance, self.log, final_state="stopped", timeout=1800)

        # Snapshot
        self.log.debug("Creating a new EBS image from completed/stopped EBS instance")
        new_ami_id = self.conn.create_image(self.instance.id, img_name, img_desc)
        self.log.debug("boto creat_image call returned AMI ID: %s" % new_ami_id)
        self.log.debug("Waiting for newly generated AMI to become available")
        # As with launching an instance we have seen occasional issues when
        # trying to query this AMI right away - give it a moment to settle
        sleep(10)
        new_amis = self.conn.get_all_images([new_ami_id])
        new_ami = new_amis[0]
        timeout = 120
        interval = 10
        for i in range(timeout):
            new_ami.update()
            if new_ami.state == "available":
                new_ami.add_tag("Name", resource_tag)
                break
            elif new_ami.state == "failed":
                raise Exception("Amazon reports EBS image creation failed")
            self.log.debug(
                "AMI status (%s) is not 'available' - [%d of %d seconds]"
                % (new_ami.state, i * interval, timeout * interval)
            )
            sleep(interval)
        self.log.debug("Terminating/deleting instance")
        safe_call(self.instance.terminate, (), self.log)
        sleep(5)
        if new_ami.state != "available":
            raise Exception("Failed to produce an AMI ID")
        self.log.debug("SUCCESS: %s is now available for launch" % new_ami_id)
        return new_ami_id
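wait_for_ec2_instance_state is not shown in this example; a minimal version
inferred from its call sites (names and signature assumed, not authoritative):

def wait_for_ec2_instance_state(instance, log, final_state="running", timeout=300):
    interval = 10
    for _ in range(timeout // interval):
        instance.update()  # refresh state from EC2
        if instance.state == final_state:
            return
        log.debug("Instance %s state: %s" % (instance.id, instance.state))
        sleep(interval)
    raise Exception("Timed out waiting for instance to reach %s" % final_state)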
Example #34
File: _aws.py Project: zendad/flocker
    def create_nodes(self, reactor, names, distribution, metadata={}):
        """
        Create nodes with the given names.

        :param reactor: The reactor.
        :param name: The names of the nodes.
        :type name: list of str
        :param str distribution: The name of the distribution to
            install on the nodes.
        :param dict metadata: Metadata to associate with the nodes.

        :return: A list of ``Deferred``s each firing with an INode
            when the corresponding node is created.   The list has
            the same order as :param:`names`.
        """
        size = self._default_size
        disk_size = 8

        action = start_action(
            action_type=u"flocker:provision:aws:create_nodes",
            instance_count=len(names),
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        )
        with action.context():
            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]}, )

            instances = self._run_nodes(count=len(names),
                                        image_id=images[0].id,
                                        size=size,
                                        diskmap=diskmap)

            def make_node(ignored, name, instance):
                return AWSNode(
                    name=name,
                    _provisioner=self,
                    _instance=instance,
                    distribution=distribution,
                )

            results = []
            for name, instance in izip_longest(names, instances):
                if instance is None:
                    results.append(fail(Exception("Could not run instance")))
                else:
                    node_metadata = metadata.copy()
                    node_metadata['Name'] = name
                    d = self._async_get_node(reactor, instance, node_metadata)
                    d = DeferredContext(d)
                    d.addCallback(make_node, name, instance)
                    results.append(d.result)
            action_completion = DeferredContext(DeferredList(results))
            action_completion.addActionFinish()
            # Individual results and errors should be consumed by the caller,
            # so we can leave action_completion alone now.
            return results
Example #35
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    if opts.vpc_id is None:
        print "Setting up EC2-Classic security groups..."
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
        if master_group.rules == []:  # Group was just now created
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
            master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
            master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
            master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
            master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
            if opts.ganglia:
                master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
        if slave_group.rules == []:  # Group was just now created
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
            slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
            slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
            slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    else:
        print "Setting up VPC security groups..."
        master_group = get_or_make_group(conn, cluster_name + "-master", vpc_id=opts.vpc_id)
        slave_group = get_or_make_group(conn, cluster_name + "-slaves", vpc_id=opts.vpc_id)
        if master_group.rules == []:  # Group was just now created
            master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group)
            master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group)
            master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
            master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
            master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
            master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
            master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
            if opts.ganglia:
                master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
        if slave_group.rules == []:  # Group was just now created
            slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group)
            slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group)
            slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
            slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
            slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
            slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)
        )
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Hack to set the VPC private hostname (FIXME: revisit later)

    user_data = """#!/bin/bash
    hostname $(curl http://169.254.169.254/latest/meta-data/local-hostname)
    """

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if opts.vpc_id is None:
                slave_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    launch_group="launch-group-%s" % cluster_name,
                    placement=zone,
                    count=num_slaves_this_zone,
                    key_name=opts.key_pair,
                    security_groups=[slave_group],
                    instance_type=opts.instance_type,
                    block_device_map=block_map,
                    placement_group=opts.placement_group,
                )

            if opts.vpc_id is not None:

                interface = ec2.networkinterface.NetworkInterfaceSpecification(
                    device_index=0, subnet_id=opts.subnet_id, groups=[slave_group.id], associate_public_ip_address=True
                )

                interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)
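                # With a NetworkInterfaceSpecification, the subnet and security
                # groups are attached to the interface itself (device_index=0),
                # so they are not passed as top-level arguments below.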

                slave_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    launch_group="launch-group-%s" % cluster_name,
                    count=num_slaves_this_zone,
                    key_name=opts.key_pair,
                    instance_type=opts.instance_type,
                    block_device_map=block_map,
                    network_interfaces=interfaces,
                    user_data=user_data,
                    placement_group=opts.placement_group,
                )
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                if opts.vpc_id is None:
                    slave_res = image.run(
                        key_name=opts.key_pair,
                        security_groups=[slave_group],
                        instance_type=opts.instance_type,
                        placement=zone,
                        min_count=num_slaves_this_zone,
                        max_count=num_slaves_this_zone,
                        block_device_map=block_map,
                        placement_group=opts.placement_group,
                    )

                if opts.vpc_id is not None:

                    interface = ec2.networkinterface.NetworkInterfaceSpecification(
                        device_index=0,
                        subnet_id=opts.subnet_id,
                        groups=[slave_group.id],
                        associate_public_ip_address=True,
                    )
                    interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)

                    slave_res = conn.run_instances(
                        image_id=opts.ami,
                        key_name=opts.key_pair,
                        instance_type=opts.instance_type,
                        min_count=num_slaves_this_zone,
                        max_count=num_slaves_this_zone,
                        block_device_map=block_map,
                        network_interfaces=interfaces,
                        user_data=user_data,
                        placement_group=opts.placement_group,
                    )

                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == "all":
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.vpc_id is None:
            master_res = image.run(
                key_name=opts.key_pair,
                security_groups=[master_group],
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
            )

        if opts.vpc_id is not None:

            interface = ec2.networkinterface.NetworkInterfaceSpecification(
                device_index=0, subnet_id=opts.subnet_id, groups=[master_group.id], associate_public_ip_address=True
            )
            interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)

            master_res = conn.run_instances(
                image_id=opts.ami,
                key_name=opts.key_pair,
                instance_type=master_type,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                network_interfaces=interfaces,
                user_data=user_data,
            )

        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (opts.zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(key="Name", value="spark-{cn}-master-{iid}".format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(key="Name", value="spark-{cn}-slave-{iid}".format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
Example #36
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  master_group.owner_id = os.getenv('EC2_USER_ID')
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  slave_group.owner_id = os.getenv('EC2_USER_ID')
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  zoo_group.owner_id = os.getenv('EC2_USER_ID')
  
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
    master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama
    if opts.ganglia:
      master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI
  
  if zoo_group.rules == []: # Group was just now created
      zoo_group.authorize(src_group=master_group)
      zoo_group.authorize(src_group=slave_group)
      zoo_group.authorize(src_group=zoo_group)
      zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  existing_masters, existing_slaves, existing_zoos = get_existing_cluster(conn, opts, cluster_name,
                                                           die_on_error=False)
  if existing_slaves or (existing_masters and not opts.use_existing_master):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.emi])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi
    sys.exit(1)
    
  try:
    image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi_master
    sys.exit(1)
  
  if (opts.emi_zoo != ""):  
      try:
        image_zoo = conn.get_all_images(image_ids=[opts.emi_zoo])[0]
      except:
        print >> stderr, "Could not find emi " + opts.emi_zoo
        sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  logging.debug( "Calling boto BlockDeviceMapping()...")
  block_map = BlockDeviceMapping()
  logging.debug(" Printing block_map..") 
  #print block_map
  if opts.ebs_vol_size > 0:
    logging.debug("Calling boto EBSBlockDeviceType()...")
    device = EBSBlockDeviceType()
    #print "device: ", device
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    device.ephemeral_name = "ephemeral0"
    #block_map["/dev/sdv"] = device
    #block_map["/dev/sdv"] = device
    block_map["/dev/vdb"] = device
    
  if opts.user_data_file != None:
      user_data_file = open(opts.user_data_file)
      try:
          opts.user_data = user_data_file.read()
          #print "user data (encoded) = ", opts.user_data
      finally:
          user_data_file.close()
  
  # Launch non-spot instances
  zones = get_zones(conn, opts)    
  num_zones = len(zones)
  i = 0
  slave_nodes = []
  for zone in zones:
    num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
    if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map,
                              user_data = opts.user_data)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
    i += 1  

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:  
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image_master.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = 1,
                           max_count = 1,
                           block_device_map = block_map,
                           user_data = opts.user_data)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)
    
  # Launch ZooKeeper nodes if required
  if int(opts.ft) > 1:
    print "Running " + opts.ft + " zookeepers"
    zoo_res = image_zoo.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map,
                        user_data = opts.user_data)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
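
All of the launchers on this page build their EBS mapping through the same boto idiom. A minimal, self-contained sketch of that pattern (the helper name make_ebs_block_map is ours, not part of the examples):

from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType

def make_ebs_block_map(vol_size_gb, device_name="/dev/sdv"):
    # One EBS volume of vol_size_gb gigabytes, deleted automatically
    # when the instance terminates.
    block_map = BlockDeviceMapping()
    device = EBSBlockDeviceType()
    device.size = vol_size_gb
    device.delete_on_termination = True
    block_map[device_name] = device
    return block_map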
Example #37
0
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  print "Setting up security groups..."
  
  if opts.one_security_group:
    master_group = get_or_make_group(conn, cluster_name + "-group")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = master_group
    zoo_group = master_group
  else:
    master_group = get_or_make_group(conn, cluster_name + "-master")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    slave_group.owner_id = os.getenv('EC2_USER_ID')
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    zoo_group.owner_id = os.getenv('EC2_USER_ID')

  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50031, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
    master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 8020, 8020, '0.0.0.0/0') #hdfs HA nameservice
    master_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes
    master_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA   
    master_group.authorize('tcp', 8021, 8021, '0.0.0.0/0') #jt HA
    master_group.authorize('tcp', 8018, 8019, '0.0.0.0/0') #zkfc
    master_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui    
    
    #If cohosted with zookeeper open necessary ports
    if opts.cohost:
        print "Opening additional ports for zookeeper... "
        master_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        master_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        master_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') 
        
    if opts.ganglia:
      master_group.authorize('tcp', 80, 80, '0.0.0.0/0')
      #Also needed 8649 and 8651 but check if only for master
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI
    slave_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui
    slave_group.authorize('tcp', 31000, 32000, '0.0.0.0/0') #task tracker web ui    
  
  if zoo_group.rules == []: # Group was just now created
      zoo_group.authorize(src_group=master_group)
      zoo_group.authorize(src_group=slave_group)
      zoo_group.authorize(src_group=zoo_group)
      zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 8018, 8020, '0.0.0.0/0') #hdfs HA nameservice
      zoo_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes
      zoo_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA
      zoo_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui

  # Check if instances are already running in our groups.
  # "Grouped" instances share a security group so that they can communicate
  # over private IPs without DNS resolution.
  existing_masters, existing_slaves, existing_zoos, existing_grouped = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
  if existing_slaves or (existing_masters and not opts.use_existing_master) or existing_grouped:
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.emi])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi
    sys.exit(1)
    
  try:
    image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi_master
    sys.exit(1)
  
  # Launch additional ZooKeeper nodes if required - e.g. if 2 mesos masters
  # are specified and zoo_num = 3 (the default), extra zookeepers are needed
  if int(opts.ft) > 1 and opts.cohost:
    zoo_num = int(opts.zoo_num) - int(opts.ft)  # extra zoo instances needed
  else:
    zoo_num = int(opts.zoo_num)

  if zoo_num > 0:
    if opts.emi_zoo == "":
      emi_zoo = opts.emi_master
    else:
      emi_zoo = opts.emi_zoo

    try:
      image_zoo = conn.get_all_images(image_ids=[emi_zoo])[0]
    except:
      print >> stderr, "Could not find emi " + emi_zoo
      sys.exit(1)


  # Create block device mapping so that we can add an EBS volume if asked to
  logging.debug("Calling boto BlockDeviceMapping()...")
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    logging.debug("Calling boto EBSBlockDeviceType()...")
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    device.ephemeral_name = "ephemeral0"
    block_map["/dev/vdb"] = device
    
  if opts.user_data_file is not None:
      user_data_file = open(opts.user_data_file)
      try:
          opts.user_data = user_data_file.read()
      finally:
          user_data_file.close()
  
  # Launch non-spot instances
  zones = get_zones(conn, opts)    
  num_zones = len(zones)
  i = 0
  slave_nodes = []
  for zone in zones:
    num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
    if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map,
                              user_data = opts.user_data)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
    i += 1  

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:  
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    
    print "Running " + opts.ft + " masters"
    master_res = image_master.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = opts.ft,
                           max_count = opts.ft,
                           block_device_map = block_map,
                           user_data = opts.user_data)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

  if zoo_num > 0:
    print "Running additional %d zookeepers" % zoo_num
    zoo_res = image_zoo.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = zoo_num,
                        max_count = zoo_num,
                        block_device_map = block_map,
                        user_data = opts.user_data)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []
    
  if opts.cohost:
      print "Zookeepers are co-hosted on mesos instances..."

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
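
Several examples split the requested slaves across availability zones with get_partition(opts.slaves, num_zones, i), whose definition is not shown here. A plausible implementation, assuming the remainder goes to the lowest-numbered zones:

def get_partition(total, num_partitions, current_partition):
    # Evenly divide `total`; the first (total % num_partitions)
    # partitions each take one extra item.
    num_this_partition = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_this_partition += 1
    return num_this_partition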
Example #38
0
def launch_cluster(conn, opts, cluster_name):

  conn = AWSConnection(conn, VPCConnection(region=conn.region))

  print "Setting up VPC..."
  vpc = get_or_make_vpc(conn, cluster_name, 'mesos-vpc')
  print "Using vpc: %s" % (vpc.id)


  print "Setting up subnet..."
  subnet = get_or_make_subnet(conn, vpc.id, opts.zone, cluster_name, 'mesos-subnet')
  print "Using subnet: %s" % (subnet.id)

  # Add internet gateway to VPC.
  print "Creating internet gateway"
  ig = get_or_make_ig(conn, vpc.id, cluster_name, 'mesos-vpc')
  print "Using internet gateway: %s" % (ig.id)
  
  # Add route to route table
  rt = get_or_make_rt(conn, vpc.id, cluster_name, 'mesos-rt')
  conn.vpc.create_route(rt.id, '0.0.0.0/0', gateway_id=ig.id)

  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-masters")
  slave_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-slaves")
  zoo_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-zoo")

  if master_group.rules == []: # Group was just now created
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.ec2.get_all_instances()
  for res in reservations:
    group_names = [g.name for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  
  print "Launching instances..."
  if opts.ami == "latest":
    # Figure out the latest AMI from our static URL
    try:
      opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
    except:
      print >> stderr, "Could not read " + LATEST_AMI_URL

  try:
    image = conn.ec2.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.ec2.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.ec2.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.ec2.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = conn.ec2.run_instances(opts.ami,
                          key_name = opts.key_pair,
                          subnet_id = subnet.id,
                          security_group_ids = [slave_group.id],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnet.id,
                                                                    groups=[master_group.id],
                                                                    associate_public_ip_address=True)
  interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
  master_res = conn.ec2.run_instances(opts.ami,
                          key_name = opts.key_pair,
                         instance_type = master_type,
                         placement = opts.zone,
                         network_interfaces = interfaces,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if int(opts.ft) > 1:
    zoo_res = conn.ec2.run_instances(opts.ami,
                        key_name = opts.key_pair,
                        subnet_id = subnet.id,
                        security_group_ids = [zoo_group.id],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
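
The running-cluster check above filters reservations with is_active, another helper the examples call without defining. An assumed definition that treats anything not shutting down or terminated as active:

def is_active(instance):
    # EC2 instance states: pending, running, shutting-down, terminated,
    # stopping, stopped.
    return instance.state in ['pending', 'running', 'stopping', 'stopped']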
Example #39
0
def launch_cluster(conn, opts, cluster_name):

    # Remove known hosts to avoid "Offending key for IP ..." errors.
    known_hosts = os.environ["HOME"] + "/.ssh/known_hosts"
    if os.path.isfile(known_hosts):
        os.remove(known_hosts)
    if opts.key_pair is None:
        opts.key_pair = keypair()
        if opts.key_pair is None:
            print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
            sys.exit(1)

    if opts.profile is None:
        opts.profile = profile()
        if opts.profile is None:
            print >> stderr, "ERROR: No profile found in current host. It be provided with -p option."
            sys.exit(1)

    public_key = pub_key()
    user_data = Template(
        """#!/bin/bash
  set -e -x
  echo '$public_key' >> ~root/.ssh/authorized_keys
  echo '$public_key' >> ~ec2-user/.ssh/authorized_keys"""
    ).substitute(public_key=public_key)

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=sparknotebook_group)
        master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
        master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
        master_group.authorize("tcp", 18080, 18080, "0.0.0.0/0")
        master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
        master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
        master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
        master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
        master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
        master_group.authorize("tcp", 7077, 7077, "0.0.0.0/0")
        if opts.ganglia:
            master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=sparknotebook_group)
        slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
        slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
        slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
        slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
        slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
        slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group)

    if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)
        )
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        zones = get_zones(conn, opts)

        num_zones = len(zones)
        i = 0
        my_req_ids = []

        for zone in zones:
            best_price = find_best_price(conn, opts.instance_type, zone, opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)"
                % (opts.slaves, best_price, opts.slaves * best_price)
            )

            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True
            )
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)

            slave_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces,
            )
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print >> stderr, "Waiting for spot instances to be granted"
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print >> stderr, "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    # print >> stderr, ".",
                    print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves)
        except:
            print >> stderr, "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id],
                    instance_type=opts.instance_type,
                    subnet_id=subnetId(),
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    user_data=user_data,
                    instance_profile_arn=opts.profile,
                )
                slave_nodes += slave_res.instances
                print >> stderr, "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == "all":
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price != None:
            best_price = find_best_price(conn, master_type, opts.zone, opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, ("Requesting master as spot instances with price $%.3f/hour" % (best_price))

            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True
            )
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)

            master_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=opts.zone,
                count=1,
                key_name=opts.key_pair,
                instance_type=master_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces,
            )
            my_req_ids = [r.id for r in master_reqs]
            print >> stderr, "Waiting for spot instance to be granted"
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids)
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    active_instance_ids = []
                    for i in my_req_ids:
                        if i in id_to_req and id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                    if len(active_instance_ids) == 1:
                        print >> stderr, "Master granted"
                        reservations = conn.get_all_instances(active_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        # print >> stderr, ".",
                        print "%d of %d masters granted, waiting longer" % (len(active_instance_ids), 1)
            except:
                print >> stderr, "Canceling spot instance requests"
                conn.cancel_spot_instance_requests(my_req_ids)
                # Log a warning if any of these requests actually launched instances:
                (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
                running = len(master_nodes) + len(slave_nodes)
                if running:
                    print >> stderr, ("WARNING: %d instances are still running" % running)
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id],
                instance_type=master_type,
                subnet_id=subnetId(),
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
            )
            master_nodes = master_res.instances
            print >> stderr, "Launched master in %s, regid = %s" % (zone, master_res.id)
    # Return all the instances
    return (master_nodes, slave_nodes)
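
This example bids for spot capacity with find_best_price(conn, instance_type, zone, opts.spot_price), which is not defined here. A sketch of what such a helper might look like, assuming it caps the most recently observed spot price at the caller's maximum:

def find_best_price(conn, instance_type, zone, max_price):
    history = conn.get_spot_price_history(instance_type=instance_type,
                                          availability_zone=zone,
                                          product_description='Linux/UNIX')
    if not history:
        return max_price
    # Never bid above the caller's ceiling.
    return min(max_price, history[0].price)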
Example #40
0
File: mesos_ec2.py Project: apanda/mesos
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    group_names = [g.name for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  print "Launching instances..."
  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = image.run(key_name = opts.key_pair,
                          security_groups = [slave_group],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if int(opts.ft) > 1:
    zoo_res = image.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
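
The wait-for-spot loop above recurs almost verbatim in several of these launchers. A reusable version of the same polling idiom (the helper name wait_for_spot_requests is ours):

import time

def wait_for_spot_requests(conn, request_ids, expected, poll_seconds=10):
    # Poll until `expected` requests are active, then return the
    # launched instance ids.
    while True:
        time.sleep(poll_seconds)
        reqs = conn.get_all_spot_instance_requests(request_ids=request_ids)
        active_ids = [r.instance_id for r in reqs if r.state == "active"]
        if len(active_ids) >= expected:
            return active_ids
        print "%d of %d requests granted, waiting longer" % (len(active_ids), expected)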
Example #41
0
def launch_cluster(conn, opts, cluster_name):

    #Remove known hosts to avoid "Offending key for IP ..." errors.
    known_hosts = os.environ['HOME'] + "/.ssh/known_hosts"
    if os.path.isfile(known_hosts):
        os.remove(known_hosts)
    if opts.key_pair is None:
        opts.key_pair = keypair()
        if opts.key_pair is None:
            print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
            sys.exit(1)

    if opts.profile is None:
        opts.profile = profile()
        if opts.profile is None:
            print >> stderr, "ERROR: No profile found in current host. It be provided with -p option."
            sys.exit(1)

    public_key = pub_key()
    user_data = Template("""#!/bin/bash
  set -e -x
  echo '$public_key' >> ~root/.ssh/authorized_keys
  echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute(
        public_key=public_key)

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=sparknotebook_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=sparknotebook_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    if not any(r for r in sparknotebook_group.rules
               for g in r.grants if master_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp",
                                      from_port="1",
                                      to_port="65535",
                                      src_group=master_group)
        sparknotebook_group.authorize(ip_protocol="icmp",
                                      from_port="-1",
                                      to_port="-1",
                                      src_group=master_group)

    if not any(r for r in sparknotebook_group.rules
               for g in r.grants if slave_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp",
                                      from_port="1",
                                      to_port="65535",
                                      src_group=slave_group)
        sparknotebook_group.authorize(ip_protocol="icmp",
                                      from_port="-1",
                                      to_port="-1",
                                      src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        zones = get_zones(conn, opts)

        num_zones = len(zones)
        i = 0
        my_req_ids = []

        for zone in zones:
            best_price = find_best_price(conn, opts.instance_type, zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)"
                % (opts.slaves, best_price, opts.slaves * best_price))

            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[slave_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                interface)

            slave_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print >> stderr, "Waiting for spot instances to be granted"
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print >> stderr, "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    # print >> stderr, ".",
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print >> stderr, "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_group_ids=[slave_group.id],
                                      instance_type=opts.instance_type,
                                      subnet_id=subnetId(),
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data,
                                      instance_profile_arn=opts.profile)
                slave_nodes += slave_res.instances
                print >> stderr, "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price != None:
            best_price = find_best_price(conn, master_type, opts.zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting master as spot instances with price $%.3f/hour" %
                (best_price))

            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[master_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                interface)

            master_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=opts.zone,
                count=1,
                key_name=opts.key_pair,
                instance_type=master_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids = [r.id for r in master_reqs]
            print >> stderr, "Waiting for spot instance to be granted"
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests(
                        request_ids=my_req_ids)
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    active_instance_ids = []
                    for i in my_req_ids:
                        if i in id_to_req and id_to_req[i].state == "active":
                            active_instance_ids.append(
                                id_to_req[i].instance_id)
                    if len(active_instance_ids) == 1:
                        print >> stderr, "Master granted"
                        reservations = conn.get_all_instances(
                            active_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        # print >> stderr, ".",
                        print "%d of %d masters granted, waiting longer" % (
                            len(active_instance_ids), 1)
            except:
                print >> stderr, "Canceling spot instance requests"
                conn.cancel_spot_instance_requests(my_req_ids)
                # Log a warning if any of these requests actually launched instances:
                (master_nodes,
                 slave_nodes) = get_existing_cluster(conn,
                                                     opts,
                                                     cluster_name,
                                                     die_on_error=False)
                running = len(master_nodes) + len(slave_nodes)
                if running:
                    print >> stderr, (
                        "WARNING: %d instances are still running" % running)
                sys.exit(0)
        else:
            master_res = image.run(key_name=opts.key_pair,
                                   security_group_ids=[master_group.id],
                                   instance_type=master_type,
                                   subnet_id=subnetId(),
                                   placement=opts.zone,
                                   min_count=1,
                                   max_count=1,
                                   block_device_map=block_map,
                                   user_data=user_data,
                                   instance_profile_arn=opts.profile)
            master_nodes = master_res.instances
            print >> stderr, "Launched master in %s, regid = %s" % (
                zone, master_res.id)
    # Return all the instances
    return (master_nodes, slave_nodes)
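
Most of the launchers on this page resolve their security groups through get_or_make_group. An assumed implementation of that helper, matching the (conn, name) and (conn, name, vpc_id) call shapes seen in the surrounding examples:

def get_or_make_group(conn, name, vpc_id=None):
    groups = [g for g in conn.get_all_security_groups() if g.name == name]
    if groups:
        return groups[0]
    # Group does not exist yet; create it so that `rules == []` on first use.
    return conn.create_security_group(name, "Cluster group " + name, vpc_id=vpc_id)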
Example #42
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    user_data_content = None

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master",
                                     opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves",
                                    opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name),
              file=stderr)
        sys.exit(1)

    # Use the default Ubuntu AMI.
    if opts.ami is None:
        if opts.region == "us-east-1":
            opts.ami = "ami-2d39803a"
        elif opts.region == "us-west-1":
            opts.ami = "ami-06116566"
        elif opts.region == "us-west-2":
            opts.ami = "ami-9abea4fb"
        elif opts.region == "eu-west-1":
            opts.ami = "ami-f95ef58a"
        elif opts.region == "eu-central-1":
            opts.ami = "ami-87564feb"
        elif opts.region == "ap-northeast-1":
            opts.ami = "ami-a21529cc"
        elif opts.region == "ap-northeast-2":
            opts.ami = "ami-09dc1267"
        elif opts.region == "ap-southeast-1":
            opts.ami = "ami-25c00c46"
        elif opts.region == "ap-southeast-2":
            opts.ami = "ami-6c14310f"
        elif opts.region == "ap-south-1":
            opts.ami = "ami-4a90fa25"
        elif opts.region == "sa-east-1":
            opts.ami = "ami-0fb83963"
        else:
            raise Exception("The specified region is unknown.")

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [
            sg.id for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(
                        active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" %
                          (len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.
                    instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print(
                    "Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                        s=num_slaves_this_zone,
                        plural_s=('' if num_slaves_this_zone == 1 else 's'),
                        z=zone,
                        r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.
            instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)

        master_nodes = master_res.instances
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1))
            for tag in opts.additional_tags.split(','))

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name,
                                                 iid=master.id)))

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                iid=slave.id)))

    # Return all the instances
    return (master_nodes, slave_nodes)
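The zone-spreading logic above leans on a get_partition helper that is not shown in this snippet. A minimal sketch of what it presumably does, inferred from how it is called (total instance count, number of zones, current zone index), assuming a simple round-robin split:

# Hypothetical reconstruction of the get_partition helper used above:
# spread `total` instances over `num_partitions` zones, giving the first
# (total % num_partitions) zones one extra instance each.
def get_partition(total, num_partitions, current_partition):
    num_this_partition = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_this_partition += 1
    return num_this_partition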
Example #43
0
    def _launch_wait_snapshot(self, ami, user_data, img_size = 10, img_name = None, img_desc = None,
                             remote_access_command = None):

        rand_id = random.randrange(2**32)
        # Modified from code taken from Image Factory 
        # Create security group
        security_group_name = "ebs-helper-vnc-tmp-%x" % (rand_id)
        security_group_desc = "Temporary security group with SSH access generated by EBSHelper python object"
        self.log.debug("Creating temporary security group (%s)" % (security_group_name))
        self.security_group = self.conn.create_security_group(security_group_name, security_group_desc)
        self.security_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        self.security_group.authorize('tcp', 5900, 5950, '0.0.0.0/0')

        ebs_root = EBSBlockDeviceType()
        ebs_root.size = img_size
        ebs_root.delete_on_termination = True
        block_map = BlockDeviceMapping()
        block_map['/dev/sda'] = ebs_root

        # Now launch it
        instance_type="m1.small"
        self.log.debug("Starting ami %s in region %s with instance_type %s" % (ami, self.region.name, instance_type))

        reservation = self.conn.run_instances(ami, max_count=1, instance_type=instance_type, 
                                              user_data = user_data,
                                              security_groups = [ security_group_name ],
                                              block_device_map = block_map)
        # I used to have a check for more than one instance here -- but that would be a profound bug in boto
        if len(reservation.instances) == 0:
            raise Exception("Attempt to start instance failed")

        self.instance = reservation.instances[0]

        wait_for_ec2_instance_state(self.instance, self.log, final_state='running', timeout=300)

        self.log.debug("Instance (%s) is now running" % self.instance.id)
        self.log.debug("Public DNS will be: %s" % self.instance.public_dns_name)
        self.log.debug("Now waiting up to 30 minutes for instance to stop")

        wait_for_ec2_instance_state(self.instance, self.log, final_state='stopped', timeout=1800)

        # Snapshot
        self.log.debug("Creating a new EBS backed image from completed/stopped EBS instance")
        new_ami_id = self.conn.create_image(self.instance.id, img_name, img_desc)
        self.log.debug("boto creat_image call returned AMI ID: %s" % (new_ami_id))
        self.log.debug("Waiting for newly generated AMI to become available")
        # As with launching an instance we have seen occasional issues when trying to query this AMI right
        # away - give it a moment to settle
        sleep(10)
        new_amis = self.conn.get_all_images([ new_ami_id ])
        new_ami = new_amis[0]
        timeout = 120
        interval = 10
        for i in range(timeout):
            new_ami.update()
            if new_ami.state == "available":
                break
            elif new_ami.state == "failed":
                raise Exception("Amazon reports EBS image creation failed")
            self.log.debug("AMI status (%s) - waiting for 'available' - [%d of %d seconds elapsed]" % (new_ami.state, i * interval, timeout * interval))
            sleep(interval)

        self.log.debug("Terminating/deleting instance")
        terminate_instance(self.instance)
 
        if new_ami.state != "available":
            raise Exception("Failed to produce an AMI ID")

        self.log.debug("SUCCESS: %s is now available for launch" % (new_ami_id))

        return new_ami_id
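A hedged usage sketch for the helper above. The AMI ID and user-data script are placeholders; the user-data must arrange for the instance to power itself off once its work is done, since the method waits up to 30 minutes for the 'stopped' state before snapshotting:

# Hypothetical invocation; identifiers are placeholders, and `helper` is
# an EBSHelper instance with self.conn, self.region and self.log set up.
new_ami_id = helper._launch_wait_snapshot(
    ami='ami-0123abcd',
    user_data='#!/bin/sh\n# ... customize the image ...\npoweroff\n',
    img_size=10,
    img_name='ebs-helper-image',
    img_desc='Image built and snapshotted by EBSHelper')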
Example #44
0
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      instance_profile_name="spark-node",
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               instance_profile_name="spark-node",
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Return all the instances
    return (master_nodes, slave_nodes)
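Note that the bare except: around the wait loop is deliberate: it also catches KeyboardInterrupt, so a Ctrl-C while waiting still cancels the outstanding spot requests rather than leaving them free to launch billable instances later. The same cleanup contract, written more explicitly (wait_for_spot_fulfillment is a hypothetical stand-in for the polling loop above):

# Sketch only: BaseException covers KeyboardInterrupt and SystemExit as
# well as ordinary errors, making the cancellation guarantee explicit.
try:
    slave_nodes = wait_for_spot_fulfillment(conn, my_req_ids)  # hypothetical
except BaseException:
    print "Canceling spot instance requests"
    conn.cancel_spot_instance_requests(my_req_ids)
    raise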
Example #45
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  input_group = get_or_make_group(conn, cluster_name + "-input")
  compute_group = get_or_make_group(conn, cluster_name + "-compute")
  if input_group.rules == []: # Group was just now created
    input_group.authorize(src_group=input_group)
    input_group.authorize(src_group=compute_group)
    input_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    input_group.authorize('tcp', 4000, 4000, '0.0.0.0/0')
    input_group.authorize('tcp', 4001, 4001, '0.0.0.0/0')
  if compute_group.rules == []: # Group was just now created
    compute_group.authorize(src_group=input_group)
    compute_group.authorize(src_group=compute_group)
    compute_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    compute_group.authorize('tcp', 4000, 4000, '0.0.0.0/0')
    compute_group.authorize('tcp', 4001, 4001, '0.0.0.0/0')
    compute_group.authorize('tcp', 5001, 5001, '0.0.0.0/0')

  # Check if instances are already running in our groups
  active_nodes = get_existing_cluster(conn, opts, cluster_name,
                                      die_on_error=False)
  if any(active_nodes):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s, %s or %s" % (input_group.name, compute_group.name))
    sys.exit(1)
  
  # CHANGE THIS IF CHANGING REGIONS
  opts.ami = 'ami-d76605be'
  
  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device
  launch_groups = opts.compute_groups + 1
  # Launch compute nodes
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d compute nodes as spot instances with price $%.3f" %
           (launch_groups * opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(launch_groups * opts.slaves, num_zones, i)
      compute_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          security_groups = [compute_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in compute_reqs]
      i += 1
    
    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves * launch_groups:
          print "All %d compute nodes granted" %(opts.slaves * launch_groups)
          reservations = conn.get_all_instances(active_instance_ids)
          compute_nodes = []
          for r in reservations:
            compute_nodes += r.instances
          break
        else:
          print "%d of %d compute nodes granted, waiting longer" % (
            len(active_instance_ids), opts.slaves * launch_groups)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (input_nodes, compute_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(input_nodes) + len(compute_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    compute_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves * launch_groups, num_zones, i)
      if num_slaves_this_zone > 0:
        compute_res = image.run(key_name = opts.key_pair,
                              security_groups = [compute_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        compute_nodes += compute_res.instances
        print "Launched %d compute nodes in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, compute_res.id)
      i += 1

  # Launch input nodes
  input_type = opts.instance_type
  if opts.zone == 'all':
    opts.zone = random.choice(conn.get_all_zones()).name
  input_res = image.run(key_name = opts.key_pair,
                         security_groups = [input_group],
                         instance_type = input_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  input_nodes = input_res.instances
  print "Launched input in %s, regid = %s" % (zone, input_res.id)

  # Return all the instances
  return (input_nodes, compute_nodes)
Example #46
0
File: _aws.py Project: Kaffa-MY/flocker
    def create_node(self, name, distribution,
                    size=None, disk_size=8,
                    metadata={}):
        if size is None:
            size = self._default_size

        with start_action(
            action_type=u"flocker:provision:aws:create_node",
            name=name,
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]},
            )

            with start_action(
                action_type=u"flocker:provision:aws:create_node:run_instances",
            ) as context:
                reservation = self._connection.run_instances(
                    images[0].id,
                    key_name=self._keyname,
                    instance_type=size,
                    security_groups=self._security_groups,
                    block_device_map=diskmap,
                    placement=self._zone,
                    # On some operating systems, a tty is required for sudo.
                    # Since AWS systems have a non-root user as the login,
                    # disable this, so we can use sudo with conch.
                    user_data=dedent("""\
                        #!/bin/sh
                        sed -i '/Defaults *requiretty/d' /etc/sudoers
                        """),
                )

                instance = reservation.instances[0]
                context.add_success_fields(instance_id=instance.id)

            self._connection.create_tags([instance.id], metadata)

            # Display state as instance starts up, to keep user informed that
            # things are happening.
            _wait_until_running(instance)

            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
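A minimal usage sketch for create_node, assuming a provisioner already configured with a boto connection, key name, security groups and zone (none of which appear in this snippet), and that 'centos-7' is a valid key of IMAGE_NAMES:

# Hypothetical call with placeholder values.
node = provisioner.create_node(
    name='flocker-test-0',
    distribution='centos-7',
    size='m3.large',      # when None, the provisioner default is used
    disk_size=8)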
Example #47
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    group_names = [g.id for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  print "Launching instances..."
  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  slave_res = image.run(key_name = opts.key_pair,
                        security_groups = [slave_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = opts.slaves,
                        max_count = opts.slaves,
                        block_device_map = block_map)
  slave_nodes = slave_res.instances
  print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if opts.ft > 1:
    zoo_res = image.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
Example #48
0
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  #print "Setting up security groups..."
  #master_group = get_or_make_group(conn, cluster_name + "-master")
  #slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  #if master_group.rules == []: # Group was just now created
  #  master_group.authorize(src_group=master_group)
  #  master_group.authorize(src_group=slave_group)
  #  master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
  #  master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
  #  master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
  #  master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
  #  master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
  #  master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
  #  master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
  #  if opts.ganglia:
  #    master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  #if slave_group.rules == []: # Group was just now created
  #  slave_group.authorize(src_group=master_group)
  #  slave_group.authorize(src_group=slave_group)
  #  slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

  # Check if instances are already running in our groups
  existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                           die_on_error=False)
  #if existing_slaves or (existing_masters and not opts.use_existing_master):
  #  print >> stderr, ("ERROR: There are already instances running in " +
  #      "group %s or %s" % (master_group.name, slave_group.name))
  #  sys.exit(1)

  # Figure out Spark AMI
  if opts.ami is None:
    opts.ami = get_spark_ami(opts)
  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      slave_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          #security_groups = [slave_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in slave_reqs]
      i += 1

    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves:
          print "All %d slaves granted" % opts.slaves
          reservations = conn.get_all_instances(active_instance_ids)
          slave_nodes = []
          for r in reservations:
            slave_nodes += r.instances
          break
        else:
          print "%d of %d slaves granted, waiting longer" % (
            len(active_instance_ids), opts.slaves)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (master_nodes, slave_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(master_nodes) + len(slave_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_group_ids = ["sg-87956be2","sg-1ac33f7f", "sg-1ec33f7b"],
                              subnet_id = "subnet-4182b007",
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
      i += 1

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image.run(key_name = opts.key_pair,
                           security_group_ids = ["sg-bd956bd8","sg-1ac33f7f", "sg-1ec33f7b"],
                           subnet_id = "subnet-4182b007",
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = 1,
                           max_count = 1,
                           block_device_map = block_map)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

  # Return all the instances
  return (master_nodes, slave_nodes)
Example #49
0
def create_ami(region, snap_id, force=None, root_dev='/dev/sda1', zone_name=None,
               default_arch=None, default_type='t1.micro', security_groups=''):
    """
    Creates AMI image from given snapshot.

    The force option skips the confirmation prompt and launches a new
    instance from the created AMI image.

    region, snap_id
        specify the snapshot to be processed. The snapshot description,
        in JSON format, will be used to restore an instance with the
        same parameters. Snapshots taken for the same instance within a
        short window (10 minutes or less) but for other devices
        (/dev/sdb, /dev/sdc, etc.) are processed automatically;
    force
        Run an instance from the AMI after creation without
        confirmation. To enable, set the value to "RUN";
    default_arch
        architecture to use if not mentioned in snapshot description;
    default_type
        instance type to use if not mentioned in snapshot description.
        Used only if ``force`` is "RUN";
    security_groups
        list of AWS Security Group names formatted as a string separated
        by semicolons ';'. Used only if ``force`` is "RUN".
    """
    conn = get_region_conn(region)
    snap = conn.get_all_snapshots(snapshot_ids=[snap_id, ])[0]
    instance_id = get_snap_instance(snap)
    _device = get_snap_device(snap)
    snaps = conn.get_all_snapshots(owner='self')
    snapshots = [snp for snp in snaps if
        get_snap_instance(snp) == instance_id and
        get_snap_device(snp) != _device and
        abs(get_snap_time(snap) - get_snap_time(snp)) <= timedelta(minutes=10)]
    snapshot = sorted(snapshots, key=get_snap_time,
                      reverse=True) if snapshots else None
    # setup for building an EBS boot snapshot
    default_arch = default_arch or config.get('DEFAULT', 'ARCHITECTURE')
    arch = get_descr_attr(snap, 'Arch') or default_arch
    kernel = config.get(conn.region.name, 'KERNEL' + arch.upper())
    dev = re.match(r'^/dev/sda$', _device)  # bare /dev/sda means the instance is encrypted
    if dev:
        kernel = config.get(conn.region.name, 'KERNEL_ENCR_' + arch.upper())
    ebs = EBSBlockDeviceType()
    ebs.snapshot_id = snap_id
    ebs.delete_on_termination = True
    block_map = BlockDeviceMapping()
    block_map[_device] = ebs
    sdb = BlockDeviceType()
    sdb.ephemeral_name = 'ephemeral0'
    block_map['/dev/sdb'] = sdb

    if snapshot:
        for s in snapshot:
            s_dev = get_snap_device(s)
            s_ebs = EBSBlockDeviceType()
            s_ebs.delete_on_termination = True
            s_ebs.snapshot_id = s.id
            block_map[s_dev] = s_ebs

    name = 'Created {0} using access key {1}'.format(timestamp(),
                                                     conn.access_key)
    name = name.replace(":", ".").replace(" ", "_")

    # create the new AMI all options from snap JSON description:
    wait_for(snap, '100%', limit=SNAP_TIME)
    result = conn.register_image(
        name=name,
        description=snap.description,
        architecture=get_descr_attr(snap, 'Arch') or default_arch,
        root_device_name=get_descr_attr(snap, 'Root_dev_name') or root_dev,
        block_device_map=block_map, kernel_id=kernel)
    sleep(2)
    image = conn.get_all_images(image_ids=[result, ])[0]
    wait_for(image, 'available', limit=10 * 60)
    add_tags(image, snap.tags)

    logger.info('The new AMI ID = {0}'.format(result))

    new_instance = None
    if force == 'RUN':
        instance_type = get_descr_attr(snap, 'Type') or default_type
        new_instance = launch_instance_from_ami(
            region, image.id, inst_type=instance_type,
            security_groups=security_groups, zone_name=zone_name)
    return image, new_instance
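A hedged example of driving create_ami end to end. The region name and snapshot ID are placeholders; force='RUN' both registers the AMI and immediately boots a throwaway instance from it:

# Hypothetical invocation with placeholder identifiers.
image, new_instance = create_ami(
    region='us-east-1',
    snap_id='snap-0123abcd',
    force='RUN',                     # skip the prompt and launch from the AMI
    default_type='t1.micro',
    security_groups='default;ssh-only')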
Example #50
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if opts.ganglia:
      master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  active_nodes = get_existing_cluster(conn, opts, cluster_name,
                                      die_on_error=False)
  if any(active_nodes):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  # Figure out the latest AMI from our static URL
  if opts.ami == "latest":
    try:
      opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
      print "Latest Spark AMI: " + opts.ami
    except:
      print >> stderr, "Could not read " + LATEST_AMI_URL
      sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      slave_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          security_groups = [slave_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in slave_reqs]
      i += 1
    
    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves:
          print "All %d slaves granted" % opts.slaves
          reservations = conn.get_all_instances(active_instance_ids)
          slave_nodes = []
          for r in reservations:
            slave_nodes += r.instances
          break
        else:
          print "%d of %d slaves granted, waiting longer" % (
            len(active_instance_ids), opts.slaves)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
      i += 1

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  if opts.zone == 'all':
    opts.zone = random.choice(conn.get_all_zones()).name
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master in %s, regid = %s" % (zone, master_res.id)

  zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
Example #51
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  
  master_group = get_or_make_group(conn, "shark-exp-master")
  slave_group = get_or_make_group(conn, "shark-exp-slaves")
  zoo_group = get_or_make_group(conn, "ampcamp-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
      master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
      master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
      master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
      # hbase
      master_group.authorize('tcp', 60010, 60010, '0.0.0.0/0')
      master_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
      slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
      slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
      slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
      # hbase
      slave_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    for instance in res.instances:
      if 'tags' in instance.__dict__ and 'cluster' in instance.tags:
        if instance.tags['cluster'] == cluster_name and is_active(instance):
          print >> stderr, ("ERROR: Instances %s is already running in cluster %s"
                            % (instance.id, cluster_name))
          sys.exit(1)

  if opts.ami in ["latest", "standalone"]:
    opts.ami = get_ami(opts.ami)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = image.run(key_name = opts.key_pair,
                          security_groups = [slave_group],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Create the right tags
  tags = {}
  tags['cluster'] = cluster_name

  tags['type'] = 'slave'
  for node in slave_nodes:
    conn.create_tags([node.id], tags)
  
  tags['type'] = 'master'
  for node in master_nodes:
    conn.create_tags([node.id], tags)

  zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
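Since this variant identifies cluster membership by the 'cluster' and 'type' tags rather than by security group, a later lookup (what get_existing_cluster presumably does here) can filter on those tags directly. A sketch using boto 2 filters:

# Recover the cluster created above by its tags.
reservations = conn.get_all_instances(filters={'tag:cluster': cluster_name})
instances = [i for r in reservations for i in r.instances if is_active(i)]
master_nodes = [i for i in instances if i.tags.get('type') == 'master']
slave_nodes = [i for i in instances if i.tags.get('type') == 'slave']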