Example #1
def run_encryptor_instance(aws_svc, encryptor_image_id, snapshot, root_size,
                           guest_image_id, sg_id, update_ami=False):
    bdm = BlockDeviceMapping()
    # Use gp2 for fast burst I/O while copying the root drive.
    guest_unencrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        snapshot_id=snapshot,
        delete_on_termination=True)
    bdm['/dev/sda4'] = guest_unencrypted_root
    if not update_ami:
        log.info('Launching encryptor instance with snapshot %s', snapshot)
        # Creating a new encrypted AMI rather than updating an existing one.
        # Use gp2 for fast burst I/O while copying the root drive.
        guest_encrypted_root = EBSBlockDeviceType(
            volume_type='gp2',
            delete_on_termination=True)
        guest_encrypted_root.size = 2 * root_size + 1
        bdm['/dev/sda5'] = guest_encrypted_root
    else:
        log.info('Launching encryptor instance for updating %s',
                 guest_image_id)
        guest_encrypted_root = EBSBlockDeviceType(
            volume_type='gp2',
            snapshot_id=snapshot,
            delete_on_termination=True)

        guest_encrypted_root.size = root_size
        bdm['/dev/sda5'] = guest_encrypted_root

    instance = aws_svc.run_instance(encryptor_image_id,
                                    security_group_ids=[sg_id],
                                    block_device_map=bdm)
    aws_svc.create_tags(
        instance.id,
        name=NAME_ENCRYPTOR,
        description=DESCRIPTION_ENCRYPTOR % {'image_id': guest_image_id}
    )
    instance = _wait_for_instance(aws_svc, instance.id)
    log.info('Launched encryptor instance %s', instance.id)
    # Tag volumes.
    bdm = instance.block_device_mapping
    if not update_ami:
        aws_svc.create_tags(
            bdm['/dev/sda5'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
    aws_svc.create_tags(
        bdm['/dev/sda2'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
    aws_svc.create_tags(
        bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_GRUB_VOLUME)
    aws_svc.create_tags(
        bdm['/dev/sda3'].volume_id, name=NAME_METAVISOR_LOG_VOLUME)
    return instance
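For reference, `aws_svc.run_instance` above presumably wraps boto's `run_instances`; with a plain boto EC2 connection, an equivalent launch with a custom block device mapping would look roughly like the sketch below (the region, AMI id, and security group id are placeholders):

import boto.ec2
from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType

conn = boto.ec2.connect_to_region('us-east-1')  # placeholder region
bdm = BlockDeviceMapping()
root = EBSBlockDeviceType(volume_type='gp2', delete_on_termination=True)
root.size = 8  # GB
bdm['/dev/sda1'] = root
reservation = conn.run_instances('ami-00000000',  # placeholder AMI id
                                 security_group_ids=['sg-00000000'],  # placeholder
                                 block_device_map=bdm)
instance = reservation.instances[0]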
Example #2
def create_instance():
    """Support function to create a new AWS instance."""
    from boto.ec2.blockdevicemapping import EBSBlockDeviceType, BlockDeviceMapping

    kwargs = dict(
        instance_type=conf.type,
        key_name=conf.key_pair,
        placement=conf.zone,
    )

    if conf.disk_size:
        # We want a larger EBS root volume, so override /dev/sda1.
        dev_root = EBSBlockDeviceType()
        dev_root.size = conf.disk_size
    
        # Create the mapping.
        dev_mapping = BlockDeviceMapping()
        dev_mapping['/dev/sda1'] = dev_root 

        kwargs['block_device_map'] = dev_mapping

    reservation = env.aws.run_instances(
        conf.ami,
        **kwargs)

    instance = env.aws.instance = reservation.instances[0]
    wait_for_status(instance, "Creating server", "running")
    env.aws.create_tags([instance.id], {'Name': env.server.name})

    print " Done. \nInstance built:", instance.public_dns_name

    return instance
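The `wait_for_status` helper called above is defined elsewhere in the original project; a minimal sketch of what such a poller might look like (the message handling and the 10-second interval are assumptions):

import time

def wait_for_status(instance, message, target_state, interval=10):
    # Hypothetical poller: print a message, then block until the
    # instance reports the target state via the EC2 API.
    print message, "..."
    while instance.state != target_state:
        time.sleep(interval)
        instance.update()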
Example #3
    def create_node(self, name, distribution, metadata={}):
        size = self._default_size
        disk_size = 8

        with start_action(
            action_type=u"flocker:provision:aws:create_node",
            name=name,
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata["Name"] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap["/dev/sda1"] = disk1

            images = self._connection.get_all_images(filters={"name": IMAGE_NAMES[distribution]})
            # Retry up to 10 times; no sleep is needed between retries.
            instance = poll_until(
                lambda: self._get_node(images[0].id, size, diskmap, metadata), repeat(0, 10), lambda x: None
            )
            return AWSNode(name=name, _provisioner=self, _instance=instance, distribution=distribution)
Example #4
    def create_node(self, name, distribution, metadata={}):
        size = self._default_size
        disk_size = 10

        with start_action(
                action_type=u"flocker:provision:aws:create_node",
                name=name,
                distribution=distribution,
                image_size=size,
                disk_size=disk_size,
                metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]}, )
            # Retry up to 10 times; no sleep is needed between retries.
            instance = poll_until(
                lambda: self._get_node(images[0].id, size, diskmap, metadata),
                repeat(0, 10), lambda x: None)
            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
Example #5
def create_image(conn):

    reservation = None
    if launch_type == 'on-demand':
        reservation = launch_and_wait(conn, base_instance_type, 1, base_image)

    instance = get_instances_from_reservation(reservation)[0]
    instance_ip = instance.ip_address
    instance_id = instance.id

    log_file = setup_instance(instance_ip)

    boot_disk = EBSBlockDeviceType()
    boot_disk.size = 50
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = boot_disk

    global node_image

    try:
        images = conn.get_all_images(owners=['self'])
        for image in images:
            image.deregister()
        f = open('cloud_configs/' + cloud + '/' + cloud + '_node_image.py',
                 'w')
        f.write("node_image = 'DEREGISTERED!'")
        f.close()
    except Exception:
        # Best effort: ignore failures while deregistering old images.
        pass

    node_image = conn.create_image(instance_id,
                                   'AWS-pwa-node-image',
                                   block_device_mapping=bdm)

    image = conn.get_all_images(image_ids=[node_image])[0]

    f = open(log_file, 'a+')

    while image.state == 'pending':
        sleep(15)
        f.write("Image upload state: " + image.state + '\n')
        image.update()
    f.write("Image upload state: " + image.state + '\n')

    if image.state == 'failed':
        sys.exit("AMI CREATION FAILED!")

    f.write('\n' * 2)
    f.write('#' * 30 + '\n')
    f.write('#' * 30 + '\n\n')
    f.write("node_image = '" + str(node_image) + "'\n\n")
    f.write('#' * 30 + '\n')
    f.write('#' * 30 + '\n')
    f.close()

    f = open('cloud_configs/' + cloud + '/' + cloud + '_node_image.py', 'w')
    f.write("node_image = '" + str(node_image) + "'")
    f.close()
Example #6
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config['ssh_port'])
         port = "-p {0} ".format(p) if p and not p == 22 else ''
          # Use the configured sudo user when ssh_import is set; otherwise default to 'ubuntu'.
          user = self.config['sudouser'] if self.config['sudouser'] and self.config['ssh_import'] else 'ubuntu'
         #XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n".format(
             '-p {0} '.format(port) if port else '',
             user,
             self.instance.public_dns_name)
         msg3 = "To terminate: shaker-terminate {0}".format(
                    self.instance.id)
         LOG.info(msg2)
         LOG.info(msg3)
         print msg2
         print msg3
Example #7
def launch_instance(skip_updates=False):
    '''
    Launch an Oracle database instance.
    '''
    # Assume the keypair name is based on our env.key_filename.
    instance_key_name = os.path.basename(env.key_filename).replace('.pem', '')
    
    # Check that we have a security group configured already.
    security_group_list = ec2_connection.get_all_security_groups()
    security_group_found = False
    for security_group in security_group_list:
        if security_group.name == security_group_name:
            security_group_found = True
            break
    
    # If we didn't find it, create it.
    if not security_group_found:
        create_security_group()    
    
    # We want a larger EBS root volume, so override /dev/sda1.
    # Create an EBS device with 40GB allocated.
    dev_root = EBSBlockDeviceType()
    dev_root.size = 40
    
    # Create the mapping.
    dev_mapping = BlockDeviceMapping()
    dev_mapping['/dev/sda1'] = dev_root 
    
    reservation = ec2_connection.run_instances(ami_id, 
                       instance_type=instance_type, key_name=instance_key_name, 
                       security_groups=[security_group_name], 
                       block_device_map = dev_mapping)
    
    # This is hacky but (mostly) works.
    instance = reservation.instances[0]
    print(green("Launching instance on reservation {}.".format(instance, reservation)))
    
    # Wait for the instance state to change;
    # if it doesn't change to running, then fail.
    print(yellow('Waiting for instance to start...'))
    set_tags = False
    while instance.state == u'pending':
        # Try to set tags.
        if not set_tags:
            try:
                ec2_connection.create_tags([instance.id], {"Name": instance_name})
                set_tags = True
                print(green("Instance {} tagged.".format(instance)))
            except EC2ResponseError:
                print(red("Tagging failed; sleeping, updating instance, and trying again."))
        
        # Check up on its status every so often
        time.sleep(10)
        instance.update()
Example #8
 def launch_instance(self):
     if not self.verify_settings():
         return
     block_map = BlockDeviceMapping()
     root_device = self.config['ec2_root_device']
     block_map[root_device] = EBSBlockDeviceType()
     if self.config['ec2_size']:
         block_map[root_device].size = self.config['ec2_size']
     block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups']
         or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config['ssh_port'])
         port = "-p {0} ".format(p) if p and not p == 22 else ''
         ## change user to 'root' for all non-Ubuntu systems
         user = self.config['sudouser'] if self.config[
             'sudouser'] and self.config['ssh_import'] else 'ubuntu'
         #XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n".format(
             '-p {0} '.format(port) if port else '', user,
             self.instance.public_dns_name)
         msg3 = "To terminate: shaker-terminate {0}".format(
             self.instance.id)
         LOG.info(msg2)
         LOG.info(msg3)
         print msg2
         print msg3
Example #9
 def launch_instance(self):
     if not self.verify_settings():
         return
     block_map = BlockDeviceMapping()
     root_device = self.config["ec2_root_device"]
     block_map[root_device] = EBSBlockDeviceType()
     if self.config["ec2_size"]:
         block_map[root_device].size = self.config["ec2_size"]
     block_map[root_device].delete_on_termination = True
     for num, device_location in enumerate(self.config["ec2_ephemeral_devices"]):
         device = BlockDeviceType()
         device.ephemeral_name = "ephemeral%d" % num
         block_map[device_location] = device
     reservation = self.conn.run_instances(
         self.config["ec2_ami_id"],
         key_name=self.config["ec2_key_name"],
         security_groups=self.config["ec2_security_groups"] or [self.config["ec2_security_group"]],
         instance_type=self.config["ec2_instance_type"],
         placement=self.config["ec2_zone"],
         monitoring_enabled=self.config["ec2_monitoring_enabled"],
         block_device_map=block_map,
         user_data=self.user_data,
     )
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == "running":
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance %s failed after %d seconds" % (self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config["hostname"]:
             self.assign_name_tag()
         msg1 = "Started Instance: {0}\n".format(self.instance.id)
         LOG.info(msg1)
         print msg1
         p = int(self.config["ssh_port"])
         port = "-p {0} ".format(p) if p and not p == 22 else ""
          # Use the configured sudo user when ssh_import is set; otherwise default to 'ubuntu'.
          user = self.config["sudouser"] if self.config["sudouser"] and self.config["ssh_import"] else "ubuntu"
         # XXX - TODO: replace public dns with fqdn, where appropriate
         msg2 = "To access: ssh {0}{1}@{2}\n" "To terminate: shaker-terminate {3}".format(
             port, user, self.instance.public_dns_name, self.instance.id
         )
         LOG.info(msg2)
         print msg2
Example #10
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     opts = {
         'key_name': self.config['ec2_key_name'],
         'security_groups': self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         'instance_type': self.config['ec2_instance_type'],
         'placement': self.config['ec2_zone'],
         'placement_group': self.config['ec2_placement_group'],
         'monitoring_enabled': self.config['ec2_monitoring_enabled'],
         'block_device_map': block_map,
         'user_data': self.user_data
     }
      if self.config.get('ec2_subnet_id', False):
          # When providing subnet_id, we must use security_group_ids rather
          # than named security_groups, or the API call will fail.
          opts.pop('security_groups', None)
          opts['security_group_ids'] = self.config['ec2_security_group_ids'] or [self.config['ec2_security_group_id']]
         if not opts['security_group_ids']:
             raise AssertionError('Must specify ec2_security_group_id or ec2_security_group_ids with subnet_id')
         opts['subnet_id'] = self.config['ec2_subnet_id']
     reservation = self.conn.run_instances(self.config['ec2_ami_id'], **opts)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #11
def get_block_device(instance_type, ebs_vol_size):
    block_map = BlockDeviceMapping()

    if ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = ebs_vol_size
        device.delete_on_termination = True
        block_map['/dev/sdv'] = device

    for i in range(get_num_disks(instance_type)):
        dev = BlockDeviceType()
        dev.ephemeral_name = 'ephemeral%d' % i
        # The first ephemeral drive is /dev/sdb.
        name = '/dev/sd' + string.ascii_letters[i + 1]
        block_map[name] = dev

    return block_map
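A quick illustration of the mapping this helper produces (the instance type and the number of ephemeral disks reported by `get_num_disks` are assumptions for the example):

# Hypothetical usage for an instance type with two ephemeral disks:
block_map = get_block_device('m1.large', 100)
# block_map now contains:
#   /dev/sdv -> 100 GB EBS volume, deleted on termination
#   /dev/sdb -> ephemeral0
#   /dev/sdc -> ephemeral1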
Example #12
def get_block_device(instance_type, ebs_vol_size):
    block_map = BlockDeviceMapping()

    if ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    for i in range(get_num_disks(instance_type)):
        dev = BlockDeviceType()
        dev.ephemeral_name = 'ephemeral%d' % i
        # The first ephemeral drive is /dev/sdb.
        name = '/dev/sd' + string.ascii_letters[i + 1]
        block_map[name] = dev

    return block_map
Example #13
def startInstance(ec2connection, hardwareProfile, ARCH, RHEL, AMI, SSHKEYNAME):
    conn_region = ec2connection
    bdm = BlockDeviceMapping()  # 'bdm' avoids shadowing the builtin map()
    t = EBSBlockDeviceType()
    t.size = 15  # root volume size in GB
    bdm["/dev/sda1"] = t

    if ARCH == "i386" and RHEL == "6.1":
        reservation = conn_region.run_instances(
            AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map
        )
    elif ARCH == "x86_64" and RHEL == "6.1":
        reservation = conn_region.run_instances(
            AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map
        )
    elif ARCH == "i386":
        reservation = conn_region.run_instances(
            AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map
        )
    elif ARCH == "x86_64":
        reservation = conn_region.run_instances(
            AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=map
        )
    else:
        print "arch type is neither i386 or x86_64.. will exit"
        exit(1)

    myinstance = reservation.instances[0]

    time.sleep(5)
    while myinstance.update() != "running":
        time.sleep(5)
        print myinstance.update()

    instanceDetails = myinstance.__dict__
    pprint(instanceDetails)
    # region = instanceDetails['placement']
    # print 'region =' + region
    publicDNS = instanceDetails["public_dns_name"]
    print "public hostname = " + publicDNS
    # check for console output here to make sure ssh is up
    return publicDNS
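The wait loop above spins forever if the instance never reaches 'running'; a bounded variant (the 300-second cap and 5-second interval are assumptions) might look like:

import time

def wait_until_running(instance, timeout=300, interval=5):
    # Hypothetical bounded wait: True once running, False on timeout.
    waited = 0
    while waited < timeout:
        if instance.update() == 'running':
            return True
        time.sleep(interval)
        waited += interval
    return False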
Example #14
def create_server():
    """
    Creates an EC2 instance and saves its state in a local JSON file.
    """
    # looks for an existing 'data.json' file, so that we don't start
    # additional ec2 instances when we don't need them.
    #
    if is_there_state():
        return True
    else:
        conn = connect_to_ec2()

        print(_green("Started..."))
        print(_yellow("...Creating EC2 instance..."))

        # we need a larger boot device to store our cached images
        dev_sda1 = EBSBlockDeviceType()
        dev_sda1.size = 120
        bdm = BlockDeviceMapping()
        bdm['/dev/sda1'] = dev_sda1

        # get an EC2 AMI image object for our chosen AMI
        image = conn.get_all_images(env.ec2_ami)[0]
        # start a new instance
        reservation = image.run(1,
                                1,
                                key_name=env.ec2_key_pair,
                                security_groups=env.ec2_security,
                                block_device_map=bdm,
                                instance_type=env.ec2_instancetype)

        # and get our instance_id
        instance = reservation.instances[0]
        # add a tag to our instance
        conn.create_tags([instance.id], {"Name": env.ec2_instance_name})
        #  and loop and wait until ssh is available
        while instance.state == u'pending':
            print(_yellow("Instance state: %s" % instance.state))
            sleep(10)
            instance.update()
        wait_for_ssh(instance.public_dns_name)

        green("Instance state: %s" % instance.state)
        green("Public dns: %s" % instance.public_dns_name)
        # finally save the details of our new instance into the local state file
        save_state_locally(instance.id)
Example #15
def create_server():
    """
    Creates an EC2 instance and saves its state in a local JSON file.
    """
    # looks for an existing 'data.json' file, so that we don't start
    # additional ec2 instances when we don't need them.
    #
    if is_there_state():
        return True
    else:
        conn = connect_to_ec2()

        print(_green("Started..."))
        print(_yellow("...Creating EC2 instance..."))

        # we need a larger boot device to store our cached images
        dev_sda1 = EBSBlockDeviceType()
        dev_sda1.size = 120
        bdm = BlockDeviceMapping()
        bdm['/dev/sda1'] = dev_sda1

        # get an EC2 AMI image object for our chosen AMI
        image = conn.get_all_images(env.ec2_ami)[0]
        # start a new instance
        reservation = image.run(1, 1,
                                key_name=env.ec2_key_pair,
                                security_groups=env.ec2_security,
                                block_device_map=bdm,
                                instance_type=env.ec2_instancetype)

        # and get our instance_id
        instance = reservation.instances[0]
        # add a tag to our instance
        conn.create_tags([instance.id], {"Name": env.ec2_instance_name})
        #  and loop and wait until ssh is available
        while instance.state == u'pending':
            print(_yellow("Instance state: %s" % instance.state))
            sleep(10)
            instance.update()
        wait_for_ssh(instance.public_dns_name)

        green("Instance state: %s" % instance.state)
        green("Public dns: %s" % instance.public_dns_name)
        # finally save the details of our new instance into the local state file
        save_state_locally(instance.id)
Example #16
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(
         self.config['ec2_ami_id'],
         filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups']
         or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #17
    def startInstance(self, ami, ec2_keyName, sec_group, hwp):
        bdm = BlockDeviceMapping()  # 'bdm' avoids shadowing the builtin map()
        t = EBSBlockDeviceType()
        t.size = 15  # root volume size in GB
        bdm['/dev/sda1'] = t
        reservation = self.connection.run_instances(ami,
             instance_type=hwp, key_name=ec2_keyName,
             security_groups=sec_group, block_device_map=bdm)

        myinstance = reservation.instances[0]

        time.sleep(5)
        while myinstance.update() != 'running':
            time.sleep(5)
            print myinstance.update()

        #pprint(instanceDetails)
        return myinstance
Example #18
 def parse_block_device_args(self, block_device_maps_args):
     block_device_map = BlockDeviceMapping()
     for block_device_map_arg in block_device_maps_args:
         parts = block_device_map_arg.split('=')
         if len(parts) > 1:
             device_name = parts[0]
             block_dev_type = EBSBlockDeviceType()
             value_parts = parts[1].split(':')
             if value_parts[0].startswith('snap'):
                 block_dev_type.snapshot_id = value_parts[0]
             else:
                 if value_parts[0].startswith('ephemeral'):
                     block_dev_type.ephemeral_name = value_parts[0]
             if len(value_parts) > 1:
                 block_dev_type.size = int(value_parts[1])
             if len(value_parts) > 2:
                 if value_parts[2] == 'true':
                     block_dev_type.delete_on_termination = True
             block_device_map[device_name] = block_dev_type
     return block_device_map
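The argument format this parser expects, inferred from the splitting above, is `device=value[:size[:delete]]`; a hypothetical invocation (the device names, snapshot id, and the `instance_launcher` object holding the method are all placeholders):

# Hypothetical invocation based on how the parser splits its input:
args = ['/dev/sdb=snap-12345678:100:true', '/dev/sdc=ephemeral0']
bdm = instance_launcher.parse_block_device_args(args)
# /dev/sdb -> 100 GB volume restored from snap-12345678, deleted on termination
# /dev/sdc -> the first instance-store (ephemeral) disk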
Example #19
def startInstance(ec2connection, hardwareProfile):
    conn_region = ec2connection
    bdm = BlockDeviceMapping()  # 'bdm' avoids shadowing the builtin map()
    t = EBSBlockDeviceType()
    t.size = 15  # root volume size in GB
    bdm['/dev/sda1'] = t

    if ARCH in ('i386', 'x86_64'):
        # All supported arch/RHEL combinations use the same launch call.
        reservation = conn_region.run_instances(AMI, instance_type=hardwareProfile, key_name=SSHKEYNAME, block_device_map=bdm)
    else:
        print "arch type is neither i386 nor x86_64.. will exit"
        exit(1)
        
    myinstance = reservation.instances[0]
    
    time.sleep(5)
    while myinstance.update() != 'running':
        time.sleep(5)
        print myinstance.update()
        
    instanceDetails = myinstance.__dict__
    pprint(instanceDetails)
    #region = instanceDetails['placement']
    #print 'region =' + region
    publicDNS = instanceDetails['public_dns_name']
    print 'public hostname = ' + publicDNS
   
    
    # check for console output here to make sure ssh is up
    return publicDNS
Example #20
 def launch_instance(self):
     if not self.verify_settings():
         return
     is_instance_store = self.conn.get_all_images(self.config['ec2_ami_id'], filters={'root-device-type': 'instance-store'})
     if is_instance_store:
         block_map = None
     else:
         block_map = BlockDeviceMapping()
         root_device = self.config['ec2_root_device']
         block_map[root_device] = EBSBlockDeviceType()
         if self.config['ec2_size']:
             block_map[root_device].size = self.config['ec2_size']
         block_map[root_device].delete_on_termination = True
     reservation = self.conn.run_instances(
         self.config['ec2_ami_id'],
         key_name=self.config['ec2_key_name'],
         security_groups=self.config['ec2_security_groups'] or [self.config['ec2_security_group']],
         instance_type=self.config['ec2_instance_type'],
         placement=self.config['ec2_zone'],
         placement_group=self.config['ec2_placement_group'],
         monitoring_enabled=self.config['ec2_monitoring_enabled'],
         block_device_map=block_map,
         user_data=self.user_data)
     self.instance = reservation.instances[0]
     secs = RUN_INSTANCE_TIMEOUT
     rest_interval = 5
     while secs and not self.instance.state == 'running':
         time.sleep(rest_interval)
         secs = secs - rest_interval
         try:
             self.instance.update()
         except boto.exception.EC2ResponseError:
             pass
     if secs <= 0:
         errmsg = "run instance {0} failed after {1} seconds".format(
             self.instance.id, RUN_INSTANCE_TIMEOUT)
         LOG.error(errmsg)
     else:
         if self.config['hostname']:
             self.assign_name_tag()
Example #21
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.",
              file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.",
              file=stderr)
        sys.exit(1)

    user_data_content = None

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master",
                                     opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves",
                                    opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name),
              file=stderr)
        sys.exit(1)

    # Use the default Ubuntu AMI.
    if opts.ami is None:
        if opts.region == "us-east-1":
            opts.ami = "ami-2d39803a"
        elif opts.region == "us-west-1":
            opts.ami = "ami-06116566"
        elif opts.region == "us-west-2":
            opts.ami = "ami-9abea4fb"
        elif opts.region == "eu-west-1":
            opts.ami = "ami-f95ef58a"
        elif opts.region == "eu-central-1":
            opts.ami = "ami-87564feb"
        elif opts.region == "ap-northeast-1":
            opts.ami = "ami-a21529cc"
        elif opts.region == "ap-northeast-2":
            opts.ami = "ami-09dc1267"
        elif opts.region == "ap-southeast-1":
            opts.ami = "ami-25c00c46"
        elif opts.region == "ap-southeast-2":
            opts.ami = "ami-6c14310f"
        elif opts.region == "ap-south-1":
            opts.ami = "ami-4a90fa25"
        elif opts.region == "sa-east-1":
            opts.ami = "ami-0fb83963"
        else:
            raise Exception("The specified region is unknown.")

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [
            sg.id for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except Exception:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(
                        active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" %
                          (len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running),
                      file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.
                    instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print(
                    "Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                        s=num_slaves_this_zone,
                        plural_s=('' if num_slaves_this_zone == 1 else 's'),
                        z=zone,
                        r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.
            instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)

        master_nodes = master_res.instances
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1))
            for tag in opts.additional_tags.split(','))

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags,
                 Name='{cn}-master-{iid}'.format(cn=cluster_name,
                                                 iid=master.id)))

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags,
                 Name='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                iid=slave.id)))

    # Return all the instances
    return (master_nodes, slave_nodes)
Example #22
def launch_cluster(conn, opts, cluster_name):
    template_vars = {
        'cluster_name': cluster_name,
        'master_security_group': cluster_name + "-master",
        'slave_security_group': cluster_name + "-slaves",
        'discovery_security_group': cluster_name + "-discovery"
    }

    if opts.copy_aws_credentials:
        if opts.deploy_aws_key_id:
            template_vars['aws_key'] = opts.deploy_aws_key_id
        else:
            template_vars['aws_key'] = opts.aws_access_key_id

        if opts.deploy_aws_key_secret:
            template_vars['aws_secret'] = opts.deploy_aws_key_secret
        else:
            template_vars['aws_secret'] = opts.aws_secret_access_key

    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, template_vars['master_security_group'], opts.vpc_id)
    slave_group = get_or_make_group(conn, template_vars['slave_security_group'], opts.vpc_id)
    discovery_group = get_or_make_group(conn, template_vars['discovery_security_group'], opts.vpc_id)
    authorized_address = opts.authorized_address

    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize(src_group=discovery_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=discovery_group)
        master_group.authorize('tcp', 22, 22, authorized_address)

    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize(src_group=discovery_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    if discovery_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            discovery_group.authorize(src_group=master_group)
            discovery_group.authorize(src_group=slave_group)
            discovery_group.authorize(src_group=discovery_group)
        else:
            discovery_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=discovery_group)
            discovery_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=discovery_group)
            discovery_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=discovery_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        all_groups = conn.get_all_security_groups()
        additional_group_ids = []
        for group in opts.additional_security_group.split(','):
            additional_group_ids += [sg.id for sg in all_groups if group in (sg.name, sg.id)]

    template_vars['security_groups'] = template_vars['discovery_security_group']

    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except Exception:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        ebs_devices = []
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device_id = "/dev/sd" + chr(ord('s') + i)
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map[device_id] = device
            # append() the device name; '+=' would splice it in character by character
            ebs_devices.append(device_id)
        template_vars['ebs_devices'] = ' '.join(ebs_devices)

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        local_devices = []
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.ascii_letters[i + 1]
            block_map[name] = dev
            # append() the device name; '+=' would splice it in character by character
            local_devices.append(name)
        template_vars['local_devices'] = ' '.join(local_devices)

    master_user_data_content = get_user_data(opts.master_user_data, template_vars)
    slave_user_data_content = get_user_data(opts.slave_user_data, template_vars)

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id,discovery_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=slave_user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id,discovery_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=slave_user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name

        if opts.spot_price is not None:
            # Launch spot instance with the requested price
            print("Requesting master as spot instance with price $%.3f" % (opts.spot_price))
            master_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    key_name=opts.key_pair,
                    launch_group="master-group-%s" % cluster_name,
                    security_group_ids=[master_group.id,discovery_group.id] + additional_group_ids,
                    instance_type=master_type,
                    placement=opts.zone,
                    count=1,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=master_user_data_content,
                    instance_profile_name=opts.instance_profile_name)
            master_req_id = master_reqs[0].id

            print("Waiting for spot instances to be granted...")
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests()
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    master_instance_ids = []
                    if master_req_id in id_to_req and id_to_req[master_req_id].state == "active":
                        master_instance_ids.append(id_to_req[master_req_id].instance_id)
                        print("Master granted")
                        reservations = conn.get_all_reservations(master_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print("Master not granted yet, waiting longer")
            except:
                print("Canceling spot instance request for master")
                conn.cancel_spot_instance_requests([master_req_id])
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id,discovery_group.id] + additional_group_ids,
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=master_user_data_content,
                instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                instance_profile_name=opts.instance_profile_name)

            master_nodes = master_res.instances
            print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
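
The spot-request polling loop above reappears almost verbatim in the examples that follow. Below is a minimal sketch of the same pattern factored into a reusable helper, assuming the same boto EC2 connection object; the helper name and the timeout are illustrative additions, not part of any of the original scripts.

def wait_for_spot_requests(conn, request_ids, poll_secs=10, timeout_secs=1800):
    # Poll until every request is active, then return the granted instance
    # ids; cancel all outstanding requests and raise if the timeout passes.
    waited = 0
    while waited < timeout_secs:
        time.sleep(poll_secs)
        waited += poll_secs
        reqs = conn.get_all_spot_instance_requests(request_ids=request_ids)
        active = [r.instance_id for r in reqs if r.state == "active"]
        if len(active) == len(request_ids):
            return active
        print("%d of %d spot requests active, waiting longer" % (len(active), len(request_ids)))
    conn.cancel_spot_instance_requests(request_ids)
    raise Exception("Timed out waiting for spot requests")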
Ejemplo n.º 23
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(key='Name',
                       value='{cn}-master-{iid}'.format(cn=cluster_name,
                                                        iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(key='Name',
                      value='{cn}-slave-{iid}'.format(cn=cluster_name,
                                                      iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
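
get_partition() is called throughout these examples but never defined in the excerpts. A plausible sketch consistent with how the call sites use it, splitting the slave count evenly across zones and giving the remainder to the earlier zones; this is an assumption, not the verified original helper.

def get_partition(total, num_partitions, current_partition):
    # Even split, with the first (total % num_partitions) partitions
    # taking one extra item each.
    num_this_partition = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_this_partition += 1
    return num_this_partition

For example, splitting 5 slaves over 3 zones yields 2, 2, and 1 per zone.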
Ejemplo n.º 24
0
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        group_names = [g.name for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" %
                    (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)
    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    slave_res = image.run(key_name=opts.key_pair,
                          security_groups=[slave_group],
                          instance_type=opts.instance_type,
                          placement=opts.zone,
                          min_count=opts.slaves,
                          max_count=opts.slaves,
                          block_device_map=block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=opts.ft,
                           max_count=opts.ft,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    # Launch ZooKeeper nodes if required
    if opts.ft > 1:
        zoo_res = image.run(key_name=opts.key_pair,
                            security_groups=[zoo_group],
                            instance_type=opts.instance_type,
                            placement=opts.zone,
                            min_count=3,
                            max_count=3,
                            block_device_map=block_map)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
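
Likewise, get_or_make_group() is assumed by every example but never shown. A sketch of the two-argument form used here, with an optional vpc_id to cover the VPC variants in later examples; the description string is made up, and this sketch matches groups by name only.

def get_or_make_group(conn, name, vpc_id=None):
    # Return the security group with the given name, creating it if absent.
    groups = [g for g in conn.get_all_security_groups() if g.name == name]
    if groups:
        return groups[0]
    print("Creating security group " + name)
    return conn.create_security_group(name, "Cluster group " + name, vpc_id)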
Ejemplo n.º 25
0
def build(hosts, cred, dry, inventory='hosts'):
    hret = {}
    old_state = {}
    con = None
    for h in hosts:
        logger.info("    Run action on host [%s]" % (h))
        hret[h] = {}
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        if con is None:
            con = _connect(hvars['region'], cred)
        reservations = con.get_all_reservations(filters={"tag:Name": h})
        old_state[h] = "absent"
        for reservation in reservations:
            instance = reservation.instances[0]
            if instance.state != 'terminated':
                hret[h]['instance'] = instance
                old_state[h] = "present"
                logger.info("      Server [%s] is already present" % (h))

        if old_state[h] == 'present':
            continue

        bdm = None
        if 'disk_size' in hvars:
            try:
                dev_sda1 = EBSBlockDeviceType()
                dev_sda1.size = hvars['disk_size']
                dev_sda1.delete_on_termination = True
                bdm = BlockDeviceMapping()
                bdm['/dev/sda1'] = dev_sda1
            except Exception as e:
                logger.error("Error building block device for server: %s" % (e))
                exit(1)

        try:
            reservation = con.run_instances(
                hvars['ami'],
                key_name=hvars['key'],
                instance_type=hvars['vmtype'],
                security_group_ids=[hvars['security']],
                subnet_id=hvars['subnet'],
                block_device_map=bdm,
                dry_run=dry
            )
            hret[h]['instance'] = reservation.instances[0]
        except Exception as e:
            logger.error("Error building server: %s" % (e))
            exit(1)

    for h in hosts:
        hv = vmbuilder.utils.load_host_vars(h, inventory=inventory)
        hvars = hv['VM_PROVIDER']
        instance = hret[h]['instance']
        status = instance.update()
        if old_state[h] == 'absent':
            logger.info("        Waiting for [%s] to be launched..." % (h))
            while status == 'pending':
                time.sleep(5)
                status = instance.update()

        if old_state[h] == 'present':
            logger.info("        State is running with IP [%s]" % (instance.private_ip_address))
        elif status == 'running':
            logger.info("        State changed to running with IP [%s]" % (instance.private_ip_address))
        else:
            logger.error("        Status of [%s] is [%s]" % (h, status))

        instance.add_tag("Name", "%s" % (h))
        for cur_tag in hvars['tags']:
            instance.add_tag(cur_tag, hvars['tags'][cur_tag])

        if 'extra_disks' in hvars and old_state[h] == 'absent':
            try:
                for cur_disk in hvars['extra_disks']:
                    cur_vol = con.create_volume(cur_disk['size'], instance.placement)
                    status = cur_vol.status
                    while status != 'available':
                        logger.info("          Waiting for volume [%s] to be launched..." % (cur_vol))
                        time.sleep(10)
                        status = cur_vol.update()
                    con.attach_volume(cur_vol.id, instance.id, '/dev/' + cur_disk['device'])
            except Exception as e:
                logger.error("Error Attaching new disks: %s" % (e))
                exit(1)

        instance_volumes = con.get_all_volumes(filters={'attachment.instance-id': instance.id})
        for counter, cur_vol in enumerate(instance_volumes):
            cur_vol.add_tag("Name", "%s_disk%d" % (h.split('.')[0], counter))

        hret[h]['private_ip_address'] = instance.private_ip_address
        # If requested, associate a new elastic IP with the host and create a security group to whitelist external IPs
        if 'assosiate_eip' in hvars and hvars['assosiate_eip'] is True:
            if instance.ip_address is None:
                eip = con.allocate_address()
                con.associate_address(instance.id, eip.public_ip)
                logger.info("          Adding public IP [%s]" % (eip.public_ip))
                hret[h]['public_ip_address'] = eip.public_ip
            if 'whitelisted_ips' in hvars:
                logger.info("          Whitelisting IPs [%s]" % (hvars['whitelisted_ips']))
                ips = hvars['whitelisted_ips'].split(',')
                project = hvars['tags']['Project']
                security = hvars['security']
                _create_security_group(con, instance, project, ips, security)
    return hret
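
build() relies on a _connect(region, cred) helper that is not shown. A minimal sketch using boto's documented connect_to_region; the structure of the cred dict is a guess and is labeled as such below.

import boto.ec2

def _connect(region, cred):
    # Hypothetical: assumes cred carries a plain access key pair under
    # these (made-up) keys.
    return boto.ec2.connect_to_region(
        region,
        aws_access_key_id=cred['access_key'],
        aws_secret_access_key=cred['secret_key'])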
Ejemplo n.º 26
0
def launch_cluster(conn, opts, cluster_name):

  conn = AWSConnection(conn, VPCConnection(region=conn.region))

  print "Setting up VPC..."
  vpc = get_or_make_vpc(conn, cluster_name, 'mesos-vpc')
  print "Using vpc: %s" % (vpc.id)


  print "Setting up subnet..."
  subnet = get_or_make_subnet(conn, vpc.id, opts.zone, cluster_name, 'mesos-subnet')
  print "Using subnet: %s" % (subnet.id)

  # Add internet gateway to VPC.
  print "Creating internet gateway"
  ig = get_or_make_ig(conn, vpc.id, cluster_name, 'mesos-vpc')
  print "Using internet gateway: %s" % (ig.id)
  
  # Add route to route table
  rt = get_or_make_rt(conn, vpc.id, cluster_name, 'mesos-rt')
  conn.vpc.create_route(rt.id, '0.0.0.0/0', gateway_id=ig.id)

  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-masters")
  slave_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-slaves")
  zoo_group = get_or_make_group(conn, cluster_name, vpc.id, "mesos-zoo")

  if master_group.rules == []: # Group was just now created
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.ec2.get_all_instances()
  for res in reservations:
    group_names = [g.name for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  
  print "Launching instances..."
  if opts.ami == "latest":
    # Figure out the latest AMI from our static URL
    try:
      opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
    except:
      print >> stderr, "Could not read " + LATEST_AMI_URL

  try:
    image = conn.ec2.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.ec2.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.ec2.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if i in id_to_req and id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.ec2.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = conn.ec2.run_instances(opts.ami,
                          key_name = opts.key_pair,
                          subnet_id = subnet.id,
                          security_group_ids = [slave_group.id],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(subnet_id=subnet.id,
                                                                    groups=[master_group.id],
                                                                    associate_public_ip_address=True)
  interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
  master_res = conn.ec2.run_instances(opts.ami,
                          key_name = opts.key_pair,
                         instance_type = master_type,
                         placement = opts.zone,
                         network_interfaces = interfaces,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if opts.ft > 1:
    zoo_res = conn.ec2.run_instances(opts.ami,
                        key_name = opts.key_pair,
                        subnet_id = subnet.id,
                        security_group_ids = [zoo_group.id],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
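
The running-cluster checks in Examples 24 and 26 filter instances through an is_active() predicate that the excerpts do not define. A sketch of the obvious reading, treating every state short of shutdown as active; this is an assumption about the original helper.

def is_active(instance):
    # Anything not on its way out counts as "already running" for the
    # purposes of the duplicate-cluster check.
    return instance.state not in ['shutting-down', 'terminated']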
Ejemplo n.º 27
0
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  master_group.owner_id = os.getenv('EC2_USER_ID')
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  slave_group.owner_id = os.getenv('EC2_USER_ID')
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  zoo_group.owner_id = os.getenv('EC2_USER_ID')
  
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
    master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama
    if opts.ganglia:
      master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI
  
  if zoo_group.rules == []: # Group was just now created
      zoo_group.authorize(src_group=master_group)
      zoo_group.authorize(src_group=slave_group)
      zoo_group.authorize(src_group=zoo_group)
      zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')   
  # Check if instances are already running in our groups
  existing_masters, existing_slaves, existing_zoos = get_existing_cluster(conn, opts, cluster_name,
                                                           die_on_error=False)
  if existing_slaves or (existing_masters and not opts.use_existing_master):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.emi])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi
    sys.exit(1)
    
  try:
    image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi_master
    sys.exit(1)
  
  if (opts.emi_zoo != ""):  
      try:
        image_zoo = conn.get_all_images(image_ids=[opts.emi_zoo])[0]
      except:
        print >> stderr, "Could not find emi " + opts.emi_zoo
        sys.exit(1)     
  # Create block device mapping so that we can add an EBS volume if asked to
  logging.debug( "Calling boto BlockDeviceMapping()...")
  block_map = BlockDeviceMapping()
  logging.debug(" Printing block_map..") 
  #print block_map
  if opts.ebs_vol_size > 0:
    logging.debug("Calling boto EBSBlockDeviceType()...")
    device = EBSBlockDeviceType()
    #print "device: ", device
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    device.ephemeral_name = "ephemeral0"
    #block_map["/dev/sdv"] = device
    #block_map["/dev/sdv"] = device
    block_map["/dev/vdb"] = device
    
  if opts.user_data_file is not None:
      with open(opts.user_data_file) as user_data_file:
          opts.user_data = user_data_file.read()
          #print "user data (encoded) = ", opts.user_data
  
  # Launch non-spot instances
  zones = get_zones(conn, opts)    
  num_zones = len(zones)
  i = 0
  slave_nodes = []
  for zone in zones:
    num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
    if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map,
                              user_data = opts.user_data)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
    i += 1  

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:  
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image_master.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = 1,
                           max_count = 1,
                           block_device_map = block_map,
                           user_data = opts.user_data)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)
    
  # Launch ZooKeeper nodes if required
  if int(opts.ft) > 1:
    print "Running " + opts.ft + " zookeepers"
    zoo_res = image_zoo.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map,
                        user_data = opts.user_data)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
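
get_zones() governs how slaves spread across availability zones in most of these examples. A sketch consistent with the call sites and with the opts.zone == 'all' handling seen in the master-launch branches; assumed behavior, not the verified original.

def get_zones(conn, opts):
    # 'all' means one launch round per availability zone in the region;
    # otherwise everything goes into the single requested zone.
    if opts.zone == 'all':
        return [z.name for z in conn.get_all_zones()]
    return [opts.zone]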
Ejemplo n.º 28
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    else:
        master_group = get_or_make_group(conn, opts.security_group_prefix + "-master")
        slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running with the cluster name
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                outstanding_request_ids = []
                for i in my_req_ids:
                    if i in id_to_req:
                        if id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                        else:
                            outstanding_request_ids.append(i)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer for request ids including %s" % (
                        len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10])
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    # TODO: Add retry logic for tagging with name since it's used to identify a cluster.
    for master in master_nodes:
        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
        for i in range(0, 5):
            try:
                master.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    for slave in slave_nodes:
        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
        for i in range(0, 5):
            try:
                slave.add_tag(key='Name', value=name)
                break
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    # Return all the instances
    return (master_nodes, slave_nodes)
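
The two tagging loops above duplicate the same retry logic. A sketch of a shared helper with equivalent behavior; the helper name, attempt count, and delay are illustrative.

def add_name_tag_with_retry(instance, name, attempts=5, delay_secs=5):
    # Retry add_tag a fixed number of times, since freshly launched
    # instances are sometimes not yet visible to the tagging API.
    for attempt in range(1, attempts + 1):
        try:
            instance.add_tag(key='Name', value=name)
            return
        except Exception:
            print("Failed attempt %i of %i to tag %s" % (attempt, attempts, name))
            if attempt == attempts:
                raise
            time.sleep(delay_secs)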
Ejemplo n.º 29
0
def launch_cluster(conn, opts, num_nodes, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    print("Setting up security groups...")

    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)

    # Check if instances are already running in our groups
    existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves:
        print("ERROR: There are already instances running in group %s" %
              slave_group.name, file=stderr)
        sys.exit(1)

    if opts.ami is None:
        print("ERROR: AMI is not set, exit")
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (num_nodes, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == num_nodes:
                    print("All %d spot instances granted" % (num_nodes + 1))
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slave spot instances granted, waiting longer" % (
                            len(active_instance_ids), num_nodes))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            slave_nodes = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        print ("WARNING: --spot-price was not set; consider launch slaves as spot instances to save money")
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(num_nodes, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1


    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return slave_nodes
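
The --additional-tags option parsed above expects a comma-separated list of key:value pairs. A quick illustration of the parsing, with made-up values:

# Hypothetical input: --additional-tags "env:prod, owner:data-team"
additional_tags_opt = "env:prod, owner:data-team"
parsed = dict(
    map(str.strip, tag.split(':', 1)) for tag in additional_tags_opt.split(',')
)
# parsed == {'env': 'prod', 'owner': 'data-team'}; the per-instance Name tag
# is then merged in via dict(parsed, Name=...).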
Ejemplo n.º 30
0
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, "strata-master")
    slave_group = get_or_make_group(conn, "strata-slaves")
    zoo_group = get_or_make_group(conn, "strata-zoo")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        if opts.cluster_type == "mesos":
            master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 80, 80, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    active_nodes = get_existing_cluster(conn,
                                        opts,
                                        cluster_name,
                                        die_on_error=False)
    if any(active_nodes):
        print >> stderr, (
            "ERROR: There are already instances running in " +
            "group %s, %s or %s" %
            (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)

    # Figure out the latest AMI from our static URL
    if opts.ami == "latest":
        try:
            opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
            print "Latest Spark AMI: " + opts.ami
        except:
            print >> stderr, "Could not read " + LATEST_AMI_URL
            sys.exit(1)

    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes,
             zoo_nodes) = get_existing_cluster(conn,
                                               opts,
                                               cluster_name,
                                               die_on_error=False)
            running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    if opts.zone == 'all':
        opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Create the right tags
    tags = {}
    tags['cluster'] = cluster_name

    tags['type'] = 'slave'
    for node in slave_nodes:
        conn.create_tags([node.id], tags)

    tags['type'] = 'master'
    for node in master_nodes:
        conn.create_tags([node.id], tags)

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
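
The helpers get_zones and get_partition used above are not part of this listing. Assuming get_zones returns the availability zones to spread slaves over (every zone when opts.zone is 'all', otherwise just the requested one) and get_partition splits opts.slaves as evenly as possible across them, a minimal sketch could look like this:

def get_zones(conn, opts):
    # Spread instances across all zones when asked; otherwise use the
    # single zone given on the command line.
    if opts.zone == 'all':
        return [z.name for z in conn.get_all_zones()]
    return [opts.zone]

def get_partition(total, num_partitions, current_partition):
    # Give each partition total // num_partitions items and hand the
    # remainder out one-by-one to the earliest partitions.
    num_this_partition = total // num_partitions
    if (total % num_partitions) - current_partition > 0:
        num_this_partition += 1
    return num_this_partition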
Example No. 31
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    if opts.vpc_id is None:
        print "Setting up EC2-Classic security groups..."
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
        if master_group.rules == []:  # Group was just now created
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
            master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
            master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
            master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
            master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
            master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
            if opts.ganglia:
                master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
        if slave_group.rules == []:  # Group was just now created
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
            slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
            slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
            slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
            slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    else:
        print "Setting up VPC security groups..."
        master_group = get_or_make_group(conn, cluster_name + "-master", vpc_id=opts.vpc_id)
        slave_group = get_or_make_group(conn, cluster_name + "-slaves", vpc_id=opts.vpc_id)
        if master_group.rules == []:  # Group was just now created
            master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group)
            master_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group)
            master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
            master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
            master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
            master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
            master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
            if opts.ganglia:
                master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
        if slave_group.rules == []:  # Group was just now created
            slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=master_group)
            slave_group.authorize(ip_protocol="tcp", from_port=1, to_port=65535, src_group=slave_group)
            slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
            slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
            slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
            slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
            slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
            slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)
        )
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Hack to set the VPC private hostname; fix this properly later.

    user_data = """#!/bin/bash
    hostname $(curl http://169.254.169.254/latest/meta-data/local-hostname)
    """

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" % (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if opts.vpc_id is None:
                slave_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    launch_group="launch-group-%s" % cluster_name,
                    placement=zone,
                    count=num_slaves_this_zone,
                    key_name=opts.key_pair,
                    security_groups=[slave_group],
                    instance_type=opts.instance_type,
                    block_device_map=block_map,
                    placement_group=opts.placement_group,
                )

            else:
                interface = ec2.networkinterface.NetworkInterfaceSpecification(
                    device_index=0, subnet_id=opts.subnet_id, groups=[slave_group.id], associate_public_ip_address=True
                )

                interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)

                slave_reqs = conn.request_spot_instances(
                    price=opts.spot_price,
                    image_id=opts.ami,
                    launch_group="launch-group-%s" % cluster_name,
                    count=num_slaves_this_zone,
                    key_name=opts.key_pair,
                    instance_type=opts.instance_type,
                    block_device_map=block_map,
                    network_interfaces=interfaces,
                    user_data=user_data,
                    placement_group=opts.placement_group,
                )
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                if opts.vpc_id is None:
                    slave_res = image.run(
                        key_name=opts.key_pair,
                        security_groups=[slave_group],
                        instance_type=opts.instance_type,
                        placement=zone,
                        min_count=num_slaves_this_zone,
                        max_count=num_slaves_this_zone,
                        block_device_map=block_map,
                        placement_group=opts.placement_group,
                    )

                else:
                    interface = ec2.networkinterface.NetworkInterfaceSpecification(
                        device_index=0,
                        subnet_id=opts.subnet_id,
                        groups=[slave_group.id],
                        associate_public_ip_address=True,
                    )
                    interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)

                    slave_res = conn.run_instances(
                        image_id=opts.ami,
                        key_name=opts.key_pair,
                        instance_type=opts.instance_type,
                        min_count=num_slaves_this_zone,
                        max_count=num_slaves_this_zone,
                        block_device_map=block_map,
                        network_interfaces=interfaces,
                        user_data=user_data,
                        placement_group=opts.placement_group,
                    )

                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == "all":
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.vpc_id is None:
            master_res = image.run(
                key_name=opts.key_pair,
                security_groups=[master_group],
                instance_type=master_type,
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
            )

        else:
            interface = ec2.networkinterface.NetworkInterfaceSpecification(
                device_index=0, subnet_id=opts.subnet_id, groups=[master_group.id], associate_public_ip_address=True
            )
            interfaces = ec2.networkinterface.NetworkInterfaceCollection(interface)

            master_res = conn.run_instances(
                image_id=opts.ami,
                key_name=opts.key_pair,
                instance_type=master_type,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                network_interfaces=interfaces,
                user_data=user_data,
            )

        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (opts.zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(key="Name", value="spark-{cn}-master-{iid}".format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(key="Name", value="spark-{cn}-slave-{iid}".format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
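
get_or_make_group, which every launch_cluster variant here calls, is also not shown. A sketch under the assumption that it looks up a security group by name and creates it when missing (with the optional vpc_id matching the calls above):

def get_or_make_group(conn, name, vpc_id=None):
    # Return the existing security group with this name, creating it
    # (optionally inside a VPC) if it does not exist yet.
    matches = [g for g in conn.get_all_security_groups() if g.name == name]
    if matches:
        return matches[0]
    print "Creating security group " + name
    return conn.create_security_group(name, "Group for " + name, vpc_id)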
Example No. 32
0
def _create_server_ec2(connection,
                       region,
                       disk_name,
                       disk_size,
                       ami,
                       key_pair,
                       instance_type,
                       tags={},
                       security_groups=None,
                       delete_on_termination=True,
                       log=False,
                       wait_for_ssh_available=True):
    """
    Creates EC2 Instance
    """

    if log:
        log_green("Started...")
        log_yellow("...Creating EC2 instance...")

    ebs_volume = EBSBlockDeviceType()
    ebs_volume.size = disk_size
    bdm = BlockDeviceMapping()
    bdm[disk_name] = ebs_volume

    # get an EC2 AMI image object for the chosen AMI
    image = connection.get_all_images(ami)[0]
    # start a new instance
    reservation = image.run(1,
                            1,
                            key_name=key_pair,
                            security_groups=security_groups,
                            block_device_map=bdm,
                            instance_type=instance_type)

    # and get our instance_id
    instance = reservation.instances[0]

    # loop until the instance leaves the 'pending' state
    while instance.state == u'pending':
        if log:
            log_yellow("Instance state: %s" % instance.state)
        sleep(10)
        instance.update()
    if log:
        log_green("Instance state: %s" % instance.state)
    if wait_for_ssh_available:
        wait_for_ssh(instance.public_dns_name)

    # update the EBS volumes to be deleted on instance termination
    if delete_on_termination:
        for dev in instance.block_device_mapping:
            # "<device>=1" flags that device's volume for deletion when
            # the instance terminates.
            instance.modify_attribute('BlockDeviceMapping',
                                      ["%s=%d" % (dev, 1)])

    # add a tag to our instance
    if tags:
        connection.create_tags([instance.id], tags)

    if log:
        log_green("Public dns: %s" % instance.public_dns_name)

    # returns our new instance
    return instance
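
wait_for_ssh is assumed to block until sshd answers on the instance's public DNS name; a self-contained sketch using a plain TCP probe:

import socket
import time

def wait_for_ssh(host, port=22, timeout=300):
    # Keep attempting a TCP connection to the SSH port until it succeeds
    # or the overall timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            socket.create_connection((host, port), 5).close()
            return
        except socket.error:
            time.sleep(5)
    raise RuntimeError("SSH not reachable on %s:%d" % (host, port))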
Example No. 33
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    slave_group.owner_id = os.getenv('EC2_USER_ID')
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    zoo_group.owner_id = os.getenv('EC2_USER_ID')

    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves, existing_zoos = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " +
            "group %s, %s or %s" %
            (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    logging.debug("Calling boto BlockDeviceMapping()...")
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        logging.debug("Calling boto EBSBlockDeviceType()...")
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        device.ephemeral_name = "ephemeral0"
        block_map["/dev/vdb"] = device

    if opts.user_data_file is not None:
        with open(opts.user_data_file) as user_data_file:
            opts.user_data = user_data_file.read()

    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
        num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
        if num_slaves_this_zone > 0:
            slave_res = image.run(key_name=opts.key_pair,
                                  security_groups=[slave_group],
                                  instance_type=opts.instance_type,
                                  placement=zone,
                                  min_count=num_slaves_this_zone,
                                  max_count=num_slaves_this_zone,
                                  block_device_map=block_map,
                                  user_data=opts.user_data)
            slave_nodes += slave_res.instances
            print "Launched %d slaves in %s, regid = %s" % (
                num_slaves_this_zone, zone, slave_res.id)
        i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=opts.user_data)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Launch ZooKeeper nodes if required
    if int(opts.ft) > 1:
        print "Running " + opts.ft + " zookeepers"
        zoo_res = image.run(key_name=opts.key_pair,
                            security_groups=[zoo_group],
                            instance_type=opts.instance_type,
                            placement=opts.zone,
                            min_count=3,
                            max_count=3,
                            block_device_map=block_map,
                            user_data=opts.user_data)
        zoo_nodes = zoo_res.instances
        print "Launched zoo, regid = " + zoo_res.id
    else:
        zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
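
get_ami, called near the top of this example, is not included either. Example No. 35 below resolves AMI ids by fetching a published list over HTTP; assuming get_ami does the same per region (the index URL and the opts.region attribute are hypothetical here), a sketch:

import urllib2

AMI_LIST_URL = "https://example.com/ami-list"  # hypothetical index location

def get_ami(opts):
    # Fetch the AMI id published for the requested region.
    return urllib2.urlopen("%s/%s" % (AMI_LIST_URL, opts.region)).read().strip()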
Example No. 34
0
    def create_nodes(self, reactor, names, distribution, metadata={}):
        """
        Create nodes with the given names.

        :param reactor: The reactor.
        :param names: The names of the nodes.
        :type names: list of str
        :param str distribution: The name of the distribution to
            install on the nodes.
        :param dict metadata: Metadata to associate with the nodes.

        :return: A list of ``Deferred``s, each firing with an INode
            when the corresponding node is created. The list has
            the same order as ``names``.
        """
        size = self._default_size
        disk_size = 8

        action = start_action(
            action_type=u"flocker:provision:aws:create_nodes",
            instance_count=len(names),
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        )
        with action.context():
            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]}, )

            instances = self._run_nodes(count=len(names),
                                        image_id=images[0].id,
                                        size=size,
                                        diskmap=diskmap)

            def make_node(ignored, name, instance):
                return AWSNode(
                    name=name,
                    _provisioner=self,
                    _instance=instance,
                    distribution=distribution,
                )

            results = []
            for name, instance in izip_longest(names, instances):
                if instance is None:
                    results.append(fail(Exception("Could not run instance")))
                else:
                    node_metadata = metadata.copy()
                    node_metadata['Name'] = name
                    d = self._async_get_node(reactor, instance, node_metadata)
                    d = DeferredContext(d)
                    d.addCallback(make_node, name, instance)
                    results.append(d.result)
            action_completion = DeferredContext(DeferredList(results))
            action_completion.addActionFinish()
            # Individual results and errors should be consumed by the caller,
            # so we can leave action_completion alone now.
            return results
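
One detail worth noting in the loop above: izip_longest pads the shorter sequence with None, which is exactly how a shortfall of launched instances is detected. For example:

from itertools import izip_longest  # itertools.zip_longest on Python 3

names = ['node-a', 'node-b', 'node-c']
instances = ['i-1', 'i-2']  # one instance failed to launch
print list(izip_longest(names, instances))
# [('node-a', 'i-1'), ('node-b', 'i-2'), ('node-c', None)]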
Example No. 35
0
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    # master_group = get_or_make_group(conn, cluster_name)
    # slave_group = get_or_make_group(conn, cluster_name)
    # zoo_group = get_or_make_group(conn, cluster_name)

    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=zoo_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=zoo_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    if zoo_group.rules == []:  # Group was just now created
        zoo_group.authorize(src_group=master_group)
        zoo_group.authorize(src_group=slave_group)
        zoo_group.authorize(src_group=zoo_group)
        zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

    # Check if instances are already running in our groups
    print "Checking for running cluster..."
    reservations = conn.get_all_instances()
    for res in reservations:
        group_names = [g.id for g in res.groups]
        if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
            active = [i for i in res.instances if is_active(i)]
            if len(active) > 0:
                print >> stderr, (
                    "ERROR: There are already instances running in " +
                    "group %s, %s or %s" %
                    (master_group.name, slave_group.name, zoo_group.name))
                sys.exit(1)

    if opts.ami == "std":
        try:
            opts.ami = urllib2.urlopen(STD_AMI_URL).read().strip()
            print "GraphLab AMI for Standard Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + STD_AMI_URL
    elif opts.ami == "hpc":
        try:
            opts.ami = urllib2.urlopen(HVM_AMI_URL).read().strip()
            print "GraphLab AMI for HPC Instances: " + opts.ami
        except:
            print >> stderr, "Could not read " + HVM_AMI_URL

    print "Launching instances..."
    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        slave_reqs = conn.request_spot_instances(
            price=opts.spot_price,
            image_id=opts.ami,
            launch_group="launch-group-%s" % cluster_name,
            placement=opts.zone,
            count=opts.slaves,
            key_name=opts.key_pair,
            security_groups=[slave_group],
            instance_type=opts.instance_type,
            block_device_map=block_map)
        my_req_ids = [req.id for req in slave_reqs]
        print "Waiting for spot instances to be granted..."
        while True:
            time.sleep(10)
            reqs = conn.get_all_spot_instance_requests()
            id_to_req = {}
            for r in reqs:
                id_to_req[r.id] = r
            active = 0
            instance_ids = []
            for i in my_req_ids:
                if id_to_req[i].state == "active":
                    active += 1
                    instance_ids.append(id_to_req[i].instance_id)
            if active == opts.slaves:
                print "All %d slaves granted" % opts.slaves
                reservations = conn.get_all_instances(instance_ids)
                slave_nodes = []
                for r in reservations:
                    slave_nodes += r.instances
                break
            else:
                print "%d of %d slaves granted, waiting longer" % (active,
                                                                   opts.slaves)
    else:
        # Launch non-spot instances
        slave_res = image.run(key_name=opts.key_pair,
                              security_groups=[slave_group],
                              instance_type=opts.instance_type,
                              placement=opts.zone,
                              min_count=opts.slaves,
                              max_count=opts.slaves,
                              block_device_map=block_map)
        slave_nodes = slave_res.instances
        print "Launched slaves, regid = " + slave_res.id

    # Launch masters
    master_type = opts.master_instance_type
    if master_type == "":
        master_type = opts.instance_type
    master_res = image.run(key_name=opts.key_pair,
                           security_groups=[master_group],
                           instance_type=master_type,
                           placement=opts.zone,
                           min_count=1,
                           max_count=1,
                           block_device_map=block_map)
    master_nodes = master_res.instances
    print "Launched master, regid = " + master_res.id

    zoo_nodes = []

    # Return all the instances
    return (master_nodes, slave_nodes, zoo_nodes)
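
is_active, used when scanning the reservations above, is assumed to treat any instance that has not begun shutting down as still occupying the group; a sketch:

def is_active(instance):
    # Anything not on its way out still holds the security group.
    return instance.state in ('pending', 'running', 'stopping', 'stopped')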
Example No. 36
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" % (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [sg
                             for sg in conn.get_all_security_groups()
                             if opts.additional_security_group in (sg.name, sg.id)]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print ("Requesting %d slaves as spot instances with price $%.3f" %
               (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] + additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                                zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] + additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map,
                               user_data=user_data_content)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    for master in master_nodes:
        master.add_tag(
            key='Name',
            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
    for slave in slave_nodes:
        slave.add_tag(
            key='Name',
            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))

    # Return all the instances
    return (master_nodes, slave_nodes)
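
The M3 workaround above relies on get_num_disks, which is not shown. Assuming it maps instance types to their ephemeral disk counts through a lookup table, a sketch (this table covers only the m3 family; a real one would list every type):

# Hypothetical subset of the instance-type -> ephemeral-disk table.
EPHEMERAL_DISKS = {
    'm3.medium': 1,
    'm3.large': 1,
    'm3.xlarge': 2,
    'm3.2xlarge': 2,
}

def get_num_disks(instance_type):
    # Default to a single ephemeral disk for unknown types.
    return EPHEMERAL_DISKS.get(instance_type, 1)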
Example No. 37
0
def node_install(cn=def_cn, inst_type_idx=def_inst_type, idn=0,
        avz=def_default_avz, rt=def_default_requesttype,
        group_name='oggmssh',
        ssh_port=22,
        cidr='0.0.0.0/0'):
    """
    Request and prepare single instance
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name()
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(group_name, 'A group that allows SSH access')
        else:
            raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node "+str(idn))
    print('Reserving instance for node', aminfo.id, instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        print("placing node in ",avz)
        requests = cloud.request_spot_instances(def_price,
                      def_ami[avz[:-1]],
                      count=1,
                      type='one-time',
                      security_groups=[group_name],
                      key_name=key_name,
                      placement=avz,
                      instance_type=instance_infos[inst_type_idx]['type'],
                      block_device_map=bdm)
        req_ids = [request.id for request in requests]
        instance_ids = wait_for_fulfillment(cloud,req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node "+str(idn))
    else:
        print("placing node in ",avz)
        reservation = cloud.run_instances(image_id=def_ami[avz[:-1]],
                key_name=key_name,
                placement = avz,
                security_groups=[group_name],
                instance_type=instance_infos[inst_type_idx]['type'],
                block_device_map= bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node "+str(idn))

    time.sleep(2)
    while not node.update() == 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)

    log_with_ts("booted node "+str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn+'_node'+str(idn))
    node.add_tag('type', cn+'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', { '/dev/sda1' : True })

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node "+str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node "+str(idn))
Example No. 38
0
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print("ERROR: Must provide an identity file (-i) for ssh connections.", file=stderr)
        sys.exit(1)

    if opts.key_pair is None:
        print("ERROR: Must provide a key pair name (-k) to use on instances.", file=stderr)
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print("Setting up security groups...")
    master_group = get_or_make_group(conn, cluster_name + "-master", opts.vpc_id)
    slave_group = get_or_make_group(conn, cluster_name + "-slaves", opts.vpc_id)
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            master_group.authorize(src_group=master_group)
            master_group.authorize(src_group=slave_group)
        else:
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=master_group)
            master_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                   src_group=slave_group)
            master_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                   src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        # Rstudio (GUI for R) needs port 8787 for web access
        master_group.authorize('tcp', 8787, 8787, authorized_address)
        # HDFS NFS gateway requires 111,2049,4242 for tcp & udp
        master_group.authorize('tcp', 111, 111, authorized_address)
        master_group.authorize('udp', 111, 111, authorized_address)
        master_group.authorize('tcp', 2049, 2049, authorized_address)
        master_group.authorize('udp', 2049, 2049, authorized_address)
        master_group.authorize('tcp', 4242, 4242, authorized_address)
        master_group.authorize('udp', 4242, 4242, authorized_address)
        # RM in YARN mode uses 8088
        master_group.authorize('tcp', 8088, 8088, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        if opts.vpc_id is None:
            slave_group.authorize(src_group=master_group)
            slave_group.authorize(src_group=slave_group)
        else:
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=master_group)
            slave_group.authorize(ip_protocol='icmp', from_port=-1, to_port=-1,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='tcp', from_port=0, to_port=65535,
                                  src_group=slave_group)
            slave_group.authorize(ip_protocol='udp', from_port=0, to_port=65535,
                                  src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)
        # Kylix: open ports 50050-50060 (tcp/udp) between masters and slaves
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        slave_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=master_group)
        slave_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=master_group)
        master_group.authorize(ip_protocol='tcp', from_port=50050, to_port=50060,
                               src_group=slave_group)
        master_group.authorize(ip_protocol='udp', from_port=50050, to_port=50060,
                               src_group=slave_group)


    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                             die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print("ERROR: There are already instances running in group %s or %s" %
              (master_group.name, slave_group.name), file=stderr)
        sys.exit(1)

    # we use group ids to work around https://github.com/boto/boto/issues/350
    additional_group_ids = []
    if opts.additional_security_group:
        additional_group_ids = [sg.id
                                for sg in conn.get_all_security_groups()
                                if opts.additional_security_group in (sg.name, sg.id)]
    print("Launching instances...")

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print("Could not find AMI " + opts.ami, file=stderr)
        sys.exit(1)

    # Create block device mapping so that we can add EBS volumes if asked to.
    # The first drive is attached as /dev/sds, 2nd as /dev/sdt, ... /dev/sdz
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        for i in range(opts.ebs_vol_num):
            device = EBSBlockDeviceType()
            device.size = opts.ebs_vol_size
            device.volume_type = opts.ebs_vol_type
            device.delete_on_termination = True
            block_map["/dev/sd" + chr(ord('s') + i)] = device

    # AWS ignores the AMI-specified block device mapping for M3 (see SPARK-3342).
    if opts.instance_type.startswith('m3.'):
        for i in range(get_num_disks(opts.instance_type)):
            dev = BlockDeviceType()
            dev.ephemeral_name = 'ephemeral%d' % i
            # The first ephemeral drive is /dev/sdb.
            name = '/dev/sd' + string.letters[i + 1]
            block_map[name] = dev

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_group_ids=[slave_group.id] + additional_group_ids,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                subnet_id=opts.subnet_id,
                placement_group=opts.placement_group,
                user_data=user_data_content,
                instance_profile_name=opts.instance_profile_name)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print("Waiting for spot instances to be granted...")
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print("All %d slaves granted" % opts.slaves)
                    reservations = conn.get_all_reservations(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print("%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves))
        except:
            print("Canceling spot instance requests")
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(
                conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print(("WARNING: %d instances are still running" % running), file=stderr)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id] + additional_group_ids,
                    instance_type=opts.instance_type,
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    subnet_id=opts.subnet_id,
                    placement_group=opts.placement_group,
                    user_data=user_data_content,
                    instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
                    instance_profile_name=opts.instance_profile_name)
                slave_nodes += slave_res.instances
                print("Launched {s} slave{plural_s} in {z}, regid = {r}".format(
                      s=num_slaves_this_zone,
                      plural_s=('' if num_slaves_this_zone == 1 else 's'),
                      z=zone,
                      r=slave_res.id))
            i += 1

    # Launch or resume masters
    if existing_masters:
        print("Starting master...")
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
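            # 'all' spreads slaves across every zone, but the master still needs
            # one concrete zone, so pick one at random.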
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(
            key_name=opts.key_pair,
            security_group_ids=[master_group.id] + additional_group_ids,
            instance_type=master_type,
            placement=opts.zone,
            min_count=1,
            max_count=1,
            block_device_map=block_map,
            subnet_id=opts.subnet_id,
            placement_group=opts.placement_group,
            user_data=user_data_content,
            instance_initiated_shutdown_behavior=opts.instance_initiated_shutdown_behavior,
            instance_profile_name=opts.instance_profile_name)

        master_nodes = master_res.instances
        print("Launched master in %s, regid = %s" % (zone, master_res.id))

    # This wait time corresponds to SPARK-4983
    print("Waiting for AWS to propagate instance metadata...")
    time.sleep(15)

    # Give the instances descriptive names and set additional tags
    additional_tags = {}
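    # opts.additional_tags is expected in "key:value,key2:value2" form:
    # split on commas, then on the first colon of each pair.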
    if opts.additional_tags.strip():
        additional_tags = dict(
            map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',')
        )

    for master in master_nodes:
        master.add_tags(
            dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
        )

    for slave in slave_nodes:
        slave.add_tags(
            dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
        )

    # Return all the instances
    return (master_nodes, slave_nodes)
Ejemplo n.º 39
def launch_cluster(conn, opts, cluster_name):
    if opts.identity_file is None:
        print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
        sys.exit(1)
    if opts.key_pair is None:
        print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
        sys.exit(1)

    user_data_content = None
    if opts.user_data:
        with open(opts.user_data) as user_data_file:
            user_data_content = user_data_file.read()

    print "Setting up security groups..."
    if opts.security_group_prefix is None:
        master_group = get_or_make_group(conn, cluster_name + "-master")
        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    else:
        master_group = get_or_make_group(
            conn, opts.security_group_prefix + "-master")
        slave_group = get_or_make_group(conn,
                                        opts.security_group_prefix + "-slaves")
    authorized_address = opts.authorized_address
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, authorized_address)
        master_group.authorize('tcp', 8080, 8081, authorized_address)
        master_group.authorize('tcp', 18080, 18080, authorized_address)
        master_group.authorize('tcp', 19999, 19999, authorized_address)
        master_group.authorize('tcp', 50030, 50030, authorized_address)
        master_group.authorize('tcp', 50070, 50070, authorized_address)
        master_group.authorize('tcp', 60070, 60070, authorized_address)
        master_group.authorize('tcp', 4040, 4045, authorized_address)
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, authorized_address)
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, authorized_address)
        slave_group.authorize('tcp', 8080, 8081, authorized_address)
        slave_group.authorize('tcp', 50060, 50060, authorized_address)
        slave_group.authorize('tcp', 50075, 50075, authorized_address)
        slave_group.authorize('tcp', 60060, 60060, authorized_address)
        slave_group.authorize('tcp', 60075, 60075, authorized_address)

    # Check if instances are already running with the cluster name
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances for name: %s " %
                          cluster_name)
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)

    additional_groups = []
    if opts.additional_security_group:
        additional_groups = [
            sg for sg in conn.get_all_security_groups()
            if opts.additional_security_group in (sg.name, sg.id)
        ]
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
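            # get_partition (not shown in this listing) is assumed to deal
            # opts.slaves out across num_zones, giving earlier zones the remainder.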
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group] + additional_groups,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data_content)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                outstanding_request_ids = []
                for i in my_req_ids:
                    if i in id_to_req:
                        if id_to_req[i].state == "active":
                            active_instance_ids.append(
                                id_to_req[i].instance_id)
                        else:
                            outstanding_request_ids.append(i)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer for request ids including %s" % (
                        len(active_instance_ids), opts.slaves,
                        outstanding_request_ids[0:10])
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group] +
                                      additional_groups,
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data_content)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group] +
                               additional_groups,
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Give the instances descriptive names
    # TODO: Add retry logic for tagging with name since it's used to identify a cluster.
    for master in master_nodes:
        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
        for i in range(5):
            try:
                master.add_tag(key='Name', value=name)
                break  # Tag applied; no need to retry.
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:  # range(5) never reaches 5; this is the last attempt.
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    for slave in slave_nodes:
        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
        for i in range(5):
            try:
                slave.add_tag(key='Name', value=name)
                break  # Tag applied; no need to retry.
            except:
                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
                if i == 4:  # range(5) never reaches 5; this is the last attempt.
                    raise Exception("Error - failed max attempts to add name tag")
                time.sleep(5)

    # Return all the instances
    return (master_nodes, slave_nodes)
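The tagging loops above retry because freshly launched instance ids can take a few seconds to become taggable. The same pattern as a reusable helper, in a minimal sketch (the name retry_tag and the plain Exception re-raise are assumptions, not part of the original script):

import time

def retry_tag(instance, name, attempts=5, delay=5):
    # Best-effort Name tagging; gives up only after `attempts` failed tries.
    for attempt in range(attempts):
        try:
            instance.add_tag(key='Name', value=name)
            return
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)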
Ejemplo n.º 40
def node_install(cn=def_cn,
                 inst_type_idx=def_inst_type,
                 idn=0,
                 avz=def_default_avz,
                 rt=def_default_requesttype,
                 group_name='oggmssh',
                 ssh_port=22,
                 cidr='0.0.0.0/0'):
    """
    Request and prepare single instance
    """
    # FSO---connect
    cloud = boto.ec2.connect_to_region(avz[:-1], profile_name=ec2Profile)
    aminfo = cloud.get_image(def_ami[avz[:-1]])
    vpcconn = VPCConnection(region=cloud.region)

    try:
        vpc_id, subnet_id = def_subnet[avz]
        vpc = vpcconn.get_all_vpcs(vpc_ids=[vpc_id])[0]
    except:
        vpc_id = None
        subnet_id = None
        vpc = None

    # FSO---check if node with same name already exists
    if node_exists(cn + '_node' + str(idn)):
        print("Node already exists")
        sys.exit()

    # Check if ssh keypair exists
    key_name = get_keypair_name(avz[:-1])
    check_keypair(cloud, key_name)

    # FSO---create a bigger root device
    dev_sda1 = EBSBlockDeviceType()
    dev_sda1.size = rootfs_size_gb
    dev_sda1.delete_on_termination = True
    bdm = BlockDeviceMapping()
    bdm['/dev/sda1'] = dev_sda1

    dev_sdf_vol = get_user_persist_ebs(cloud, avz)

    # Check to see if specified security group already exists.
    # If we get an InvalidGroup.NotFound error back from EC2,
    # it means that it doesn't exist and we need to create it.
    try:
        group = cloud.get_all_security_groups(groupnames=[group_name])[0]
    except cloud.ResponseError as e:
        if e.code == 'InvalidGroup.NotFound':
            print('Creating Security Group: %s' % group_name)
            # Create a security group to control access to instance via SSH.
            group = cloud.create_security_group(
                group_name, 'A group that allows SSH access')
        else:
            raise

    # Authorize all Intra-VPC traffic
    if vpc is not None:
        try:
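            # Protocol '-1' authorizes all protocols (the port arguments are
            # ignored), opening all traffic from the VPC CIDR.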
            group.authorize('-1', -1, -1, vpc.cidr_block)
        except cloud.ResponseError as e:
            if e.code != 'InvalidPermission.Duplicate':
                raise

    # Add a rule to the security group to authorize SSH traffic
    # on the specified port.
    try:
        group.authorize('tcp', ssh_port, ssh_port, cidr)
    except cloud.ResponseError as e:
        if e.code == 'InvalidPermission.Duplicate':
            print('Security Group: %s already authorized' % group_name)
        else:
            raise

    log_with_ts("request node " + str(idn))
    print('Reserving instance for node', aminfo.id,
          instance_infos[inst_type_idx]['type'], aminfo.name, aminfo.region)

    if rt == 'spot':
        print("placing node in ", avz)
        requests = cloud.request_spot_instances(
            def_price,
            def_ami[avz[:-1]],
            count=1,
            type='one-time',
            security_group_ids=[group.id],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        req_ids = [request.id for request in requests]
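        # wait_for_fulfillment (defined elsewhere) is assumed to block until the
        # spot request turns active and to return the resulting instance ids.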
        instance_ids = wait_for_fulfillment(cloud, req_ids)
        instances = cloud.get_only_instances(instance_ids=instance_ids)
        node = instances[0]
        log_with_ts("fullfilled spot node " + str(idn))
    else:
        print("placing node in ", avz)
        reservation = cloud.run_instances(
            image_id=def_ami[avz[:-1]],
            key_name=key_name,
            placement=avz,
            subnet_id=subnet_id,
            security_group_ids=[group.id],
            ebs_optimized=True,
            instance_type=instance_infos[inst_type_idx]['type'],
            block_device_map=bdm)
        node = reservation.instances[0]
        log_with_ts("fullfilled ondemand node " + str(idn))

    time.sleep(2)
    while node.update() != 'running':
        print('waiting for', cn, 'node', idn, 'to boot...')
        time.sleep(5)

    log_with_ts("booted node " + str(idn))

    if dev_sdf_vol is not None:
        cloud.attach_volume(dev_sdf_vol.id, node.id, "/dev/sdf")

    node.add_tag('Name', cn + '_node' + str(idn))
    node.add_tag('type', cn + 'node')
    node.add_tag('node-owner', user_identifier)

    # FSO---set delete on termination flag to true for ebs block device
    node.modify_attribute('blockDeviceMapping', {'/dev/sda1': True})

    # FSO--- test socket connect to ssh service
    ssh_test(node)
    log_with_ts("reachable node " + str(idn))

    update_key_filename(node.region.name)

    # Mount potential user volume
    if dev_sdf_vol is not None:
        use_user_volume(node.dns_name)

    log_with_ts("finished node " + str(idn))
Ejemplo n.º 41
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    group_names = [g.id for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  print "Launching instances..."
  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price is not None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = image.run(key_name = opts.key_pair,
                          security_groups = [slave_group],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if opts.ft > 1:
    zoo_res = image.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
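Several of these launchers split the slave count across availability zones with a get_partition helper that the listing doesn't include. A plausible sketch, assuming it simply spreads the total as evenly as possible with the remainder going to the earliest zones:

def get_partition(total, num_partitions, current_partition):
    # Each partition gets the even share; the first (total % num_partitions)
    # partitions receive one extra node.
    share, remainder = divmod(total, num_partitions)
    return share + (1 if current_partition < remainder else 0)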
Ejemplo n.º 42
    def create_node(self,
                    name,
                    distribution,
                    size=None,
                    disk_size=8,
                    metadata={}):
        if size is None:
            size = self._default_size

        with start_action(
                action_type=u"flocker:provision:aws:create_node",
                name=name,
                distribution=distribution,
                image_size=size,
                disk_size=disk_size,
                metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

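            # Override the root device mapping so the instance boots with a
            # disk_size GB root volume instead of the AMI default.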
            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]}, )

            with start_action(
                    action_type=
                    u"flocker:provision:aws:create_node:run_instances",
            ) as context:
                reservation = self._connection.run_instances(
                    images[0].id,
                    key_name=self._keyname,
                    instance_type=size,
                    security_groups=self._security_groups,
                    block_device_map=diskmap,
                    placement=self._zone,
                    # On some operating systems, a tty is required for sudo.
                    # Since AWS systems have a non-root user as the login,
                    # disable this, so we can use sudo with conch.
                    user_data=dedent("""\
                        #!/bin/sh
                        sed -i '/Defaults *requiretty/d' /etc/sudoers
                        """),
                )

                instance = reservation.instances[0]
                context.add_success_fields(instance_id=instance.id)

            self._connection.create_tags([instance.id], metadata)

            # Display state as instance starts up, to keep user informed that
            # things are happening.
            _wait_until_running(instance)

            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
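create_node also leans on a _wait_until_running helper that isn't reproduced here. A minimal sketch, assuming it just polls boto's Instance.update() (which re-fetches and returns the state) until the instance reports running:

import time

def _wait_until_running(instance, poll_interval=5):
    # Re-fetch instance state from the EC2 API until it reports 'running'.
    while instance.update() != 'running':
        time.sleep(poll_interval)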
Ejemplo n.º 43
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  print "Setting up security groups..."
  
  if opts.one_security_group:
    master_group = get_or_make_group(conn, cluster_name + "-group")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = master_group
    zoo_group = master_group
  
  else:
    master_group = get_or_make_group(conn, cluster_name + "-master")
    master_group.owner_id = os.getenv('EC2_USER_ID')
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    slave_group.owner_id = os.getenv('EC2_USER_ID')
    zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
    zoo_group.owner_id = os.getenv('EC2_USER_ID')
      
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50031, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
    master_group.authorize('tcp', 40000, 40000, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 40013, 40013, '0.0.0.0/0') #apache hama
    master_group.authorize('tcp', 8020, 8020, '0.0.0.0/0') #hdfs HA nameservice
    master_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes
    master_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA   
    master_group.authorize('tcp', 8021, 8021, '0.0.0.0/0') #jt HA
    master_group.authorize('tcp', 8018, 8019, '0.0.0.0/0') #zkfc
    master_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui    
    
    #If cohosted with zookeeper open necessary ports
    if opts.cohost:
        print "Opening additional ports for zookeeper... "
        master_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
        master_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
        master_group.authorize('tcp', 3888, 3888, '0.0.0.0/0') 
        
    if opts.ganglia:
      master_group.authorize('tcp', 80, 80, '0.0.0.0/0')
      #Also needed 8649 and 8651 but check if only for master
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 5050, 5051, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
    slave_group.authorize('tcp', 40015, 40015, '0.0.0.0/0') ##apache hama web UI
    slave_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui
    slave_group.authorize('tcp', 31000, 32000, '0.0.0.0/0') #task tracker web ui    
  
  if zoo_group.rules == []: # Group was just now created
      zoo_group.authorize(src_group=master_group)
      zoo_group.authorize(src_group=slave_group)
      zoo_group.authorize(src_group=zoo_group)
      zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
      zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')
      zoo_group.authorize('tcp', 8018, 8020, '0.0.0.0/0') #hdfs HA nameservice
      zoo_group.authorize('tcp', 8485, 8485, '0.0.0.0/0') #journal nodes
      zoo_group.authorize('tcp', 8023, 8023, '0.0.0.0/0') #jt HA
      zoo_group.authorize('tcp', 2812, 2812, '0.0.0.0/0') #monit web ui        
   


  # Check if instances are already running in our groups
  # Grouped instances are instances that run on the same security group in order to allow communication
  # using private IPs and without DNS resolving
  existing_masters, existing_slaves, existing_zoos, existing_grouped = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
  if existing_slaves or (existing_masters and not opts.use_existing_master) or existing_grouped:
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s or %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.emi])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi
    sys.exit(1)
    
  try:
    image_master = conn.get_all_images(image_ids=[opts.emi_master])[0]
  except:
    print >> stderr, "Could not find emi " + opts.emi_master
    sys.exit(1)
  
  # Launch additional ZooKeeper nodes if required - ex: if mesos masters specified are 2 and the zoo_num=3 (default)
  # Keep zoo_num an int throughout; the original compared a string against 0,
  # which is always True in Python 2.
  if int(opts.ft) > 1 and opts.cohost:
    # Co-hosted zookeepers already run on the masters, so only the extras are needed.
    zoo_num = int(opts.zoo_num) - int(opts.ft)
  else:
    zoo_num = int(opts.zoo_num)

  if zoo_num > 0:
      if opts.emi_zoo == "":
          emi_zoo = opts.emi_master 
      else:
          emi_zoo = opts.emi_zoo
              
      try:
        image_zoo = conn.get_all_images(image_ids=[emi_zoo])[0]
      except:
        print >> stderr, "Could not find emi " + emi_zoo
        sys.exit(1)
       

  # Create block device mapping so that we can add an EBS volume if asked to
  logging.debug( "Calling boto BlockDeviceMapping()...")
  block_map = BlockDeviceMapping()
  logging.debug(" Printing block_map..") 
  #print block_map
  if opts.ebs_vol_size > 0:
    logging.debug("Calling boto EBSBlockDeviceType()...")
    device = EBSBlockDeviceType()
    #print "device: ", device
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    device.ephemeral_name = "ephemeral0"
    #block_map["/dev/sdv"] = device
    #block_map["/dev/sdv"] = device
    block_map["/dev/vdb"] = device
    
  if opts.user_data_file is not None:
      with open(opts.user_data_file) as user_data_file:
          opts.user_data = user_data_file.read()
  
  # Launch non-spot instances
  zones = get_zones(conn, opts)    
  num_zones = len(zones)
  i = 0
  slave_nodes = []
  for zone in zones:
    num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
    if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map,
                              user_data = opts.user_data)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
    i += 1  

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:  
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    
    print "Running " + opts.ft + " masters"
    master_res = image_master.run(key_name = opts.key_pair,
                           security_groups = [master_group],
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = opts.ft,
                           max_count = opts.ft,
                           block_device_map = block_map,
                           user_data = opts.user_data)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

  if zoo_num > 0:
    print "Running additional %d zookeepers" % zoo_num
    zoo_res = image_zoo.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = zoo_num,
                        max_count = zoo_num,
                        block_device_map = block_map,
                        user_data = opts.user_data)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []
    
  if (opts.cohost):
      print "Zookeepers are co-hosted on mesos instances..."

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
Ejemplo n.º 44
def launch_cluster(conn, opts, cluster_name):

    #Remove known hosts to avoid "Offending key for IP ..." errors.
    known_hosts = os.environ['HOME'] + "/.ssh/known_hosts"
    if os.path.isfile(known_hosts):
        os.remove(known_hosts)
    if opts.key_pair is None:
        opts.key_pair = keypair()
        if opts.key_pair is None:
            print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
            sys.exit(1)

    if opts.profile is None:
        opts.profile = profile()
        if opts.profile is None:
            print >> stderr, "ERROR: No profile found in current host. It be provided with -p option."
            sys.exit(1)

    public_key = pub_key()
    user_data = Template("""#!/bin/bash
  set -e -x
  echo '$public_key' >> ~root/.ssh/authorized_keys
  echo '$public_key' >> ~ec2-user/.ssh/authorized_keys""").substitute(
        public_key=public_key)

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=sparknotebook_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 18080, 18080, '0.0.0.0/0')
        master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        master_group.authorize('tcp', 7077, 7077, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=sparknotebook_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    if not any(r for r in sparknotebook_group.rules
               for g in r.grants if master_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp",
                                      from_port="1",
                                      to_port="65535",
                                      src_group=master_group)
        sparknotebook_group.authorize(ip_protocol="icmp",
                                      from_port="-1",
                                      to_port="-1",
                                      src_group=master_group)

    if not any(r for r in sparknotebook_group.rules
               for g in r.grants if slave_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp",
                                      from_port="1",
                                      to_port="65535",
                                      src_group=slave_group)
        sparknotebook_group.authorize(ip_protocol="icmp",
                                      from_port="-1",
                                      to_port="-1",
                                      src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        zones = get_zones(conn, opts)

        num_zones = len(zones)
        i = 0
        my_req_ids = []

        for zone in zones:
            best_price = find_best_price(conn, opts.instance_type, zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)"
                % (opts.slaves, best_price, opts.slaves * best_price))

            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[slave_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                interface)
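            # With a NetworkInterfaceSpecification, the subnet and security
            # groups ride on the interface itself rather than on the request;
            # that is also what enables associate_public_ip_address.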

            slave_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print >> stderr, "Waiting for spot instances to be granted"
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print >> stderr, "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print >> stderr, "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_group_ids=[slave_group.id],
                                      instance_type=opts.instance_type,
                                      subnet_id=subnetId(),
                                      placement=zone,
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map,
                                      user_data=user_data,
                                      instance_profile_arn=opts.profile)
                slave_nodes += slave_res.instances
                print >> stderr, "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price is not None:
            best_price = find_best_price(conn, master_type, opts.zone,
                                         opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting master as spot instances with price $%.3f/hour" %
                (best_price))

            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(),
                groups=[master_group.id],
                associate_public_ip_address=True)
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                interface)

            master_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=opts.zone,
                count=1,
                key_name=opts.key_pair,
                instance_type=master_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces)
            my_req_ids = [r.id for r in master_reqs]
            print >> stderr, "Waiting for spot instance to be granted"
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests(
                        request_ids=my_req_ids)
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    active_instance_ids = []
                    for i in my_req_ids:
                        if i in id_to_req and id_to_req[i].state == "active":
                            active_instance_ids.append(
                                id_to_req[i].instance_id)
                    if len(active_instance_ids) == 1:
                        print >> stderr, "Master granted"
                        reservations = conn.get_all_instances(
                            active_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        print "%d of %d masters granted, waiting longer" % (
                            len(active_instance_ids), 1)
            except:
                print >> stderr, "Canceling spot instance requests"
                conn.cancel_spot_instance_requests(my_req_ids)
                # Log a warning if any of these requests actually launched instances:
                (master_nodes,
                 slave_nodes) = get_existing_cluster(conn,
                                                     opts,
                                                     cluster_name,
                                                     die_on_error=False)
                running = len(master_nodes) + len(slave_nodes)
                if running:
                    print >> stderr, (
                        "WARNING: %d instances are still running" % running)
                sys.exit(0)
        else:
            master_res = image.run(key_name=opts.key_pair,
                                   security_group_ids=[master_group.id],
                                   instance_type=master_type,
                                   subnet_id=subnetId(),
                                   placement=opts.zone,
                                   min_count=1,
                                   max_count=1,
                                   block_device_map=block_map,
                                   user_data=user_data,
                                   instance_profile_arn=opts.profile)
            master_nodes = master_res.instances
            print >> stderr, "Launched master in %s, regid = %s" % (
                opts.zone, master_res.id)
    # Return all the instances
    return (master_nodes, slave_nodes)
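This variant bids via a find_best_price helper that the listing doesn't include. A rough sketch of one way it could work, assuming only boto's get_spot_price_history and treating the user's --spot-price as a hard ceiling:

def find_best_price(conn, instance_type, zone, max_price):
    # Average the recent spot price history for this type/zone and bid that,
    # never exceeding the caller-supplied ceiling.
    history = conn.get_spot_price_history(instance_type=instance_type,
                                          availability_zone=zone)
    if not history:
        return max_price
    recent_avg = sum(h.price for h in history) / len(history)
    return min(recent_avg, max_price)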
Ejemplo n.º 45
def launch_cluster(conn, opts, cluster_name):
    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
        master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
        master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
        master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
        if opts.ganglia:
            master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
        slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
        slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
        slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
        slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
        slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(
        conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, ("ERROR: There are already instances running in " +
                          "group %s or %s" %
                          (master_group.name, slave_group.name))
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price is not None:
        # Launch spot instances with the requested price
        print("Requesting %d slaves as spot instances with price $%.3f" %
              (opts.slaves, opts.spot_price))
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        my_req_ids = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            slave_reqs = conn.request_spot_instances(
                price=opts.spot_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                security_groups=[slave_group],
                instance_type=opts.instance_type,
                block_device_map=block_map)
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print "Waiting for spot instances to be granted..."
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    print "%d of %d slaves granted, waiting longer" % (
                        len(active_instance_ids), opts.slaves)
        except:
            print "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes,
             slave_nodes) = get_existing_cluster(conn,
                                                 opts,
                                                 cluster_name,
                                                 die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" %
                                  running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(key_name=opts.key_pair,
                                      security_groups=[slave_group],
                                      instance_type=opts.instance_type,
                                      placement=zone,
                                      instance_profile_name="spark-node",
                                      min_count=num_slaves_this_zone,
                                      max_count=num_slaves_this_zone,
                                      block_device_map=block_map)
                slave_nodes += slave_res.instances
                print "Launched %d slaves in %s, regid = %s" % (
                    num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == 'all':
            opts.zone = random.choice(conn.get_all_zones()).name
        master_res = image.run(key_name=opts.key_pair,
                               security_groups=[master_group],
                               instance_type=master_type,
                               placement=opts.zone,
                               min_count=1,
                               max_count=1,
                               instance_profile_name="spark-node",
                               block_device_map=block_map)
        master_nodes = master_res.instances
        print "Launched master in %s, regid = %s" % (zone, master_res.id)

    # Return all the instances
    return (master_nodes, slave_nodes)
Ejemplo n.º 46
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  input_group = get_or_make_group(conn, cluster_name + "-input")
  compute_group = get_or_make_group(conn, cluster_name + "-compute")
  if input_group.rules == []: # Group was just now created
    input_group.authorize(src_group=input_group)
    input_group.authorize(src_group=compute_group)
    input_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    input_group.authorize('tcp', 4000, 4000, '0.0.0.0/0')
    input_group.authorize('tcp', 4001, 4001, '0.0.0.0/0')
  if compute_group.rules == []: # Group was just now created
    compute_group.authorize(src_group=input_group)
    compute_group.authorize(src_group=compute_group)
    compute_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    compute_group.authorize('tcp', 4000, 4000, '0.0.0.0/0')
    compute_group.authorize('tcp', 4001, 4001, '0.0.0.0/0')
    compute_group.authorize('tcp', 5001, 5001, '0.0.0.0/0')

  # Check if instances are already running in our groups
  active_nodes = get_existing_cluster(conn, opts, cluster_name,
                                      die_on_error=False)
  if any(active_nodes):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s, %s or %s" % (input_group.name, compute_group.name))
    sys.exit(1)
  
  # CHANGE THIS IF CHANGING REGIONS
  opts.ami = 'ami-d76605be'
  
  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device
  launch_groups = opts.compute_groups + 1
  # Launch compute nodes
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d compute nodes as spot instances with price $%.3f" %
           (launch_groups * opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(launch_groups * opts.slaves, num_zones, i)
      compute_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          security_groups = [compute_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in compute_reqs]
      i += 1
    
    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves * launch_groups:
          print "All %d compute nodes granted" %(opts.slaves * launch_groups)
          reservations = conn.get_all_instances(active_instance_ids)
          compute_nodes = []
          for r in reservations:
            compute_nodes += r.instances
          break
        else:
          print "%d of %d compute nodes granted, waiting longer" % (
            len(active_instance_ids), opts.slaves * launch_groups)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (input_nodes, compute_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(input_nodes) + len(compute_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    compute_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves * launch_groups, num_zones, i)
      if num_slaves_this_zone > 0:
        compute_res = image.run(key_name = opts.key_pair,
                              security_groups = [compute_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        compute_nodes += compute_res.instances
        print "Launched %d compute nodes in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, compute_res.id)
      i += 1

  # Launch input nodes
  # Input nodes use the same instance type as the compute nodes.
  input_type = opts.instance_type
  if opts.zone == 'all':
    opts.zone = random.choice(conn.get_all_zones()).name
  input_res = image.run(key_name = opts.key_pair,
                         security_groups = [input_group],
                         instance_type = input_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  input_nodes = input_res.instances
  print "Launched input in %s, regid = %s" % (zone, input_res.id)

  # Return all the instances
  return (input_nodes, compute_nodes)
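
get_or_make_group, called by every launch_cluster variant in these examples, can be as simple as a name lookup with a create fallback. A minimal sketch against the boto EC2 connection API:

def get_or_make_group(conn, name):
    # Return the security group with the given name, creating it if needed.
    groups = [g for g in conn.get_all_security_groups() if g.name == name]
    if groups:
        return groups[0]
    print "Creating security group " + name
    return conn.create_security_group(name, "Cluster group for " + name)
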
Ejemplo n.º 47
0
    def create_node(self, name, distribution,
                    size=None, disk_size=8,
                    metadata={}):
        if size is None:
            size = self._default_size

        with start_action(
            action_type=u"flocker:provision:aws:create_node",
            name=name,
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        ):

            metadata = metadata.copy()
            metadata['Name'] = name

            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]},
            )

            with start_action(
                action_type=u"flocker:provision:aws:create_node:run_instances",
            ) as context:
                reservation = self._connection.run_instances(
                    images[0].id,
                    key_name=self._keyname,
                    instance_type=size,
                    security_groups=self._security_groups,
                    block_device_map=diskmap,
                    placement=self._zone,
                    # On some operating systems, a tty is required for sudo.
                    # Since AWS systems have a non-root user as the login,
                    # disable this, so we can use sudo with conch.
                    user_data=dedent("""\
                        #!/bin/sh
                        sed -i '/Defaults *requiretty/d' /etc/sudoers
                        """),
                )

                instance = reservation.instances[0]
                context.add_success_fields(instance_id=instance.id)

            self._connection.create_tags([instance.id], metadata)

            # Display state as instance starts up, to keep user informed that
            # things are happening.
            _wait_until_running(instance)

            return AWSNode(
                name=name,
                _provisioner=self,
                _instance=instance,
                distribution=distribution,
            )
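
_wait_until_running is not defined in these examples; a plausible implementation (the polling interval is an assumption, not taken from the original code) simply refreshes the boto instance until EC2 reports it as running:

import time

def _wait_until_running(instance, poll_interval=5):
    # Poll EC2 until the instance leaves the 'pending' state.
    # poll_interval is an assumed default.
    while instance.state != 'running':
        time.sleep(poll_interval)
        instance.update()
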
Ejemplo n.º 48
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    group_names = [g.name for g in res.groups]
    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
      active = [i for i in res.instances if is_active(i)]
      if len(active) > 0:
        print >> stderr, ("ERROR: There are already instances running in " +
            "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
        sys.exit(1)
  print "Launching instances..."
  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  slave_res = image.run(key_name = opts.key_pair,
                        security_groups = [slave_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = opts.slaves,
                        max_count = opts.slaves,
                        block_device_map = block_map)
  slave_nodes = slave_res.instances
  print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = opts.ft,
                         max_count = opts.ft,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Launch ZooKeeper nodes if required
  if opts.ft > 1:
    zoo_res = image.run(key_name = opts.key_pair,
                        security_groups = [zoo_group],
                        instance_type = opts.instance_type,
                        placement = opts.zone,
                        min_count = 3,
                        max_count = 3,
                        block_device_map = block_map)
    zoo_nodes = zoo_res.instances
    print "Launched zoo, regid = " + zoo_res.id
  else:
    zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
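
The running-cluster check above calls is_active; in scripts of this family it is typically just a state filter. A sketch:

def is_active(instance):
    # Treat anything that is not shutting down or terminated as active.
    return instance.state in ['pending', 'running', 'stopping', 'stopped']
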
Ejemplo n.º 49
0
def _run_encryptor_instance(
        aws_svc, encryptor_image_id, snapshot, root_size, guest_image_id,
        security_group_ids=None, subnet_id=None, zone=None,
        instance_config=None,
        status_port=encryptor_service.ENCRYPTOR_STATUS_PORT):
    bdm = BlockDeviceMapping()

    if instance_config is None:
        instance_config = InstanceConfig()

    image = aws_svc.get_image(encryptor_image_id)
    virtualization_type = image.virtualization_type

    # Use gp2 for fast burst I/O while copying the root drive.
    guest_unencrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        snapshot_id=snapshot,
        delete_on_termination=True)

    log.info('Launching encryptor instance with snapshot %s', snapshot)
    guest_encrypted_root = EBSBlockDeviceType(
        volume_type='gp2',
        delete_on_termination=True)
    guest_encrypted_root.size = 2 * root_size + 1

    if virtualization_type == 'paravirtual':
        bdm['/dev/sda4'] = guest_unencrypted_root
        bdm['/dev/sda5'] = guest_encrypted_root
    else:
        # Use 'sd' names even though AWS maps these to 'xvd'
        # The AWS GUI only exposes 'sd' names, and won't allow
        # the user to attach to an existing 'sd' name in use, but
        # would allow conflicts if we used 'xvd' names here.
        bdm['/dev/sdf'] = guest_unencrypted_root
        bdm['/dev/sdg'] = guest_encrypted_root

    # If security groups were not specified, create a temporary security
    # group that allows us to poll the metavisor for encryption progress.
    temp_sg_id = None
    instance = None

    try:
        run_instance = aws_svc.run_instance

        if not security_group_ids:
            vpc_id = None
            if subnet_id:
                subnet = aws_svc.get_subnet(subnet_id)
                vpc_id = subnet.vpc_id
            temp_sg_id = create_encryptor_security_group(
                aws_svc, vpc_id=vpc_id, status_port=status_port).id
            security_group_ids = [temp_sg_id]

            # Wrap with a retry, to handle eventual consistency issues with
            # the newly-created group.
            run_instance = aws_svc.retry(
                aws_svc.run_instance,
                error_code_regexp=r'InvalidGroup\.NotFound'
            )

        user_data = instance_config.make_userdata()
        compressed_user_data = gzip_user_data(user_data)

        instance = run_instance(
            encryptor_image_id,
            security_group_ids=security_group_ids,
            user_data=compressed_user_data,
            placement=zone,
            block_device_map=bdm,
            subnet_id=subnet_id
        )
        aws_svc.create_tags(
            instance.id,
            name=NAME_ENCRYPTOR,
            description=DESCRIPTION_ENCRYPTOR % {'image_id': guest_image_id}
        )
        log.info('Launching encryptor instance %s', instance.id)
        instance = wait_for_instance(aws_svc, instance.id)

        # Tag volumes.
        bdm = instance.block_device_mapping
        if virtualization_type == 'paravirtual':
            aws_svc.create_tags(
                bdm['/dev/sda5'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda2'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_GRUB_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sda3'].volume_id, name=NAME_METAVISOR_LOG_VOLUME)
        else:
            aws_svc.create_tags(
                bdm['/dev/sda1'].volume_id, name=NAME_METAVISOR_ROOT_VOLUME)
            aws_svc.create_tags(
                bdm['/dev/sdg'].volume_id, name=NAME_ENCRYPTED_ROOT_VOLUME)
    except:
        cleanup_instance_ids = []
        cleanup_sg_ids = []
        if instance:
            cleanup_instance_ids = [instance.id]
        if temp_sg_id:
            cleanup_sg_ids = [temp_sg_id]
        clean_up(
            aws_svc,
            instance_ids=cleanup_instance_ids,
            security_group_ids=cleanup_sg_ids
        )
        raise

    return instance, temp_sg_id
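
gzip_user_data keeps the instance user data under the EC2 16 KB user-data limit by compressing it before launch. One plausible Python 2 implementation (the body is an assumption; only the name comes from the code above):

import gzip
from cStringIO import StringIO

def gzip_user_data(user_data):
    # Compress the user-data string in memory and return the gzipped bytes.
    buf = StringIO()
    f = gzip.GzipFile(fileobj=buf, mode='wb')
    f.write(user_data)
    f.close()
    return buf.getvalue()
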
Ejemplo n.º 50
0
    def create_nodes(self, reactor, names, distribution, metadata={}):
        """
        Create nodes with the given names.

        :param reactor: The reactor.
        :param names: The names of the nodes.
        :type names: list of str
        :param str distribution: The name of the distribution to
            install on the nodes.
        :param dict metadata: Metadata to associate with the nodes.

        :return: A list of ``Deferred``s, each firing with an INode
            when the corresponding node is created. The list has
            the same order as :param:`names`.
        """
        size = self._default_size
        disk_size = 8

        action = start_action(
            action_type=u"flocker:provision:aws:create_nodes",
            instance_count=len(names),
            distribution=distribution,
            image_size=size,
            disk_size=disk_size,
            metadata=metadata,
        )
        with action.context():
            disk1 = EBSBlockDeviceType()
            disk1.size = disk_size
            disk1.delete_on_termination = True
            diskmap = BlockDeviceMapping()
            diskmap['/dev/sda1'] = disk1

            images = self._connection.get_all_images(
                filters={'name': IMAGE_NAMES[distribution]},
            )

            instances = self._run_nodes(
                count=len(names),
                image_id=images[0].id,
                size=size,
                diskmap=diskmap
            )

            def make_node(ignored, name, instance):
                return AWSNode(
                    name=name,
                    _provisioner=self,
                    _instance=instance,
                    distribution=distribution,
                )

            results = []
            for name, instance in izip_longest(names, instances):
                if instance is None:
                    results.append(fail(Exception("Could not run instance")))
                else:
                    node_metadata = metadata.copy()
                    node_metadata['Name'] = name
                    d = self._async_get_node(reactor, instance, node_metadata)
                    d = DeferredContext(d)
                    d.addCallback(make_node, name, instance)
                    results.append(d.result)
            action_completion = DeferredContext(DeferredList(results))
            action_completion.addActionFinish()
            # Individual results and errors should be consumed by the caller,
            # so we can leave action_completion alone now.
            return results
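
_async_get_node is referenced but not defined here. Since tagging and waiting are blocking boto calls, one way to fit them into the Twisted flow used above (a sketch of a method on the provisioner class; the real helper may differ) is deferToThread:

from twisted.internet.threads import deferToThread

def _async_get_node(self, reactor, instance, metadata):
    # Run the blocking tag-and-wait sequence in the reactor's thread pool
    # so the reactor stays responsive; the Deferred fires with the
    # instance once it is running.
    def blocking_get_node():
        self._connection.create_tags([instance.id], metadata)
        _wait_until_running(instance)
        return instance
    return deferToThread(blocking_get_node)
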
Ejemplo n.º 51
0
def launch_cluster(conn, opts, cluster_name):
  if opts.identity_file is None:
    print >> stderr, "ERROR: Must provide an identity file (-i) for ssh connections."
    sys.exit(1)
  if opts.key_pair is None:
    print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
    sys.exit(1)
  #print "Setting up security groups..."
  #master_group = get_or_make_group(conn, cluster_name + "-master")
  #slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  #if master_group.rules == []: # Group was just now created
  #  master_group.authorize(src_group=master_group)
  #  master_group.authorize(src_group=slave_group)
  #  master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
  #  master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
  #  master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
  #  master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
  #  master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
  #  master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
  #  master_group.authorize('tcp', 4040, 4045, '0.0.0.0/0')
  #  if opts.ganglia:
  #    master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  #if slave_group.rules == []: # Group was just now created
  #  slave_group.authorize(src_group=master_group)
  #  slave_group.authorize(src_group=slave_group)
  #  slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
  #  slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

  # Check if instances are already running in our groups
  existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                           die_on_error=False)
  #if existing_slaves or (existing_masters and not opts.use_existing_master):
  #  print >> stderr, ("ERROR: There are already instances running in " +
  #      "group %s or %s" % (master_group.name, slave_group.name))
  #  sys.exit(1)

  # Figure out Spark AMI
  if opts.ami is None:
    opts.ami = get_spark_ami(opts)
  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      slave_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          #security_groups = [slave_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in slave_reqs]
      i += 1

    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves:
          print "All %d slaves granted" % opts.slaves
          reservations = conn.get_all_instances(active_instance_ids)
          slave_nodes = []
          for r in reservations:
            slave_nodes += r.instances
          break
        else:
          print "%d of %d slaves granted, waiting longer" % (
            len(active_instance_ids), opts.slaves)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (master_nodes, slave_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(master_nodes) + len(slave_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_group_ids = ["sg-87956be2","sg-1ac33f7f", "sg-1ec33f7b"],
                              subnet_id = "subnet-4182b007",
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
      i += 1

  # Launch or resume masters
  if existing_masters:
    print "Starting master..."
    for inst in existing_masters:
      if inst.state not in ["shutting-down", "terminated"]:
        inst.start()
    master_nodes = existing_masters
  else:
    master_type = opts.master_instance_type
    if master_type == "":
      master_type = opts.instance_type
    if opts.zone == 'all':
      opts.zone = random.choice(conn.get_all_zones()).name
    master_res = image.run(key_name = opts.key_pair,
                           security_group_ids = ["sg-bd956bd8","sg-1ac33f7f", "sg-1ec33f7b"],
                           subnet_id = "subnet-4182b007",
                           instance_type = master_type,
                           placement = opts.zone,
                           min_count = 1,
                           max_count = 1,
                           block_device_map = block_map)
    master_nodes = master_res.instances
    print "Launched master in %s, regid = %s" % (zone, master_res.id)

  # Return all the instances
  return (master_nodes, slave_nodes)
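
The spot-request polling loop is repeated almost verbatim across several of these examples, so it factors naturally into a helper. A sketch built from the loop bodies above (the helper name is ours, not the original's):

def wait_for_spot_fulfillment(conn, request_ids, expected_count, poll_interval=10):
    # Poll until all spot requests are active, returning their instance ids.
    while True:
        time.sleep(poll_interval)
        id_to_req = dict((r.id, r) for r in conn.get_all_spot_instance_requests())
        active = [id_to_req[i].instance_id for i in request_ids
                  if i in id_to_req and id_to_req[i].state == "active"]
        if len(active) == expected_count:
            return active
        print "%d of %d requests granted, waiting longer" % (
            len(active), expected_count)
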
Ejemplo n.º 52
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  
  master_group = get_or_make_group(conn, "shark-exp-master")
  slave_group = get_or_make_group(conn, "shark-exp-slaves")
  zoo_group = get_or_make_group(conn, "ampcamp-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
      master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
      master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
      master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
      # hbase
      master_group.authorize('tcp', 60010, 60010, '0.0.0.0/0')
      master_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
      slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
      slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
      slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
      # hbase
      slave_group.authorize('tcp', 60050, 60050, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  print "Checking for running cluster..."
  reservations = conn.get_all_instances()
  for res in reservations:
    for instance in res.instances:
      if 'tags' in instance.__dict__ and 'cluster' in instance.tags:
        if instance.tags['cluster'] == cluster_name and is_active(instance):
          print >> stderr, ("ERROR: Instances %s is already running in cluster %s"
                            % (instance.id, cluster_name))
          sys.exit(1)

  if opts.ami in ["latest", "standalone"]:
    opts.ami = get_ami(opts.ami)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    slave_reqs = conn.request_spot_instances(
        price = opts.spot_price,
        image_id = opts.ami,
        launch_group = "launch-group-%s" % cluster_name,
        placement = opts.zone,
        count = opts.slaves,
        key_name = opts.key_pair,
        security_groups = [slave_group],
        instance_type = opts.instance_type,
        block_device_map = block_map)
    my_req_ids = [req.id for req in slave_reqs]
    print "Waiting for spot instances to be granted..."
    while True:
      time.sleep(10)
      reqs = conn.get_all_spot_instance_requests()
      id_to_req = {}
      for r in reqs:
        id_to_req[r.id] = r
      active = 0
      instance_ids = []
      for i in my_req_ids:
        if id_to_req[i].state == "active":
          active += 1
          instance_ids.append(id_to_req[i].instance_id)
      if active == opts.slaves:
        print "All %d slaves granted" % opts.slaves
        reservations = conn.get_all_instances(instance_ids)
        slave_nodes = []
        for r in reservations:
          slave_nodes += r.instances
        break
      else:
        print "%d of %d slaves granted, waiting longer" % (active, opts.slaves)
  else:
    # Launch non-spot instances
    slave_res = image.run(key_name = opts.key_pair,
                          security_groups = [slave_group],
                          instance_type = opts.instance_type,
                          placement = opts.zone,
                          min_count = opts.slaves,
                          max_count = opts.slaves,
                          block_device_map = block_map)
    slave_nodes = slave_res.instances
    print "Launched slaves, regid = " + slave_res.id

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master, regid = " + master_res.id

  # Create the right tags
  tags = {}
  tags['cluster'] = cluster_name

  tags['type'] = 'slave'
  for node in slave_nodes:
    conn.create_tags([node.id], tags)
  
  tags['type'] = 'master'
  for node in master_nodes:
    conn.create_tags([node.id], tags)

  zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
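
Given the 'cluster' and 'type' tags created above, discovering an existing cluster reduces to a tag scan. A hypothetical counterpart to the running-cluster check at the top of this example:

def get_cluster_instances(conn, cluster_name):
    # Collect all active instances whose 'cluster' tag matches cluster_name.
    nodes = []
    for res in conn.get_all_instances():
        for inst in res.instances:
            if inst.tags.get('cluster') == cluster_name and is_active(inst):
                nodes.append(inst)
    return nodes
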
Ejemplo n.º 53
0
def launch_cluster(conn, opts, cluster_name):
  print "Setting up security groups..."
  master_group = get_or_make_group(conn, cluster_name + "-master")
  slave_group = get_or_make_group(conn, cluster_name + "-slaves")
  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
  if master_group.rules == []: # Group was just now created
    master_group.authorize(src_group=master_group)
    master_group.authorize(src_group=slave_group)
    master_group.authorize(src_group=zoo_group)
    master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
    master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
    master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
    if opts.cluster_type == "mesos":
      master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0')
    if opts.ganglia:
      master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0')
  if slave_group.rules == []: # Group was just now created
    slave_group.authorize(src_group=master_group)
    slave_group.authorize(src_group=slave_group)
    slave_group.authorize(src_group=zoo_group)
    slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
    slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
    slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
    slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
    slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
  if zoo_group.rules == []: # Group was just now created
    zoo_group.authorize(src_group=master_group)
    zoo_group.authorize(src_group=slave_group)
    zoo_group.authorize(src_group=zoo_group)
    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
    zoo_group.authorize('tcp', 2888, 2888, '0.0.0.0/0')
    zoo_group.authorize('tcp', 3888, 3888, '0.0.0.0/0')

  # Check if instances are already running in our groups
  active_nodes = get_existing_cluster(conn, opts, cluster_name,
                                      die_on_error=False)
  if any(active_nodes):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s, %s or %s" % (master_group.name, slave_group.name, zoo_group.name))
    sys.exit(1)

  # Figure out the latest AMI from our static URL
  if opts.ami == "latest":
    try:
      opts.ami = urllib2.urlopen(LATEST_AMI_URL).read().strip()
      print "Latest Spark AMI: " + opts.ami
    except:
      print >> stderr, "Could not read " + LATEST_AMI_URL
      sys.exit(1)

  print "Launching instances..."

  try:
    image = conn.get_all_images(image_ids=[opts.ami])[0]
  except:
    print >> stderr, "Could not find AMI " + opts.ami
    sys.exit(1)

  # Create block device mapping so that we can add an EBS volume if asked to
  block_map = BlockDeviceMapping()
  if opts.ebs_vol_size > 0:
    device = EBSBlockDeviceType()
    device.size = opts.ebs_vol_size
    device.delete_on_termination = True
    block_map["/dev/sdv"] = device

  # Launch slaves
  if opts.spot_price != None:
    # Launch spot instances with the requested price
    print ("Requesting %d slaves as spot instances with price $%.3f" %
           (opts.slaves, opts.spot_price))
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    my_req_ids = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      slave_reqs = conn.request_spot_instances(
          price = opts.spot_price,
          image_id = opts.ami,
          launch_group = "launch-group-%s" % cluster_name,
          placement = zone,
          count = num_slaves_this_zone,
          key_name = opts.key_pair,
          security_groups = [slave_group],
          instance_type = opts.instance_type,
          block_device_map = block_map)
      my_req_ids += [req.id for req in slave_reqs]
      i += 1
    
    print "Waiting for spot instances to be granted..."
    try:
      while True:
        time.sleep(10)
        reqs = conn.get_all_spot_instance_requests()
        id_to_req = {}
        for r in reqs:
          id_to_req[r.id] = r
        active_instance_ids = []
        for i in my_req_ids:
          if i in id_to_req and id_to_req[i].state == "active":
            active_instance_ids.append(id_to_req[i].instance_id)
        if len(active_instance_ids) == opts.slaves:
          print "All %d slaves granted" % opts.slaves
          reservations = conn.get_all_instances(active_instance_ids)
          slave_nodes = []
          for r in reservations:
            slave_nodes += r.instances
          break
        else:
          print "%d of %d slaves granted, waiting longer" % (
            len(active_instance_ids), opts.slaves)
    except:
      print "Canceling spot instance requests"
      conn.cancel_spot_instance_requests(my_req_ids)
      # Log a warning if any of these requests actually launched instances:
      (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster(
          conn, opts, cluster_name, die_on_error=False)
      running = len(master_nodes) + len(slave_nodes) + len(zoo_nodes)
      if running:
        print >> stderr, ("WARNING: %d instances are still running" % running)
      sys.exit(0)
  else:
    # Launch non-spot instances
    zones = get_zones(conn, opts)
    num_zones = len(zones)
    i = 0
    slave_nodes = []
    for zone in zones:
      num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
      if num_slaves_this_zone > 0:
        slave_res = image.run(key_name = opts.key_pair,
                              security_groups = [slave_group],
                              instance_type = opts.instance_type,
                              placement = zone,
                              min_count = num_slaves_this_zone,
                              max_count = num_slaves_this_zone,
                              block_device_map = block_map)
        slave_nodes += slave_res.instances
        print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone,
                                                        zone, slave_res.id)
      i += 1

  # Launch masters
  master_type = opts.master_instance_type
  if master_type == "":
    master_type = opts.instance_type
  if opts.zone == 'all':
    opts.zone = random.choice(conn.get_all_zones()).name
  master_res = image.run(key_name = opts.key_pair,
                         security_groups = [master_group],
                         instance_type = master_type,
                         placement = opts.zone,
                         min_count = 1,
                         max_count = 1,
                         block_device_map = block_map)
  master_nodes = master_res.instances
  print "Launched master in %s, regid = %s" % (zone, master_res.id)

  zoo_nodes = []

  # Return all the instances
  return (master_nodes, slave_nodes, zoo_nodes)
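
get_zones, used by every multi-zone launcher here, expands 'all' into every availability zone in the region and otherwise returns the single requested zone. A minimal sketch:

def get_zones(conn, opts):
    if opts.zone == 'all':
        return [z.name for z in conn.get_all_zones()]
    return [opts.zone]
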
Ejemplo n.º 54
0
def launch_cluster(conn, opts, cluster_name):

    # Remove known hosts to avoid "Offending key for IP ..." errors.
    known_hosts = os.environ["HOME"] + "/.ssh/known_hosts"
    if os.path.isfile(known_hosts):
        os.remove(known_hosts)
    if opts.key_pair is None:
        opts.key_pair = keypair()
        if opts.key_pair is None:
            print >> stderr, "ERROR: Must provide a key pair name (-k) to use on instances."
            sys.exit(1)

    if opts.profile is None:
        opts.profile = profile()
        if opts.profile is None:
            print >> stderr, "ERROR: No profile found in current host. It be provided with -p option."
            sys.exit(1)

    public_key = pub_key()
    user_data = Template(
        """#!/bin/bash
  set -e -x
  echo '$public_key' >> ~root/.ssh/authorized_keys
  echo '$public_key' >> ~ec2-user/.ssh/authorized_keys"""
    ).substitute(public_key=public_key)

    print "Setting up security groups..."
    master_group = get_or_make_group(conn, cluster_name + "-master")
    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
    sparknotebook_group = get_or_make_group(conn, "SparkNotebookApplication")
    if master_group.rules == []:  # Group was just now created
        master_group.authorize(src_group=master_group)
        master_group.authorize(src_group=slave_group)
        master_group.authorize(src_group=sparknotebook_group)
        master_group.authorize("tcp", 22, 22, "0.0.0.0/0")
        master_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
        master_group.authorize("tcp", 18080, 18080, "0.0.0.0/0")
        master_group.authorize("tcp", 19999, 19999, "0.0.0.0/0")
        master_group.authorize("tcp", 50030, 50030, "0.0.0.0/0")
        master_group.authorize("tcp", 50070, 50070, "0.0.0.0/0")
        master_group.authorize("tcp", 60070, 60070, "0.0.0.0/0")
        master_group.authorize("tcp", 4040, 4045, "0.0.0.0/0")
        master_group.authorize("tcp", 7077, 7077, "0.0.0.0/0")
        if opts.ganglia:
            master_group.authorize("tcp", 5080, 5080, "0.0.0.0/0")
    if slave_group.rules == []:  # Group was just now created
        slave_group.authorize(src_group=master_group)
        slave_group.authorize(src_group=slave_group)
        slave_group.authorize(src_group=sparknotebook_group)
        slave_group.authorize("tcp", 22, 22, "0.0.0.0/0")
        slave_group.authorize("tcp", 8080, 8081, "0.0.0.0/0")
        slave_group.authorize("tcp", 50060, 50060, "0.0.0.0/0")
        slave_group.authorize("tcp", 50075, 50075, "0.0.0.0/0")
        slave_group.authorize("tcp", 60060, 60060, "0.0.0.0/0")
        slave_group.authorize("tcp", 60075, 60075, "0.0.0.0/0")

    if not any(r for r in sparknotebook_group.rules for g in r.grants if master_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=master_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=master_group)

    if not any(r for r in sparknotebook_group.rules for g in r.grants if slave_group.id == g.group_id):
        sparknotebook_group.authorize(ip_protocol="tcp", from_port="1", to_port="65535", src_group=slave_group)
        sparknotebook_group.authorize(ip_protocol="icmp", from_port="-1", to_port="-1", src_group=slave_group)

    # Check if instances are already running in our groups
    existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
    if existing_slaves or (existing_masters and not opts.use_existing_master):
        print >> stderr, (
            "ERROR: There are already instances running in " + "group %s or %s" % (master_group.name, slave_group.name)
        )
        sys.exit(1)

    # Figure out Spark AMI
    if opts.ami is None:
        opts.ami = get_spark_ami(opts)
    print "Launching instances..."

    try:
        image = conn.get_all_images(image_ids=[opts.ami])[0]
    except:
        print >> stderr, "Could not find AMI " + opts.ami
        sys.exit(1)

    # Create block device mapping so that we can add an EBS volume if asked to
    block_map = BlockDeviceMapping()
    if opts.ebs_vol_size > 0:
        device = EBSBlockDeviceType()
        device.size = opts.ebs_vol_size
        device.delete_on_termination = True
        block_map["/dev/sdv"] = device

    # Launch slaves
    if opts.spot_price != None:
        zones = get_zones(conn, opts)

        num_zones = len(zones)
        i = 0
        my_req_ids = []

        for zone in zones:
            best_price = find_best_price(conn, opts.instance_type, zone, opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, (
                "Requesting %d slaves as spot instances with price $%.3f/hour each (total $%.3f/hour)"
                % (opts.slaves, best_price, opts.slaves * best_price)
            )

            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(), groups=[slave_group.id], associate_public_ip_address=True
            )
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)

            slave_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=zone,
                count=num_slaves_this_zone,
                key_name=opts.key_pair,
                instance_type=opts.instance_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces,
            )
            my_req_ids += [req.id for req in slave_reqs]
            i += 1

        print >> stderr, "Waiting for spot instances to be granted"
        try:
            while True:
                time.sleep(10)
                reqs = conn.get_all_spot_instance_requests()
                id_to_req = {}
                for r in reqs:
                    id_to_req[r.id] = r
                active_instance_ids = []
                for i in my_req_ids:
                    if i in id_to_req and id_to_req[i].state == "active":
                        active_instance_ids.append(id_to_req[i].instance_id)
                if len(active_instance_ids) == opts.slaves:
                    print >> stderr, "All %d slaves granted" % opts.slaves
                    reservations = conn.get_all_instances(active_instance_ids)
                    slave_nodes = []
                    for r in reservations:
                        slave_nodes += r.instances
                    break
                else:
                    # print >> stderr, ".",
                    print "%d of %d slaves granted, waiting longer" % (len(active_instance_ids), opts.slaves)
        except:
            print >> stderr, "Canceling spot instance requests"
            conn.cancel_spot_instance_requests(my_req_ids)
            # Log a warning if any of these requests actually launched instances:
            (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
            running = len(master_nodes) + len(slave_nodes)
            if running:
                print >> stderr, ("WARNING: %d instances are still running" % running)
            sys.exit(0)
    else:
        # Launch non-spot instances
        zones = get_zones(conn, opts)
        num_zones = len(zones)
        i = 0
        slave_nodes = []
        for zone in zones:
            num_slaves_this_zone = get_partition(opts.slaves, num_zones, i)
            if num_slaves_this_zone > 0:
                slave_res = image.run(
                    key_name=opts.key_pair,
                    security_group_ids=[slave_group.id],
                    instance_type=opts.instance_type,
                    subnet_id=subnetId(),
                    placement=zone,
                    min_count=num_slaves_this_zone,
                    max_count=num_slaves_this_zone,
                    block_device_map=block_map,
                    user_data=user_data,
                    instance_profile_arn=opts.profile,
                )
                slave_nodes += slave_res.instances
                print >> stderr, "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, zone, slave_res.id)
            i += 1

    # Launch or resume masters
    if existing_masters:
        print "Starting master..."
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()
        master_nodes = existing_masters
    else:
        master_type = opts.master_instance_type
        if master_type == "":
            master_type = opts.instance_type
        if opts.zone == "all":
            opts.zone = random.choice(conn.get_all_zones()).name
        if opts.spot_price != None:
            best_price = find_best_price(conn, master_type, opts.zone, opts.spot_price)
            # Launch spot instances with the requested price
            print >> stderr, ("Requesting master as spot instances with price $%.3f/hour" % (best_price))

            interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                subnet_id=subnetId(), groups=[master_group.id], associate_public_ip_address=True
            )
            interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)

            master_reqs = conn.request_spot_instances(
                price=best_price,
                image_id=opts.ami,
                launch_group="launch-group-%s" % cluster_name,
                placement=opts.zone,
                count=1,
                key_name=opts.key_pair,
                instance_type=master_type,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
                network_interfaces=interfaces,
            )
            my_req_ids = [r.id for r in master_reqs]
            print >> stderr, "Waiting for spot instance to be granted"
            try:
                while True:
                    time.sleep(10)
                    reqs = conn.get_all_spot_instance_requests(request_ids=my_req_ids)
                    id_to_req = {}
                    for r in reqs:
                        id_to_req[r.id] = r
                    active_instance_ids = []
                    for i in my_req_ids:
                        if i in id_to_req and id_to_req[i].state == "active":
                            active_instance_ids.append(id_to_req[i].instance_id)
                    if len(active_instance_ids) == 1:
                        print >> stderr, "Master granted"
                        reservations = conn.get_all_instances(active_instance_ids)
                        master_nodes = []
                        for r in reservations:
                            master_nodes += r.instances
                        break
                    else:
                        # print >> stderr, ".",
                        print "%d of %d masters granted, waiting longer" % (len(active_instance_ids), 1)
            except:
                print >> stderr, "Canceling spot instance requests"
                conn.cancel_spot_instance_requests(my_req_ids)
                # Log a warning if any of these requests actually launched instances:
                (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name, die_on_error=False)
                running = len(master_nodes) + len(slave_nodes)
                if running:
                    print >> stderr, ("WARNING: %d instances are still running" % running)
                sys.exit(0)
        else:
            master_res = image.run(
                key_name=opts.key_pair,
                security_group_ids=[master_group.id],
                instance_type=master_type,
                subnet_id=subnetId(),
                placement=opts.zone,
                min_count=1,
                max_count=1,
                block_device_map=block_map,
                user_data=user_data,
                instance_profile_arn=opts.profile,
            )
            master_nodes = master_res.instances
            print >> stderr, "Launched master in %s, regid = %s" % (zone, master_res.id)
    # Return all the instances
    return (master_nodes, slave_nodes)
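
find_best_price is referenced above but not defined in this example. One plausible implementation (an assumption, not the original; the 1.1 bid multiplier is arbitrary) inspects recent spot price history for the zone and caps the bid at the caller's maximum:

def find_best_price(conn, instance_type, zone, max_price):
    # Bid slightly above the lowest recent spot price, never above max_price.
    history = conn.get_spot_price_history(
        instance_type=instance_type,
        availability_zone=zone,
        product_description='Linux/UNIX')
    if not history:
        return max_price
    current = min(h.price for h in history)
    return min(current * 1.1, max_price)
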