else: if private_ip: module.fail_json( msg='private_ip only available with on-demand (non-spot) instances') if boto_supports_param_in_spot_request(ec2, 'placement_group'): params['placement_group'] = placement_group elif placement_group : module.fail_json( msg="placement_group parameter requires Boto version 2.3.0 or higher.") params.update(dict( count = count_remaining, type = spot_type, )) res = ec2.request_spot_instances(spot_price, **params) # Now we have to do the intermediate waiting if wait: spot_req_inst_ids = dict() spot_wait_timeout = time.time() + spot_wait_timeout while spot_wait_timeout > time.time(): reqs = ec2.get_all_spot_instance_requests() for sirb in res: if sirb.id in spot_req_inst_ids: continue for sir in reqs: if sir.id == sirb.id and sir.instance_id is not None: spot_req_inst_ids[sirb.id] = sir.instance_id if len(spot_req_inst_ids) < count: time.sleep(5)
"use a (possibly different) 'instanceid' parameter") else: if private_ip: module.fail_json( msg='private_ip only available with on-demand (non-spot) instances') if boto_supports_param_in_spot_request(ec2, 'placement_group'): params['placement_group'] = placement_group elif placement_group : module.fail_json( msg="placement_group parameter requires Boto version 2.3.0 or higher.") params.update(dict( count = count_remaining, )) res = ec2.request_spot_instances(spot_price, **params) # Now we have to do the intermediate waiting if wait: spot_req_inst_ids = dict() spot_wait_timeout = time.time() + spot_wait_timeout while spot_wait_timeout > time.time(): reqs = ec2.get_all_spot_instance_requests() for sirb in res: if sirb.id in spot_req_inst_ids: continue for sir in reqs: if sir.id == sirb.id and sir.instance_id is not None: spot_req_inst_ids[sirb.id] = sir.instance_id if len(spot_req_inst_ids) < count: time.sleep(5)
image_id=amiId, min_count=numInstancesToLaunch, max_count=numInstancesToLaunch, key_name=keyName, instance_type=instanceType, security_groups=[securityGroupName], instance_profile_arn=iam_profile_resource_name, instance_profile_name=iam_profile_name, dry_run=dryRun) else: spotRequests = ec2.request_spot_instances( price=spotBid, image_id=amiId, count=numInstancesToLaunch, key_name=keyName, instance_type=instanceType, security_groups=[securityGroupName], instance_profile_arn=iam_profile_resource_name, instance_profile_name=iam_profile_name, dry_run=dryRun, ) requestIDs = [request.id for request in spotRequests] fulfilled = [] while requestIDs: requests = ec2.get_all_spot_instance_requests(request_ids=requestIDs, ) for request in requests: if request.instance_id: requestIDs.remove(request.id)
def _launch_instances_abort_on_error(ami,
                                     bid,
                                     count,
                                     instance_type,
                                     subnet,
                                     assign_public_ip,
                                     source_dest_check,
                                     placement_group,
                                     disks,
                                     security_group_ids,
                                     tags=None):
    """Request one-time spot instances, wait for them, then configure them.

    Submits a spot request for ``count`` instances of ``instance_type``
    running ``ami`` at price ``bid``, attached to ``subnet`` through a single
    network interface that carries ``security_group_ids`` and the
    ``assign_public_ip`` flag.  Every entry of ``disks`` (device names, may be
    None) is mapped to the next ephemeral store (``ephemeral0``,
    ``ephemeral1``, ...).

    The requests are polled every 10 seconds.  As soon as any request is in a
    state other than 'open' or 'active', ``abort`` is called.  Once no request
    is 'open' any more, ``sourceDestCheck`` is set to ``source_dest_check`` on
    every launched instance, ``tags`` (if given) are added, and a summary of
    each instance is printed.

    The region and key pair name are read from ``env.ec2_region`` and
    ``env.ec2_key_pair_name``.
    """
    bdm = boto.ec2.blockdevicemapping.BlockDeviceMapping()
    if disks is not None:
        for ephemeral_idx, disk in enumerate(disks):
            bdm[disk] = boto.ec2.blockdevicemapping.BlockDeviceType(
                ephemeral_name='ephemeral%d' % ephemeral_idx)

    nics = boto.ec2.networkinterface.NetworkInterfaceCollection()
    nics.append(boto.ec2.networkinterface.NetworkInterfaceSpecification(
        subnet_id=subnet,
        groups=security_group_ids,
        associate_public_ip_address=assign_public_ip))

    ec2 = boto.ec2.connect_to_region(env.ec2_region)
    sirs = ec2.request_spot_instances(
        price=bid,
        image_id=ami,
        count=count,
        type='one-time',
        key_name=env.ec2_key_pair_name,
        instance_type=instance_type,
        monitoring_enabled=True,
        placement_group=placement_group,
        block_device_map=bdm,
        network_interfaces=nics)
    request_ids = [sir.id for sir in sirs]

    instance_ids = set()
    while True:
        time.sleep(10)
        done = True
        for sir in ec2.get_all_spot_instance_requests(request_ids):
            # Single-argument print(...) parses identically as a Python 2
            # print statement and as a Python 3 function call.
            print('State: %s' % sir.state)
            print('Fault: %s' % sir.fault)
            print('Status: %s' % sir.status.message)
            if sir.state not in ('open', 'active'):
                abort('Failed to launch instances')
            if sir.state == 'open':
                # At least one request is still pending: poll again.
                done = False
            if sir.state == 'active':
                instance_ids.add(sir.instance_id)
        if done:
            break

    for instance_id in instance_ids:
        ec2.modify_instance_attribute(
            instance_id=instance_id,
            attribute='sourceDestCheck',
            value=source_dest_check)

    print('')
    print('Instances:')
    for reservation in ec2.get_all_instances(list(instance_ids)):
        for instance in reservation.instances:
            if tags is not None:
                instance.add_tags(tags)
            print(' %s' % instance.id)
            print(' type: %s' % instance.instance_type)
            print(' internal ip: %s' % instance.private_ip_address)
            print(' public ip: %s' % instance.ip_address)
            print(' tags:')
            # Sort by tag name; a key function replaces the cmp-style
            # comparator, which Python 3 no longer accepts.
            for tag, value in sorted(instance.tags.items(),
                                     key=lambda item: item[0]):
                print(' %s: %s' % (tag, value))
min_count=numInstancesToLaunch, max_count=numInstancesToLaunch, key_name=keyName, instance_type=instanceType, security_groups=[securityGroupName], instance_profile_arn=iam_profile_resource_name, instance_profile_name=iam_profile_name, dry_run=dryRun ) else: spotRequests = ec2.request_spot_instances( price=spotBid, image_id=amiID, count=numInstancesToLaunch, key_name=keyName, instance_type=instanceType, security_groups=[securityGroupName], instance_profile_arn=iam_profile_resource_name, instance_profile_name=iam_profile_name, dry_run=dryRun, ) requestIDs = [request.id for request in spotRequests] fulfilled = [] while requestIDs: requests = ec2.get_all_spot_instance_requests( request_ids=requestIDs, ) for request in requests:
def launch_spot_instance(id, profile, spot_wait_sleep=5, instance_wait_sleep=3):
    """Launch (or reuse) a single spot instance described by ``profile``.

    Args:
        id: Name for this launch.  Used as the ``Name`` tag of the spot
            request and the instance, and to derive the key pair name
            (``KP-<id>``) and security group name (``SG-<id>``) when the
            profile does not supply them.
        profile: Dict of launch settings.  Keys read here: ``region``,
            ``availability_zone``, ``price``, ``image_id``, ``type``,
            ``disk_size``, ``disk_delete_on_termination``; ``firewall``
            (iterable of ``(proto, fromport, toport, ip)``) when a security
            group has to be created; optional ``key_pair`` as
            ``(name, pem_file)``; optional ``security_group`` as
            ``(id, name)``; optional ``instance_profile_arn``.  The dict is
            mutated: ``key_pair`` and ``security_group`` are filled in when
            missing.
        spot_wait_sleep: Seconds between polls of the spot request.
        instance_wait_sleep: Seconds between polls of the instance state.

    Returns:
        The boto instance object, in state ``running``.

    Raises:
        Exception: more than one open/active spot request already exists for
            the security group, the spot request failed (or was cancelled or
            closed before producing an instance), or the instance ended up in
            a state other than ``running``.
        boto.exception.EC2ResponseError: any EC2 error other than a duplicate
            key pair / security group.

    Progress is written with ``sys.stderr.write`` rather than the Python 2
    ``print >>`` statement so the function also parses on Python 3; the
    bytes written to stderr are unchanged.
    """
    ec2 = boto.ec2.connect_to_region(profile['region'])

    if 'key_pair' not in profile:
        key_name = 'KP-' + id
        # The original message indexed profile['key_pair'], which by
        # definition does not exist on this branch and raised KeyError.
        print('key pair not specified in profile, using {0}'.format(key_name))
        profile['key_pair'] = (key_name, key_name + '.pem')
        try:
            sys.stderr.write('Creating key pair...')
            keypair = ec2.create_key_pair(key_name)
            keypair.save('.')
            sys.stderr.write(' created\n')
        except boto.exception.EC2ResponseError as e:
            if e.code == 'InvalidKeyPair.Duplicate':
                sys.stderr.write(' already exists\n')
            else:
                # Bare raise keeps the original traceback.
                raise

    if 'security_group' not in profile:
        group_name = 'SG-' + id
        try:
            sys.stderr.write('Creating security group...')
            sc = ec2.create_security_group(group_name,
                                           'Security Group for ' + id)
            for proto, fromport, toport, ip in profile['firewall']:
                sc.authorize(proto, fromport, toport, ip)
            profile['security_group'] = (sc.id, sc.name)
            sys.stderr.write(' created\n')
        except boto.exception.EC2ResponseError as e:
            if e.code == 'InvalidGroup.Duplicate':
                sys.stderr.write(' already exists\n')
                sc = ec2.get_all_security_groups(groupnames=[group_name])[0]
                profile['security_group'] = (sc.id, sc.name)
            else:
                raise

    existing_requests = ec2.get_all_spot_instance_requests(
        filters={
            'launch.group-id': profile['security_group'][0],
            'state': ['open', 'active']
        })

    if existing_requests:
        if len(existing_requests) > 1:
            raise Exception('Too many existing spot requests')
        sys.stderr.write('Reusing existing spot request\n')
        spot_req_id = existing_requests[0].id
    else:
        bdm = boto.ec2.blockdevicemapping.BlockDeviceMapping()
        bdm['/dev/sda1'] = boto.ec2.blockdevicemapping.BlockDeviceType(
            volume_type='gp2',
            size=profile['disk_size'],
            delete_on_termination=profile['disk_delete_on_termination'])
        bdm['/dev/sdb'] = boto.ec2.blockdevicemapping.BlockDeviceType(
            ephemeral_name='ephemeral0')

        sys.stderr.write('Requesting spot instance\n')
        spot_reqs = ec2.request_spot_instances(
            price=profile['price'],
            image_id=profile['image_id'],
            instance_type=profile['type'],
            placement=profile['region'] + profile['availability_zone'],
            security_groups=[profile['security_group'][1]],
            key_name=profile['key_pair'][0],
            block_device_map=bdm,
            # Overridable through the profile; defaults to the value that
            # used to be hard-coded here.
            instance_profile_arn=profile.get(
                'instance_profile_arn',
                'arn:aws:iam::720533437540:instance-profile/ec2_ml'))
        spot_req_id = spot_reqs[0].id

    sys.stderr.write('Waiting for launch')
    instance_id = None
    spot_tag_added = False
    while not instance_id:
        spot_req = ec2.get_all_spot_instance_requests(
            request_ids=[spot_req_id])[0]
        if not spot_tag_added:
            spot_req.add_tag('Name', id)
            spot_tag_added = True
        instance_id = spot_req.instance_id
        # A request that is cancelled/closed without ever getting an
        # instance will never be fulfilled; treat it like 'failed' instead
        # of polling forever.
        if spot_req.state == 'failed' or (
                not instance_id
                and spot_req.state in ('cancelled', 'closed')):
            raise Exception(
                'Spot request failed - {0}'.format(spot_req.status))
        if not instance_id:
            sys.stderr.write(' .')
            time.sleep(spot_wait_sleep)
    sys.stderr.write('\n')

    sys.stderr.write('Retrieving instance by id\n')
    reservations = ec2.get_all_instances(instance_ids=[instance_id])
    instance = reservations[0].instances[0]
    instance.add_tag('Name', id)
    sys.stderr.write(
        'Got instance: ' + str(instance.id) + ' [' + instance.state + ']\n')

    sys.stderr.write('Waiting for instance to boot')
    while instance.state not in ('running', 'terminated', 'shutting-down'):
        sys.stderr.write(' .')
        time.sleep(instance_wait_sleep)
        instance.update()
    sys.stderr.write('\n')

    if instance.state != 'running':
        raise Exception('Instance was terminated')
    return instance
else: raise # Now start up the instance. The run_instances and spot_instances # methods have many, many parameters but these are all we need # for now. if spot: requests = ec2.request_spot_instances(ec2InstanceRates[instance_type]['hourly'] * .5, ami, # ami count=count, # count type='one-time', key_name=key_name, # key_name security_group_ids=[group.id], subnet_id='subnet-a3cb3bfa', user_data=user_data, instance_type=instance_type, #instance type # # Currently unused options # valid_from=None, valid_until=None, launch_group=None, # availability_zone_group=None, # addressing_type=None, # only the shadow knows # placement=None, kernel_id=None, ramdisk_id=None, # monitoring_enabled=False, subnet_id=None, placement_group=None, # instance_profile_arn=None, instance_profile_name=None, # security_group_ids=None, ebs_optimized=False, network_interfaces=None, block_device_map=bdm) # get the request ids to wait on request_ids = [req.id for req in requests] # wait for the requests to be fulfilled instance_ids = wait_for_fulfillment(ec2, request_ids, copy.deepcopy(request_ids)) reservation = ec2.get_all_reservations(instance_ids=instance_ids)
if int(port) != int(1194): try: mgroup = ec2.get_all_security_groups(groupnames=[group_name])[0] mgroup.authorize('udp', vpn_port, vpn_port, cidr) except ec2.ResponseError, e: if e.code == 'InvalidPermission.Duplicate': '''fail here''' else: raise spot_request = ec2.request_spot_instances( price="0.005", count=1, image_id=ami, key_name=key_name, security_groups=[group_name], instance_type=instance_type, user_data=user_data, )[0] while True: eprint("Waiting. spot request status: '%s', state: '%s'" % (spot_request.state, spot_request.status.code)) if spot_request.state == 'active' and spot_request.status.code == 'fulfilled': break time.sleep(10) spot_request = ec2.get_all_spot_instance_requests( request_ids=[spot_request.id])[0] while True: instance = ec2.get_all_instances(
def create_instances(module, ec2, vpc, override_count=None):
    """
    Creates new instances

    module : AnsibleModule object
    ec2: authenticated ec2 connection object
    vpc: VPC connection object; required (truthy) when vpc_subnet_id is set
    override_count: when truthy, used instead of the 'count' module parameter

    Returns:
        A tuple (instance_dict_array, created_instance_ids, changed):
        a list of dictionaries with instance information (as produced by
        get_instance_info), the list of the matching instance ids, and a
        boolean that is False only when no additional instance was needed.
        Instances already running under the same client token ('id') are
        included in the first two elements.

    Failures are reported through module.fail_json.
    """
    key_name = module.params.get('key_name')
    id = module.params.get('id')
    group_name = module.params.get('group')
    group_id = module.params.get('group_id')
    zone = module.params.get('zone')
    instance_type = module.params.get('instance_type')
    tenancy = module.params.get('tenancy')
    spot_price = module.params.get('spot_price')
    spot_type = module.params.get('spot_type')
    image = module.params.get('image')
    if override_count:
        count = override_count
    else:
        count = module.params.get('count')
    monitoring = module.params.get('monitoring')
    kernel = module.params.get('kernel')
    ramdisk = module.params.get('ramdisk')
    wait = module.params.get('wait')
    wait_timeout = int(module.params.get('wait_timeout'))
    spot_wait_timeout = int(module.params.get('spot_wait_timeout'))
    placement_group = module.params.get('placement_group')
    user_data = module.params.get('user_data')
    instance_tags = module.params.get('instance_tags')
    vpc_subnet_id = module.params.get('vpc_subnet_id')
    assign_public_ip = module.boolean(module.params.get('assign_public_ip'))
    private_ip = module.params.get('private_ip')
    instance_profile_name = module.params.get('instance_profile_name')
    volumes = module.params.get('volumes')
    ebs_optimized = module.params.get('ebs_optimized')
    exact_count = module.params.get('exact_count')
    count_tag = module.params.get('count_tag')
    source_dest_check = module.boolean(module.params.get('source_dest_check'))
    termination_protection = module.boolean(
        module.params.get('termination_protection'))
    network_interfaces = module.params.get('network_interfaces')
    spot_launch_group = module.params.get('spot_launch_group')
    instance_initiated_shutdown_behavior = module.params.get(
        'instance_initiated_shutdown_behavior')

    # group_id and group_name are exclusive of each other
    if group_id and group_name:
        module.fail_json(msg=str(
            "Use only one type of parameter (group_name) or (group_id)"))

    vpc_id = None
    if vpc_subnet_id:
        if not vpc:
            module.fail_json(msg="region must be specified")
        else:
            vpc_id = vpc.get_all_subnets(subnet_ids=[vpc_subnet_id])[0].vpc_id

    try:
        # Here we try to lookup the group id from the security group name - if group is set.
        if group_name:
            if vpc_id:
                grp_details = ec2.get_all_security_groups(
                    filters={'vpc_id': vpc_id})
            else:
                grp_details = ec2.get_all_security_groups()
            if isinstance(group_name, basestring):
                group_name = [group_name]
            unmatched = set(group_name).difference(
                str(grp.name) for grp in grp_details)
            if len(unmatched) > 0:
                module.fail_json(
                    msg="The following group names are not valid: %s" %
                    ', '.join(unmatched))
            group_id = [
                str(grp.id) for grp in grp_details
                if str(grp.name) in group_name
            ]
        # Now we try to lookup the group id testing if group exists.
        elif group_id:
            # wrap the group_id in a list if it's not one already
            if isinstance(group_id, basestring):
                group_id = [group_id]
            grp_details = ec2.get_all_security_groups(group_ids=group_id)
            group_name = [grp_item.name for grp_item in grp_details]
    except boto.exception.NoAuthHandlerFound as e:
        module.fail_json(msg=str(e))

    # Lookup any instances that match our run id.
    running_instances = []
    count_remaining = int(count)

    if id is not None:
        filter_dict = {'client-token': id, 'instance-state-name': 'running'}
        previous_reservations = ec2.get_all_instances(None, filter_dict)
        for res in previous_reservations:
            for prev_instance in res.instances:
                running_instances.append(prev_instance)
        count_remaining = count_remaining - len(running_instances)

    # Both min_count and max_count equal count parameter. This means the launch request is explicit (we want count, or fail) in how many instances we want.
    if count_remaining == 0:
        changed = False
    else:
        changed = True
        try:
            params = {
                'image_id': image,
                'key_name': key_name,
                'monitoring_enabled': monitoring,
                'placement': zone,
                'instance_type': instance_type,
                'kernel_id': kernel,
                'ramdisk_id': ramdisk,
                'user_data': user_data
            }

            if ebs_optimized:
                params['ebs_optimized'] = ebs_optimized

            # 'tenancy' always has a default value, but it is not a valid parameter for spot instance request
            if not spot_price:
                params['tenancy'] = tenancy

            if boto_supports_profile_name_arg(ec2):
                params['instance_profile_name'] = instance_profile_name
            else:
                if instance_profile_name is not None:
                    module.fail_json(
                        msg=
                        "instance_profile_name parameter requires Boto version 2.5.0 or higher"
                    )

            if assign_public_ip:
                if not boto_supports_associate_public_ip_address(ec2):
                    module.fail_json(
                        msg=
                        "assign_public_ip parameter requires Boto version 2.13.0 or higher."
                    )
                elif not vpc_subnet_id:
                    module.fail_json(
                        msg="assign_public_ip only available with vpc_subnet_id"
                    )
                else:
                    if private_ip:
                        interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                            subnet_id=vpc_subnet_id,
                            private_ip_address=private_ip,
                            groups=group_id,
                            associate_public_ip_address=assign_public_ip)
                    else:
                        interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                            subnet_id=vpc_subnet_id,
                            groups=group_id,
                            associate_public_ip_address=assign_public_ip)
                    interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
                        interface)
                    params['network_interfaces'] = interfaces
            else:
                if network_interfaces:
                    if isinstance(network_interfaces, basestring):
                        network_interfaces = [network_interfaces]
                    interfaces = []
                    for i, network_interface_id in enumerate(
                            network_interfaces):
                        interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
                            network_interface_id=network_interface_id,
                            device_index=i)
                        interfaces.append(interface)
                    params['network_interfaces'] = \
                        boto.ec2.networkinterface.NetworkInterfaceCollection(*interfaces)
                else:
                    params['subnet_id'] = vpc_subnet_id
                    if vpc_subnet_id:
                        params['security_group_ids'] = group_id
                    else:
                        params['security_groups'] = group_name

            if volumes:
                bdm = BlockDeviceMapping()
                for volume in volumes:
                    if 'device_name' not in volume:
                        module.fail_json(
                            msg='Device name must be set for volume')
                    # Minimum volume size is 1GB. We'll use volume size explicitly set to 0
                    # to be a signal not to create this volume
                    if 'volume_size' not in volume or int(
                            volume['volume_size']) > 0:
                        bdm[volume['device_name']] = create_block_device(
                            module, ec2, volume)

                params['block_device_map'] = bdm

            # check to see if we're using spot pricing first before starting instances
            if not spot_price:
                if assign_public_ip and private_ip:
                    # The private IP is already carried by the network
                    # interface specification built above.
                    params.update(
                        dict(
                            min_count=count_remaining,
                            max_count=count_remaining,
                            client_token=id,
                            placement_group=placement_group,
                        ))
                else:
                    params.update(
                        dict(
                            min_count=count_remaining,
                            max_count=count_remaining,
                            client_token=id,
                            placement_group=placement_group,
                            private_ip_address=private_ip,
                        ))

                # For ordinary (not spot) instances, we can select 'stop'
                # (the default) or 'terminate' here.
                params[
                    'instance_initiated_shutdown_behavior'] = instance_initiated_shutdown_behavior or 'stop'

                res = ec2.run_instances(**params)
                instids = [i.id for i in res.instances]
                while True:
                    try:
                        ec2.get_all_instances(instids)
                        break
                    except boto.exception.EC2ResponseError as e:
                        if "<Code>InvalidInstanceID.NotFound</Code>" in str(e):
                            # there's a race between start and get an instance
                            continue
                        else:
                            module.fail_json(msg=str(e))

                # The instances returned through ec2.run_instances above can be in
                # terminated state due to idempotency. See commit 7f11c3d for a complete
                # explanation.
                terminated_instances = [
                    str(instance.id) for instance in res.instances
                    if instance.state == 'terminated'
                ]
                if terminated_instances:
                    module.fail_json(
                        msg="Instances with id(s) %s " % terminated_instances +
                        "were created previously but have since been terminated - "
                        + "use a (possibly different) 'instanceid' parameter")

            else:
                if private_ip:
                    module.fail_json(
                        msg=
                        'private_ip only available with on-demand (non-spot) instances'
                    )
                if boto_supports_param_in_spot_request(ec2, 'placement_group'):
                    params['placement_group'] = placement_group
                elif placement_group:
                    module.fail_json(
                        msg=
                        "placement_group parameter requires Boto version 2.3.0 or higher."
                    )

                # You can't tell spot instances to 'stop'; they will always be
                # 'terminate'd. For convenience, we'll ignore the latter value.
                if instance_initiated_shutdown_behavior and instance_initiated_shutdown_behavior != 'terminate':
                    module.fail_json(
                        msg=
                        "instance_initiated_shutdown_behavior=stop is not supported for spot instances."
                    )

                if spot_launch_group and isinstance(spot_launch_group,
                                                    basestring):
                    params['launch_group'] = spot_launch_group

                params.update(dict(
                    count=count_remaining,
                    type=spot_type,
                ))
                res = ec2.request_spot_instances(spot_price, **params)

                # Now we have to do the intermediate waiting
                if wait:
                    instids = await_spot_requests(module, ec2, res, count)
                else:
                    # Without waiting no instance ids are known yet; an empty
                    # list keeps the code below from hitting an unbound name.
                    instids = []
        except boto.exception.BotoServerError as e:
            module.fail_json(msg="Instance creation failed => %s: %s" %
                             (e.error_code, e.error_message))

        # wait here until the instances are up
        num_running = 0
        wait_timeout = time.time() + wait_timeout
        # Pre-bind so the code after the loop is safe even when the loop body
        # never runs (no instance ids, or the timeout has already expired).
        res_list = ()
        while wait_timeout > time.time() and num_running < len(instids):
            try:
                res_list = ec2.get_all_instances(instids)
            except boto.exception.BotoServerError as e:
                if e.error_code == 'InvalidInstanceID.NotFound':
                    time.sleep(1)
                    continue
                else:
                    raise

            num_running = 0
            for res in res_list:
                num_running += len(
                    [i for i in res.instances if i.state == 'running'])
            if len(res_list) <= 0:
                # got a bad response of some sort, possibly due to
                # stale/cached data. Wait a second and then try again
                time.sleep(1)
                continue
            if wait and num_running < len(instids):
                time.sleep(5)
            else:
                break

        if wait and wait_timeout <= time.time():
            # waiting took too long
            module.fail_json(msg="wait for instances running timeout on %s" %
                             time.asctime())

        # We do this after the loop ends so that we end up with one list
        new_instances = []
        for res in res_list:
            new_instances.extend(res.instances)
        running_instances.extend(new_instances)

        # Apply the attribute changes to every newly launched instance, not
        # only to those of whichever reservation the loop variable last held.

        # Enabled by default by AWS
        if source_dest_check is False:
            for inst in new_instances:
                inst.modify_attribute('sourceDestCheck', False)

        # Disabled by default by AWS
        if termination_protection is True:
            for inst in new_instances:
                inst.modify_attribute('disableApiTermination', True)

        # Leave this as late as possible to try and avoid InvalidInstanceID.NotFound
        # (skipped when there are no instance ids to tag)
        if instance_tags and instids:
            try:
                ec2.create_tags(instids, instance_tags)
            except boto.exception.EC2ResponseError as e:
                module.fail_json(msg="Instance tagging failed => %s: %s" %
                                 (e.error_code, e.error_message))

    instance_dict_array = []
    created_instance_ids = []
    for inst in running_instances:
        inst.update()
        d = get_instance_info(inst)
        created_instance_ids.append(inst.id)
        instance_dict_array.append(d)

    return (instance_dict_array, created_instance_ids, changed)
else: raise # Now start up the instance. The run_instances and spot_instances # methods have many, many parameters but these are all we need # for now. if spot: requests = ec2.request_spot_instances( ec2InstanceRates[instance_type]['hourly'] * .5, ami, # ami count=count, # count type='one-time', key_name=key_name, # key_name security_groups=[group_name], user_data=user_data, instance_type=instance_type, #instance type # # Currently unused options # valid_from=None, valid_until=None, launch_group=None, # availability_zone_group=None, # addressing_type=None, # only the shadow knows # placement=None, kernel_id=None, ramdisk_id=None, # monitoring_enabled=False, subnet_id=None, placement_group=None, # instance_profile_arn=None, instance_profile_name=None, # security_group_ids=None, ebs_optimized=False, network_interfaces=None, block_device_map=bdm) # get the request ids to wait on request_ids = [req.id for req in requests] # wait for the requests to be fulfilled instance_ids = wait_for_fulfillment(ec2, request_ids, copy.deepcopy(request_ids)) reservation = ec2.get_all_reservations(instance_ids=instance_ids)
def start_node():
    """Launch the EC2 instance (or spot request) for one SLURM compute node.

    The node name is taken from ``argv[1]``.  Launch settings come from
    ``ClusterConfiguration.from_config()``; when ``compute_bid_price`` is set
    a spot request valid for 24 hours is submitted, otherwise an on-demand
    instance is started and tagged (tagging is retried up to 10 times).

    Returns 1 on a usage error or when the EC2 connection cannot be made,
    0 otherwise.
    """
    start_logging()

    print(" ".join(argv))

    if len(argv) != 2:
        print("Usage: %s <nodename>" % (argv[0], ), file=sys.stderr)
        return 1
    nodename = argv[1]

    cc = ClusterConfiguration.from_config()
    region = get_region()
    ec2 = boto.ec2.connect_to_region(region)
    if not ec2:
        print("Could not connect to EC2 endpoint in region %r" % (region, ),
              file=sys.stderr)
        return 1

    launch_args = {}
    slurm_s3_root = cc.slurm_s3_root

    # Fall back to the stock Amazon Linux image for the region.
    if cc.compute_ami is not None:
        launch_args['image_id'] = cc.compute_ami
    else:
        launch_args['image_id'] = amazon_linux_ami[region]

    if cc.instance_profile is not None:
        # An ARN and a plain profile name go into different parameters.
        profile_key = ('instance_profile_arn'
                       if cc.instance_profile.startswith("arn:")
                       else 'instance_profile_name')
        launch_args[profile_key] = cc.instance_profile

    launch_args['key_name'] = cc.key_name
    launch_args['instance_type'] = cc.compute_instance_type

    if cc.compute_bid_price is not None:
        end = time() + 24 * 60 * 60  # FIXME: Don't hardcode this.
        launch_args['price'] = cc.compute_bid_price
        launch_args['valid_until'] = strftime("%Y-%m-%dT%H:%M:%SZ",
                                              gmtime(end))

    node_address = cc.get_address_for_nodename(nodename)
    node_subnet = cc.get_subnet_for_address(node_address)

    if cc.compute_os_packages is not None:
        os_packages = cc.compute_os_packages
    else:
        os_packages = []
    if cc.compute_external_packages is not None:
        external_packages = cc.compute_external_packages
    else:
        external_packages = []

    script_vars = {
        "region": region,
        "nodename": nodename,
        "os_packages": " ".join(os_packages),
        "external_packages": " ".join(external_packages),
        "slurm_ec2_conf": cc.slurm_ec2_configuration,
        "slurm_s3_root": slurm_s3_root,
    }
    launch_args['user_data'] = b64encode(init_script % script_vars)

    # Map the ethernet interface to the correct IP address
    primary_nic = NetworkInterfaceSpecification(
        associate_public_ip_address=True,
        delete_on_termination=True,
        device_index=0,
        groups=cc.security_groups,
        private_ip_address=str(node_address),
        subnet_id=node_subnet.id)
    launch_args['network_interfaces'] = NetworkInterfaceCollection(
        primary_nic)

    # Root volume plus one mapping per ephemeral store of this instance type
    # (/dev/sdb, /dev/sdc, ... -> ephemeral0, ephemeral1, ...).
    mapping = BlockDeviceMapping()
    mapping['/dev/xvda'] = BlockDeviceType(size=32, volume_type="gp2")
    for index, _ in enumerate(cc.ephemeral_stores[cc.compute_instance_type]):
        mapping["/dev/sd" + chr(ord('b') + index)] = BlockDeviceType(
            ephemeral_name="ephemeral%d" % index)
    launch_args['block_device_map'] = mapping

    if cc.compute_bid_price is not None:
        print("request_spot_instances: %r" % launch_args, file=sys.stderr)
        spot_requests = ec2.request_spot_instances(**launch_args)
        print("requests: %s" % " ".join([req.id for req in spot_requests]))
        return 0

    print("run_instances: %r" % launch_args)
    reservation = ec2.run_instances(**launch_args)
    tags = {
        'SLURMHostname': nodename,
        'SLURMS3Root': slurm_s3_root,
        'Name': "SLURM Computation Node %s" % nodename,
    }
    launched_ids = [instance.id for instance in reservation.instances]
    print("instances: %s" % " ".join(launched_ids))

    # create-tags can fail at times since the tag resource database is
    # a bit behind EC2's actual state.
    for attempt in xrange(10):
        try:
            ec2.create_tags(launched_ids, tags)
        except Exception as e:
            print("Failed to tag instance: %s" % e, file=sys.stderr)
            sleep(0.5 * attempt)
        else:
            break

    return 0
def start_node():
    """Start the compute node named on the command line.

    Reads the node name from ``argv[1]`` and the cluster settings from
    ``ClusterConfiguration.from_config()``, assembles the launch parameters
    (image, instance profile, key, type, user data, network interface and
    block devices) and then either runs an on-demand instance and tags it,
    or, when a bid price is configured, submits a spot request that expires
    after 24 hours.

    Returns 0 once the launch call has been issued; 1 when the arguments are
    wrong or no EC2 connection could be obtained.
    """
    start_logging()

    print(" ".join(argv))

    if len(argv) != 2:
        print("Usage: %s <nodename>" % (argv[0],), file=sys.stderr)
        return 1

    nodename = argv[1]
    cc = ClusterConfiguration.from_config()
    region = get_region()

    ec2 = boto.ec2.connect_to_region(region)
    if not ec2:
        print("Could not connect to EC2 endpoint in region %r" % (region,),
              file=sys.stderr)
        return 1

    request = {}
    slurm_s3_root = cc.slurm_s3_root

    image_id = cc.compute_ami
    if image_id is None:
        # No cluster-specific AMI configured: use the per-region default.
        image_id = amazon_linux_ami[region]
    request['image_id'] = image_id

    if cc.instance_profile is not None:
        if cc.instance_profile.startswith("arn:"):
            request['instance_profile_arn'] = cc.instance_profile
        else:
            request['instance_profile_name'] = cc.instance_profile

    request['key_name'] = cc.key_name
    request['instance_type'] = cc.compute_instance_type

    use_spot = cc.compute_bid_price is not None
    if use_spot:
        end = time() + 24 * 60 * 60  # FIXME: Don't hardcode this.
        request['price'] = cc.compute_bid_price
        request['valid_until'] = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime(end))

    node_address = cc.get_address_for_nodename(nodename)
    node_subnet = cc.get_subnet_for_address(node_address)

    rendered_script = init_script % {
        "region": region,
        "nodename": nodename,
        "os_packages": " ".join(
            [] if cc.compute_os_packages is None
            else cc.compute_os_packages),
        "external_packages": " ".join(
            [] if cc.compute_external_packages is None
            else cc.compute_external_packages),
        "slurm_ec2_conf": cc.slurm_ec2_configuration,
        "slurm_s3_root": slurm_s3_root,
    }
    request['user_data'] = b64encode(rendered_script)

    # Map the ethernet interface to the correct IP address
    eth0 = NetworkInterfaceSpecification(
        associate_public_ip_address=True, delete_on_termination=True,
        device_index=0, groups=cc.security_groups,
        private_ip_address=str(node_address), subnet_id=node_subnet.id)
    request['network_interfaces'] = NetworkInterfaceCollection(eth0)

    # Attach any ephemeral storage devices
    block_devices = BlockDeviceMapping()
    block_devices['/dev/xvda'] = BlockDeviceType(size=32, volume_type="gp2")
    ephemeral_devices = cc.ephemeral_stores[cc.compute_instance_type]
    for n, _device in enumerate(ephemeral_devices):
        # Device letters start at 'b' for ephemeral0.
        drive_letter = chr(ord('b') + n)
        block_devices["/dev/sd" + drive_letter] = BlockDeviceType(
            ephemeral_name="ephemeral%d" % n)
    request['block_device_map'] = block_devices

    if not use_spot:
        print("run_instances: %r" % request)
        reservation = ec2.run_instances(**request)
        tags = {
            'SLURMHostname': nodename,
            'SLURMS3Root': slurm_s3_root,
            'Name': "SLURM Computation Node %s" % nodename,
        }
        instance_ids = [instance.id for instance in reservation.instances]
        print("instances: %s" % " ".join(instance_ids))

        # create-tags can fail at times since the tag resource database is
        # a bit behind EC2's actual state.
        for i in xrange(10):
            try:
                ec2.create_tags(instance_ids, tags)
                break
            except Exception as e:
                print("Failed to tag instance: %s" % e, file=sys.stderr)
                sleep(0.5 * i)
    else:
        print("request_spot_instances: %r" % request, file=sys.stderr)
        requests = ec2.request_spot_instances(**request)
        request_ids = [spot_request.id for spot_request in requests]
        print("requests: %s" % " ".join(request_ids))

    return 0