def prepare_template(self, availability_zone: str, subnet_id: str, on_demand: bool, key_name: str):
    """Builds the CloudFormation template from "create_ami.yaml".

    The base template is loaded and then adjusted according to the arguments:

    * key_name: when empty, the "KeyName" parameter and the launch template's
      "KeyName" property are removed.
    * availability_zone: when set, it is written to the launch template's
      "Placement" property.
    * subnet_id: when set, a single network interface (device index 0) in that
      subnet is written to the launch template.
    * on_demand: when true, "InstanceMarketOptions" is removed from the launch
      template.

    Returns the adjusted template dumped back to a YAML string.
    """
    with open(data_dir('create_ami.yaml')) as template_file:
        template = yaml.load(template_file, Loader=CfnYamlLoader)

    def launch_template_data():
        # resolved lazily so the template is only traversed when an adjustment is really made
        return template['Resources']['SpotInstanceLaunchTemplate']['Properties']['LaunchTemplateData']

    # without a key the template must not declare or reference the key parameter
    if not key_name:
        del template['Parameters']['KeyName']
        del launch_template_data()['KeyName']

    # pin the instance to the requested availability zone
    if availability_zone:
        launch_template_data()['Placement'] = {'AvailabilityZone': availability_zone}

    # attach the primary network interface to the requested subnet
    if subnet_id:
        primary_interface = {'SubnetId': subnet_id, 'DeviceIndex': 0}
        launch_template_data()['NetworkInterfaces'] = [primary_interface]

    # an on-demand instance is requested by dropping the market options
    if on_demand:
        del launch_template_data()['InstanceMarketOptions']

    return yaml.dump(template, Dumper=CfnYamlDumper)
def create_or_update_instance_profile(cf, output: AbstractOutputWriter):
    """Creates the instance profile stack, or updates it if it already exists.

    The instance profile lives in its own stack ("spotty-instance-profile")
    because creating an instance profile resource takes about 2 minutes.

    Args:
        cf: CloudFormation client used for all stack operations.
        output: writer used to report progress to the user.

    Returns:
        The value of the stack's "ProfileArn" output.

    Raises:
        ClientError: if the stack update fails with any error code other than
            "ValidationError".
        ValueError: if the stack does not end up in the "CREATE_COMPLETE" or
            "UPDATE_COMPLETE" status.
    """
    stack_name = 'spotty-instance-profile'

    with open(data_dir('create_instance_profile.yaml')) as template_file:
        template_body = template_file.read()

    # arguments shared by the create and the update calls
    stack_args = {
        'StackName': stack_name,
        'TemplateBody': template_body,
        'Capabilities': ['CAPABILITY_IAM'],
    }

    waiter_name = None
    if stack_exists(cf, stack_name):
        try:
            response = cf.update_stack(**stack_args)
        except ClientError as e:
            # a "ValidationError" is tolerated and the existing stack is used as is
            # (presumably raised when there is nothing to update - confirm)
            if e.response.get('Error', {}).get('Code', 'Unknown') != 'ValidationError':
                raise e
            response = None

        if response:
            output.write('Updating IAM role for the instance...')
            waiter_name = 'stack_update_complete'
    else:
        output.write('Creating IAM role for the instance...')
        response = cf.create_stack(OnFailure='DELETE', **stack_args)
        waiter_name = 'stack_create_complete'

    # wait for the stack operation to finish (only if one was started)
    if waiter_name:
        cf.get_waiter(waiter_name).wait(StackName=response['StackId'], WaiterConfig={'Delay': 10})

    stack_info = cf.describe_stacks(StackName=stack_name)['Stacks'][0]
    if stack_info['StackStatus'] not in ('CREATE_COMPLETE', 'UPDATE_COMPLETE'):
        raise ValueError('Stack "%s" failed.\n'
                         'Please, see CloudFormation logs for the details.' % stack_name)

    profile_arns = [stack_output['OutputValue'] for stack_output in stack_info['Outputs']
                    if stack_output['OutputKey'] == 'ProfileArn']

    return profile_arns[0]
def prepare_template(self, ec2, availability_zone: str, subnet_id: str, instance_type: str, volumes: list,
                     ports: list, max_price, docker_commands):
    """Prepares CloudFormation template to run a Spot Instance.

    The base template is read from "run_container.yaml" and adjusted according
    to the arguments.

    Args:
        ec2: EC2 client, passed to the volume lookup and the spot price check.
        availability_zone: requested availability zone (may be empty). It is
            replaced by the zone reported for a volume if one is reported.
        subnet_id: subnet for the primary network interface (may be empty).
        instance_type: instance type, used only for the spot price check.
        volumes: volume configurations; each one gets its own device letter.
        ports: TCP ports to open for IPv4 and IPv6 (port 22 is skipped).
        max_price: maximum spot price; no price check or limit if falsy.
        docker_commands: content for "/tmp/docker/docker_commands.sh"; the
            template's value is kept if falsy.

    Returns:
        The adjusted template dumped to a YAML string.

    Raises:
        ValueError: if more volumes are configured than there are device
            letters, if the availability zones of the configuration and the
            volumes contradict each other, or if the current spot price is
            higher than "max_price".
    """
    # read and update CF template
    with open(data_dir('run_container.yaml')) as f:
        template = yaml.load(f, Loader=CfnYamlLoader)

    resources = template['Resources']
    launch_template = resources['SpotInstanceLaunchTemplate']
    launch_template_data = launch_template['Properties']['LaunchTemplateData']

    # ending letters for the devices (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/device_naming.html)
    device_letters = 'fghijklmnop'

    # create and attach volumes
    for i, volume in enumerate(volumes):
        # fail with a clear message instead of an IndexError when the device letters run out
        if i >= len(device_letters):
            raise ValueError('Too many volumes: at most %d volumes can be attached to the instance.'
                             % len(device_letters))

        volume_resources, volume_availability_zone = self._get_volume_resources(ec2, volume, device_letters[i])

        # a volume that reports an availability zone must agree with the zone selected so far
        if availability_zone and volume_availability_zone and (availability_zone != volume_availability_zone):
            raise ValueError(
                'The availability zone in the configuration file doesn\'t match the availability zone '
                'of the existing volume or you have two existing volumes in different availability '
                'zones.')

        # update availability zone
        if volume_availability_zone:
            availability_zone = volume_availability_zone

        # update template resources
        resources.update(volume_resources)

    # set availability zone
    if availability_zone:
        launch_template_data['Placement'] = {
            'AvailabilityZone': availability_zone,
        }

    # set subnet: the security groups are moved from the launch template level
    # to the network interface
    if subnet_id:
        launch_template_data['NetworkInterfaces'] = [{
            'SubnetId': subnet_id,
            'DeviceIndex': 0,
            'Groups': launch_template_data['SecurityGroupIds'],
        }]
        del launch_template_data['SecurityGroupIds']

    # each "...FunctionRetention" resource must depend on all custom resources of the
    # corresponding type (presumably so the lambda's log group exists before its
    # retention is set); if there are no such custom resources, the retention
    # resource is removed from the template
    retention_resources = (
        ('RenameSnapshotFunctionRetention', 'Custom::SnapshotRenaming'),
        ('DeleteSnapshotFunctionRetention', 'Custom::SnapshotDeletion'),
    )
    for retention_name, custom_resource_type in retention_resources:
        depends_on = [resource_name for resource_name, resource in resources.items()
                      if resource['Type'] == custom_resource_type]
        if depends_on:
            resources[retention_name]['DependsOn'] = depends_on
        else:
            del resources[retention_name]

    # TerminateInstance resource should depend on all volume attachments
    resources['TerminateInstance']['DependsOn'] = [
        resource_name for resource_name, resource in resources.items()
        if resource['Type'] == 'AWS::EC2::VolumeAttachment'
    ]

    # add ports to the security group (port 22 is never added here; presumably the
    # base template already opens it)
    for port in set(ports):
        if port != 22:
            resources['InstanceSecurityGroup']['Properties']['SecurityGroupIngress'] += [{
                'CidrIp': '0.0.0.0/0',
                'IpProtocol': 'tcp',
                'FromPort': port,
                'ToPort': port,
            }, {
                'CidrIpv6': '::/0',
                'IpProtocol': 'tcp',
                'FromPort': port,
                'ToPort': port,
            }]

    if max_price:
        # check the maximum price
        current_price = get_current_spot_price(ec2, instance_type, availability_zone)
        if current_price > max_price:
            raise ValueError(
                'Current price for the instance (%.04f) is higher than the maximum price in the '
                'configuration file (%.04f).' % (current_price, max_price))

        # set maximum price
        launch_template_data['InstanceMarketOptions']['SpotOptions']['MaxPrice'] = max_price

    # set initial docker commands
    if docker_commands:
        init_files = launch_template['Metadata']['AWS::CloudFormation::Init']['docker_container_config']['files']
        init_files['/tmp/docker/docker_commands.sh']['content'] = docker_commands

    return yaml.dump(template, Dumper=CfnYamlDumper)
def run(self, output: AbstractOutputWriter):
    """Creates a new AMI through a CloudFormation stack and waits for the result.

    The instance settings are read from the "instance" section of the
    configuration ("instanceType", "region", "amiName" and optional "keyName").

    Args:
        output: writer used to report progress and the final result.

    Raises:
        ValueError: if the instance type is not a GPU instance, if an AMI with
            the configured name already exists, or if the stack doesn't reach
            the "CREATE_COMPLETE" status.
    """
    instance_config = self._config['instance']

    # only GPU instance types are accepted
    instance_type = instance_config['instanceType']
    if not is_gpu_instance(instance_type):
        raise ValueError('"%s" is not a GPU instance' % instance_type)

    region = instance_config['region']
    cf = boto3.client('cloudformation', region_name=region)
    ec2 = boto3.client('ec2', region_name=region)

    # the AMI name must not be taken yet
    ami_name = instance_config['amiName']
    name_filter = {'Name': 'name', 'Values': [ami_name]}
    existing_images = ec2.describe_images(Filters=[name_filter])['Images']
    if existing_images:
        raise ValueError('AMI with name "%s" already exists.' % ami_name)

    # load the CF template
    with open(data_dir('create_ami.yaml')) as template_file:
        template = yaml.load(template_file, Loader=CfnYamlLoader)

    # stack parameters; the key is either passed as a parameter or removed from the template
    stack_params = [
        {'ParameterKey': 'InstanceType', 'ParameterValue': instance_type},
        {'ParameterKey': 'ImageName', 'ParameterValue': ami_name},
    ]
    key_name = instance_config.get('keyName', '')
    if key_name:
        stack_params.append({'ParameterKey': 'KeyName', 'ParameterValue': key_name})
    else:
        del template['Parameters']['KeyName']
        del template['Resources']['SpotInstanceLaunchTemplate']['Properties']['LaunchTemplateData']['KeyName']

    # create the stack under a randomised name
    stack_name = 'spotty-nvidia-docker-ami-%s' % random_string(8)
    create_response = cf.create_stack(
        StackName=stack_name,
        TemplateBody=yaml.dump(template, Dumper=CfnYamlDumper),
        Parameters=stack_params,
        Capabilities=['CAPABILITY_IAM'],
        OnFailure='DELETE',
    )

    output.write('Waiting for the AMI to be created...')

    # (resource name, message) pairs handed to the waiting helper
    # (presumably shown as progress while each resource is being created - confirm)
    resource_messages = [
        ('InstanceProfile', 'creating IAM role for the instance'),
        ('SpotInstance', 'launching the instance'),
        ('InstanceReadyWaitCondition', 'installing NVIDIA Docker'),
        ('AMICreatedWaitCondition', 'creating AMI and terminating the instance'),
    ]

    # block until the stack leaves the "CREATE_IN_PROGRESS" status
    status, stack = wait_stack_status_changed(
        cf,
        stack_id=create_response['StackId'],
        waiting_status='CREATE_IN_PROGRESS',
        resource_messages=resource_messages,
        resource_success_status='CREATE_COMPLETE',
        output=output,
    )

    if status != 'CREATE_COMPLETE':
        raise ValueError('Stack "%s" was not created.\n'
                         'See CloudFormation and CloudWatch logs for details.' % stack_name)

    # the ID of the new AMI is published as the "NewAMI" stack output
    new_ami_ids = [stack_output['OutputValue'] for stack_output in stack['Outputs']
                   if stack_output['OutputKey'] == 'NewAMI']
    ami_id = new_ami_ids[0]

    output.write('\n'
                 '--------------------\n'
                 'AMI "%s" (ID=%s) was successfully created.\n'
                 'Use "spotty start" command to run a Spot Instance.\n'
                 '--------------------' % (ami_name, ami_id))