def deploy(self, project_config: ProjectConfig, output: AbstractOutputWriter, dry_run=False):
    """Deploys the project to GCP via Deployment Manager.

    Deletes any existing stack first, (re)creates the project bucket, syncs the
    project files to it, creates the disks for the configured volumes, renders
    the Deployment Manager template and finally creates the stack.

    Args:
        output: output writer.
        dry_run: if True, the bucket/sync/disk steps run in dry-run mode and
            the stack is not created.
    """
    # remove the stack if it exists to make all the disks available
    stack = self.stack
    stack.delete_stack(output=output)

    # create or get existing bucket for the project
    bucket_name = self.bucket.get_or_create_bucket(output, dry_run)

    # sync the project with the bucket
    output.write('Syncing the project with the bucket...')
    sync_local_to_bucket(project_config.project_dir, bucket_name, project_config.sync_filters, dry_run)

    # check GPU configuration
    check_gpu_configuration(self._ce, self.instance_config.gpu)

    # get volumes
    volumes = self._get_volumes()
    if volumes:
        # create disks
        output.write('\nCreating disks...')
        with output.prefix(' '):
            self._create_disks(volumes, output=output, dry_run=dry_run)
        output.write('')

    # prepare Deployment Manager template
    output.write('Preparing the deployment template...')
    with output.prefix(' '):
        # get an image
        image = self._get_image()

        # get or create an SSH key
        public_key_value = self.ssh_key.get_public_key_value()

        container = ContainerDeployment(project_config.project_name, volumes, project_config.container)
        template = prepare_instance_template(self.instance_config, container, project_config.sync_filters,
                                             volumes, self.machine_name, image.self_link, bucket_name,
                                             public_key_value, self._credentials.service_account_email,
                                             output)
    output.write('')

    # print information about the volumes
    output.write('Volumes:\n%s\n' % render_volumes_info_table(container.volume_mounts, volumes))

    # create stack
    if not dry_run:
        stack.create_stack(template, output=output)
def delete_stack(self, output: AbstractOutputWriter, stack_id=None):
    """Deletes an AMI stack.

    Args:
        output: output writer
        stack_id: ID of the stack to delete (for older versions of Spotty)

    Raises:
        ValueError: if the stack didn't reach the DELETE_COMPLETE status.
    """
    # get the stack: by ID if one was provided, otherwise the default AMI stack
    stack = Stack.get_by_name(self._cf, stack_id) if stack_id else self.get_stack()

    # printable identifier for error messages: "stack_id" is None on the
    # default path, which previously produced 'Stack "None" not deleted.'
    stack_name = stack_id if stack_id else stack.name

    # delete the image
    stack.delete()

    output.write('Waiting for the AMI to be deleted...')

    # wait for the deletion to be completed
    with output.prefix(' '):
        stack = stack.wait_status_changed(waiting_status='DELETE_IN_PROGRESS',
                                          resource_messages=[],
                                          resource_success_status='DELETE_COMPLETE',
                                          output=output)

    if stack.status == 'DELETE_COMPLETE':
        output.write('\n'
                     '-----------------------------\n'
                     'AMI was successfully deleted.\n'
                     '-----------------------------')
    else:
        raise ValueError(
            'Stack "%s" not deleted.\n'
            'See CloudFormation and CloudWatch logs for details.' % stack_name)
def create_stack(self, template: str, output: AbstractOutputWriter):
    """Deploys a Deployment Manager template and waits for its resources.

    Args:
        template: rendered Deployment Manager template.
        output: output writer.
    """
    # create a stack (the return value is not used; the dead "res" variable
    # and commented-out debug prints were removed)
    Stack.create(self._dm, self._stack_name, template)

    output.write('Waiting for the stack to be created...')

    # human-readable progress messages for the tracked resources
    resource_messages = OrderedDict([
        (self._INSTANCE_RESOURCE_NAME, 'launching the instance'),
        (self._DOCKER_WAITER_RESOURCE_NAME, 'running the Docker container'),
    ])

    # wait for the stack to be created
    with output.prefix(' '):
        wait_resources(self._dm, self._ce, self._stack_name, resource_messages,
                       instance_resource_name=self._INSTANCE_RESOURCE_NAME,
                       machine_name=self._machine_name, output=output)
def _get_instance_id(instances: List[dict], instance_name: str, output: AbstractOutputWriter):
    """Return the index of the instance to work with.

    When *instance_name* is given, returns the index of the matching entry.
    Otherwise, with more than one configured instance, the user is asked to
    pick one interactively; with zero or one instance, index 0 is returned.

    Raises:
        ValueError: if the name is not found or the interactive choice is
            out of range.
    """
    if instance_name:
        # look the instance up by its name
        for idx, instance in enumerate(instances):
            if instance['name'] == instance_name:
                return idx
        raise ValueError('Instance "%s" not found in the configuration file' % instance_name)

    if len(instances) <= 1:
        # nothing to choose from
        return 0

    # ask user to choose the instance
    output.write('Select the instance:\n')
    with output.prefix(' '):
        for idx, instance_config in enumerate(instances):
            output.write('[%d] %s' % (idx + 1, instance_config['name']))
    output.write()

    try:
        num = int(input('Enter number: '))
        output.write()
    except ValueError:
        num = 0

    if not (1 <= num <= len(instances)):
        raise ValueError('The value from 1 to %d was expected.' % len(instances))

    return num - 1
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Starts the instance or, with the "--container" flag, only the container.

    Args:
        instance_manager: manager for the target instance.
        args: parsed CLI arguments ("container" and "dry_run" are read).
        output: output writer.

    Raises:
        InstanceNotRunningError: "--container" was used but the instance
            is not running.
    """
    dry_run = args.dry_run

    if args.container:
        # a container can only be (re)started on a running instance
        if not instance_manager.is_running():
            raise InstanceNotRunningError(instance_manager.instance_config.name)

        # start a container on the running instance
        instance_manager.start_container(output, dry_run=dry_run)

        if not dry_run:
            # mention the instance name only when several instances are configured
            instance_name = (' ' + instance_manager.instance_config.name
                             if len(instance_manager.project_config.instances) > 1 else '')
            output.write('\nContainer was successfully started.\n'
                         'Use the "spotty sh%s" command to connect to the container.\n' % instance_name)
        return

    # start the instance
    with output.prefix('[dry-run] ' if dry_run else ''):
        instance_manager.start(output, dry_run)

    if not dry_run:
        instance_name = (' ' + instance_manager.instance_config.name
                         if len(instance_manager.project_config.instances) > 1 else '')
        output.write('\n%s\n'
                     '\nUse the "spotty sh%s" command to connect to the container.\n'
                     % (instance_manager.get_status_text(), instance_name))
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Downloads files matching the user-provided filters from the instance.

    Args:
        instance_manager: manager for the target instance.
        args: parsed CLI arguments ("filters" and "dry_run" are read).
        output: output writer.
    """
    # exclude everything, then re-include only what the user asked for
    sync_filters = [{'exclude': ['*']}, {'include': args.filters}]

    dry_run = args.dry_run
    with output.prefix('[dry-run] ' if dry_run else ''):
        instance_manager.download(sync_filters, output, dry_run)

    output.write('Done')
def deploy(self, project_config: ProjectConfig, output: AbstractOutputWriter, dry_run=False):
    """Deploys the project to AWS via CloudFormation.

    Validates the instance configuration, syncs the project to S3, creates
    the instance profile, renders the CloudFormation template and creates
    (or updates) the stack.

    Args:
        output: output writer.
        dry_run: if True, skips the instance profile and stack creation;
            the bucket/sync steps run in dry-run mode.

    Raises:
        ValueError: for a Nitro-based instance type (not supported here).
    """
    # check that it's not a Nitro-based instance
    if is_nitro_instance(self.instance_config.instance_type):
        raise ValueError('Currently Nitro-based instances are not supported.')

    # check availability zone and subnet configuration
    check_az_and_subnet(self._ec2, self.instance_config.region, self.instance_config.availability_zone,
                        self.instance_config.subnet_id)

    # get volumes
    volumes = self._get_volumes()

    # get deployment availability zone
    availability_zone = self._get_availability_zone(volumes)

    # check the maximum price for a spot instance
    check_max_price(self._ec2, self.instance_config.instance_type, self.instance_config.on_demand,
                    self.instance_config.max_price, availability_zone)

    # create or get existing bucket for the project
    bucket_name = self.bucket.get_or_create_bucket(output, project_config.tags, dry_run)

    # sync the project with the bucket
    output.write('Syncing the project with S3 bucket...')
    sync_project_with_s3(project_config.project_dir, bucket_name, self.instance_config.region,
                         project_config.sync_filters, dry_run)

    # create or update instance profile (skipped entirely on a dry run)
    if not dry_run:
        instance_profile_stack = InstanceProfileStackResource(
            self._project_name, self.instance_config.name, self.instance_config.region)
        instance_profile_arn = instance_profile_stack.create_or_update_stack(
            self.instance_config.managed_policy_arns, output=output, tags=project_config.tags)
    else:
        instance_profile_arn = None

    output.write('Preparing CloudFormation template...')

    # prepare CloudFormation template
    container = ContainerDeployment(project_config.project_name, volumes, project_config.container)
    with output.prefix(' '):
        template = prepare_instance_template(self.instance_config, volumes, availability_zone, container,
                                             output)

        # get parameters for the template
        parameters = self._get_template_parameters(instance_profile_arn, self.instance_config.name,
                                                   bucket_name, project_config.sync_filters, volumes,
                                                   container, output, dry_run=dry_run)

    # print information about the volumes
    output.write('\nVolumes:\n%s\n' % render_volumes_info_table(container.volume_mounts, volumes))

    # create stack
    if not dry_run:
        self.stack.create_or_update_stack(template, parameters, output, project_config.tags)
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Syncs the local project with a running instance.

    Args:
        instance_manager: manager for the target instance.
        args: parsed CLI arguments ("dry_run" is read).
        output: output writer.

    Raises:
        InstanceNotRunningError: if the instance is not started.
    """
    # syncing requires a running instance
    if not instance_manager.is_running():
        raise InstanceNotRunningError(instance_manager.instance_config.name)

    dry_run = args.dry_run
    with output.prefix('[dry-run] ' if dry_run else ''):
        try:
            instance_manager.sync(output, dry_run)
        except NothingToDoError as err:
            # nothing to sync is not a failure: report it and stop
            output.write(str(err))
            return

    output.write('Done')
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Starts the instance and prints how to connect to its container.

    Args:
        instance_manager: manager for the target instance.
        args: parsed CLI arguments ("dry_run" is read).
        output: output writer.
    """
    dry_run = args.dry_run

    # start the instance
    with output.prefix('[dry-run] ' if dry_run else ''):
        instance_manager.start(output, dry_run)

    if dry_run:
        return

    # mention the instance name only when several instances are configured
    instance_name = (' ' + instance_manager.instance_config.name
                     if len(instance_manager.project_config.instances) > 1 else '')

    output.write(
        '\nThe instance was successfully started.\n'
        '\n%s\n'
        '\nUse the "spotty ssh%s" command to connect to the Docker container.\n'
        % (instance_manager.get_status_text(), instance_name))
def delete(self, output: AbstractOutputWriter):
    """Terminates the instance, deletes its stack and applies volume deletion policies.

    Args:
        output: output writer.
    """
    instance = self.get_instance()
    if not instance:
        output.write('The instance was already terminated.')
    else:
        # terminate the instance
        output.write('Terminating the instance... ', newline=False)
        instance.terminate()
        output.write('DONE')

    # delete the stack in background if it exists
    self.stack_manager.delete_stack(output, no_wait=True)

    output.write('Applying deletion policies for the volumes...')

    # apply deletion policies for the volumes
    with output.prefix(' '):
        apply_deletion_policies(self._ec2, self.instance_config.volumes, output)
def delete(self, output: AbstractOutputWriter):
    """Terminates the instance, deletes its stack and applies volume deletion policies.

    Args:
        output: output writer.
    """
    instance = self.get_instance()
    if not instance:
        output.write('The instance is already terminated.')
    else:
        # terminate the instance and wait until termination is finished
        output.write('Terminating the instance...')
        instance.terminate()
        instance.wait_instance_terminated()

    # delete the stack in background if it exists
    self.stack.delete_stack(output, no_wait=True)

    output.write('Applying deletion policies for the volumes...')

    # apply deletion policies for the volumes
    with output.prefix(' '):
        self._apply_deletion_policies(output)
def create_stack(self, template: str, machine_name: str, debug_mode: bool, output: AbstractOutputWriter):
    """Creates an image stack and waits for the image to be created.

    Args:
        template: Deployment Manager template.
        machine_name: name of the instance resource in the template.
        debug_mode: when True, the image-creation waiter is not tracked and
            the success banner is replaced with a debug notice.
        output: output writer.

    Raises:
        ValueError: if a deployment with the same name already exists.
    """
    # check that the stack doesn't exist
    if self.get_stack():
        raise ValueError('Deployment "%s" already exists.' % self._stack_name)

    # create stack
    Stack.create(self._dm, self._stack_name, template)

    output.write('Waiting for the image to be created...')

    # progress messages for the tracked resources, in order
    resource_messages = OrderedDict()
    resource_messages[machine_name] = 'launching the instance'
    resource_messages['%s-docker-waiter' % machine_name] = 'installing NVIDIA Docker'
    if not debug_mode:
        resource_messages['%s-image-waiter' % machine_name] = \
            'creating an image and terminating the instance'

    # wait for the stack to be created
    with output.prefix(' '):
        wait_resources(self._dm, self._ce, self._stack_name, resource_messages,
                       instance_resource_name=machine_name, machine_name=machine_name, output=output)

    if debug_mode:
        output.write('Stack "%s" was created in debug mode.' % self._stack_name)
    else:
        output.write('\n'
                     '--------------------------------------------------\n'
                     'Image "%s" was successfully created.\n'
                     'Use the "spotty start" command to run an instance.\n'
                     '--------------------------------------------------' % self._image_name)
def _get_instance_config(project_config: ProjectConfig, instance_name: str, output: AbstractOutputWriter):
    """Return the configuration of the instance to work with.

    When *instance_name* is given, returns the matching configuration.
    Otherwise, with more than one configured instance, the user is asked to
    pick one interactively; with a single instance it is returned directly.

    Raises:
        ValueError: if the name is not found or the interactive choice is
            out of range.
    """
    if instance_name:
        # get the instance by name
        matching_configs = filter_list(project_config.instances, 'name', instance_name)
        if not matching_configs:
            raise ValueError('Instance "%s" not found in the configuration file' % instance_name)
        return matching_configs[0]

    instances = project_config.instances
    if len(instances) <= 1:
        # no choice to make
        return instances[0]

    # ask user to choose the instance
    output.write('Select the instance:\n')
    with output.prefix(' '):
        for idx, instance_config in enumerate(instances):
            output.write('[%d] %s' % (idx + 1, instance_config['name']))
    output.write()

    try:
        num = int(input('Enter number: '))
        output.write()
    except ValueError:
        num = 0

    if not (1 <= num <= len(instances)):
        raise ValueError('The value from 1 to %d was expected.' % len(instances))

    return instances[num - 1]
def create_stack(self, template: str, parameters: dict, debug_mode: bool, output: AbstractOutputWriter):
    """Creates an AMI stack and waits for the AMI to be created.

    Args:
        template: CloudFormation template
        parameters: parameters for the template
        debug_mode: if "True", NVIDIA Docker will be installed, but an AMI
            will not be created and the instance will not be terminated, so
            the user can connect to the instance for debugging.
        output: output writer

    Raises:
        ValueError: if the stack didn't reach the CREATE_COMPLETE status.
    """
    # translate the parameters to the CloudFormation format
    cf_parameters = [{'ParameterKey': key, 'ParameterValue': value}
                     for key, value in parameters.items()]

    # in debug mode keep the stack alive on failure so the instance can be inspected
    stack = Stack.create_stack(
        cf=self._cf,
        StackName=self._stack_name,
        TemplateBody=template,
        Parameters=cf_parameters,
        Capabilities=['CAPABILITY_IAM'],
        OnFailure='DO_NOTHING' if debug_mode else 'DELETE',
    )

    output.write('Waiting for the AMI to be created...')

    # progress messages for the tracked resources, in order
    resource_messages = [
        ('InstanceProfile', 'creating IAM role for the instance'),
        ('Instance', 'launching the instance'),
        ('InstanceReadyWaitCondition', 'installing NVIDIA Docker'),
        ('AMICreatedWaitCondition', 'creating AMI and terminating the instance'),
    ]

    # wait for the stack to be created
    with output.prefix(' '):
        stack = stack.wait_status_changed(waiting_status='CREATE_IN_PROGRESS',
                                          resource_messages=resource_messages,
                                          resource_success_status='CREATE_COMPLETE',
                                          output=output)

    if stack.status != 'CREATE_COMPLETE':
        raise ValueError('Stack "%s" was not created.\n'
                         'Please, see CloudFormation logs for the details.' % self._stack_name)

    if debug_mode:
        output.write('Stack "%s" was created in debug mode.' % self._stack_name)
        return

    # the new AMI ID is exposed through the "NewAMI" stack output
    ami_id = [row['OutputValue'] for row in stack.outputs if row['OutputKey'] == 'NewAMI'][0]
    output.write('\n'
                 '--------------------------------------------------\n'
                 'AMI "%s" (ID=%s) was successfully created.\n'
                 'Use the "spotty start" command to run an instance.\n'
                 '--------------------------------------------------'
                 % (parameters['ImageName'], ami_id))
def _run(self, instance_manager: AbstractInstanceManager, args: Namespace, output: AbstractOutputWriter):
    """Syncs the local project with the instance.

    Args:
        instance_manager: manager for the target instance.
        args: parsed CLI arguments ("dry_run" is read).
        output: output writer.
    """
    dry_run = args.dry_run
    log_prefix = '[dry-run] ' if dry_run else ''
    with output.prefix(log_prefix):
        instance_manager.sync(output, dry_run)

    output.write('Done')
def create_or_update_stack(self, template: str, parameters: dict, instance_config: InstanceConfig,
                           output: AbstractOutputWriter):
    """Runs CloudFormation template.

    Deletes any existing stack with the same name first, creates a new one
    and tracks its startup tasks until the stack reaches a final status.

    Args:
        template: CloudFormation template body.
        parameters: template parameters as a name->value mapping.
        instance_config: instance configuration; optional tasks are enabled
            based on its volumes/docker_data_root/commands/dockerfile_path
            and container commands.
        output: output writer.

    Returns:
        The stack object after its status changed (its final status is
        checked by the caller).
    """
    # delete the stack if it exists
    stack = Stack.get_by_name(self._cf, self._stack_name)
    if stack:
        self.delete_stack(output)

    # create new stack; "DO_NOTHING" keeps the stack on failure so logs can be collected
    stack = Stack.create_stack(
        cf=self._cf,
        StackName=self._stack_name,
        TemplateBody=template,
        Parameters=[{'ParameterKey': key, 'ParameterValue': value}
                    for key, value in parameters.items()],
        Capabilities=['CAPABILITY_IAM'],
        OnFailure='DO_NOTHING',
    )

    output.write('Waiting for the stack to be created...')

    # startup tasks in order; each task spans the interval between two stack
    # resources/signals, and disabled tasks are skipped for this configuration
    tasks = [
        Task(
            message='launching the instance',
            start_resource=None,
            finish_resource='Instance',
            enabled=True,
        ),
        Task(
            message='preparing the instance',
            start_resource='Instance',
            finish_resource='MountingVolumesSignal',
            enabled=True,
        ),
        Task(
            message='mounting volumes',
            start_resource='MountingVolumesSignal',
            finish_resource='SettingDockerRootSignal',
            enabled=bool(instance_config.volumes),
        ),
        Task(
            message='setting Docker data root',
            start_resource='SettingDockerRootSignal',
            finish_resource='SyncingProjectSignal',
            enabled=bool(instance_config.docker_data_root),
        ),
        Task(
            message='syncing project files',
            start_resource='SyncingProjectSignal',
            finish_resource='RunningInstanceStartupCommandsSignal',
            enabled=True,
        ),
        Task(
            message='running instance startup commands',
            start_resource='RunningInstanceStartupCommandsSignal',
            finish_resource='BuildingDockerImageSignal',
            enabled=bool(instance_config.commands),
        ),
        Task(
            message='building Docker image',
            start_resource='BuildingDockerImageSignal',
            finish_resource='StartingContainerSignal',
            enabled=bool(instance_config.dockerfile_path),
        ),
        Task(
            message='starting container',
            start_resource='StartingContainerSignal',
            finish_resource='RunningContainerStartupCommandsSignal',
            enabled=True,
        ),
        Task(
            message='running container startup commands',
            start_resource='RunningContainerStartupCommandsSignal',
            finish_resource='DockerReadyWaitCondition',
            enabled=bool(instance_config.container_config.commands),
        ),
    ]

    # wait for the stack to be created
    with output.prefix(' '):
        stack.wait_tasks(tasks, resource_success_status='CREATE_COMPLETE',
                         resource_fail_status='CREATE_FAILED', output=output)
        stack = stack.wait_status_changed(stack_waiting_status='CREATE_IN_PROGRESS', output=output)

    return stack
def deploy(self, container_commands: DockerCommands, bucket_name: str, data_transfer: DataTransfer,
           output: AbstractOutputWriter, dry_run: bool = False):
    """Deploys the instance to AWS via CloudFormation.

    Validates the configuration, syncs the project to the S3 bucket, makes
    sure the instance profile and key pair exist, renders the template and
    creates (or updates) the stack.

    Args:
        container_commands: commands used to build/run the container on the instance.
        bucket_name: project bucket name, or None to skip project syncing.
        data_transfer: helper that moves project files between the local
            machine, the bucket and the instance.
        output: output writer.
        dry_run: if True, skips instance profile, key pair and stack creation.

    Raises:
        ValueError: if the stack didn't reach the CREATE_COMPLETE status.
    """
    # get deployment availability zone
    availability_zone = update_availability_zone(self._ec2, self.instance_config.availability_zone,
                                                 self.instance_config.volumes)

    # check availability zone and subnet configuration
    check_az_and_subnet(self._ec2, self.instance_config.region, availability_zone,
                        self.instance_config.subnet_id)

    # check the maximum price for a spot instance
    check_max_spot_price(self._ec2, self.instance_config.instance_type,
                         self.instance_config.is_spot_instance, self.instance_config.max_price,
                         availability_zone)

    # sync the project with the S3 bucket
    if bucket_name is not None:
        output.write('Syncing the project with the S3 bucket...')
        data_transfer.upload_local_to_bucket(bucket_name, dry_run=dry_run)

    # create or update instance profile; an explicitly configured ARN takes precedence
    if not dry_run:
        instance_profile_stack_manager = InstanceProfileStackManager(
            self._project_name, self.instance_config.name, self.instance_config.region)
        if not self.instance_config.instance_profile_arn:
            instance_profile_arn = instance_profile_stack_manager.create_or_update_stack(
                self.instance_config.managed_policy_arns, output=output)
        else:
            instance_profile_arn = self.instance_config.instance_profile_arn
    else:
        instance_profile_arn = None

    # create a key pair if it doesn't exist
    if not dry_run:
        self.key_pair_manager.maybe_create_key()

    output.write('Preparing CloudFormation template...')

    # prepare CloudFormation template
    with output.prefix(' '):
        template = prepare_instance_template(
            ec2=self._ec2,
            instance_config=self.instance_config,
            docker_commands=container_commands,
            availability_zone=availability_zone,
            sync_project_cmd=data_transfer.get_download_bucket_to_instance_command(bucket_name=bucket_name),
            output=output,
        )

        # get parameters for the template
        parameters = get_template_parameters(
            ec2=self._ec2,
            instance_config=self.instance_config,
            instance_profile_arn=instance_profile_arn,
            bucket_name=bucket_name,
            key_pair_name=self.key_pair_manager.key_name,
            output=output,
        )

    # print information about the volumes
    output.write('\nVolumes:\n%s\n' % render_volumes_info_table(self.instance_config.volume_mounts,
                                                                self.instance_config.volumes))

    # create stack
    if not dry_run:
        stack = self.stack_manager.create_or_update_stack(template, parameters, self.instance_config,
                                                          output)

        if stack.status != 'CREATE_COMPLETE':
            logs_str = 'Please, see CloudFormation logs for the details.'

            # download CloudFormation logs from the instance if it was created
            if self.get_instance():
                log_paths = download_logs(
                    bucket_name=bucket_name,
                    instance_name=self.instance_config.name,
                    stack_uuid=stack.stack_uuid,
                    region=self.instance_config.region,
                )
                logs_str = 'Please, see the logs for the details:\n '
                logs_str += '\n '.join(log_paths)

            raise ValueError('Stack "%s" was not created.\n%s' % (stack.name, logs_str))
def deploy(self, container_commands: DockerCommands, bucket_name: str, data_transfer: DataTransfer,
           output: AbstractOutputWriter, dry_run: bool = False):
    """Deploys the instance to GCP via Deployment Manager.

    Validates the machine type and GPU configuration, removes any existing
    stack, syncs the project to the bucket, creates the disks, renders the
    Deployment Manager template and creates the stack.

    Args:
        container_commands: commands used to build/run the container on the instance.
        bucket_name: project bucket name, or None to skip project syncing.
        data_transfer: helper that moves project files between the local
            machine, the bucket and the instance.
        output: output writer.
        dry_run: if True, the sync/disk steps run in dry-run mode and the
            stack is not created.

    Raises:
        ValueError: if the machine type is not available in the configured zone.
    """
    # check machine type
    if not self._ce.get_machine_types(self.instance_config.machine_type):
        raise ValueError('"%s" machine type is not available in the "%s" zone.'
                         % (self.instance_config.machine_type, self.instance_config.zone))

    # check GPU configuration
    check_gpu_configuration(self._ce, self.instance_config.gpu)

    # remove the stack if it exists to make all the disks available
    stack_manager = self.stack_manager
    stack_manager.delete_stack(output=output)

    # sync the project with the S3 bucket
    if bucket_name is not None:
        output.write('Syncing the project with the bucket...')
        data_transfer.upload_local_to_bucket(bucket_name, dry_run=dry_run)

    # create volumes
    if self.instance_config.volumes:
        # create disks
        output.write('\nCreating disks...')
        with output.prefix(' '):
            create_disks(self._ce, self.instance_config.volumes, output=output, dry_run=dry_run)
        output.write('')

    # prepare Deployment Manager template
    output.write('Preparing the deployment template...')
    with output.prefix(' '):
        # get an image
        image_link = get_image(self._ce, self.instance_config.image_uri,
                               self.instance_config.image_name).self_link

        # get or create an SSH key
        public_key_value = self.ssh_key_manager.get_public_key_value()

        # prepare the deployment template
        sync_project_cmd = data_transfer.get_download_bucket_to_instance_command(
            bucket_name=bucket_name)
        template = prepare_instance_template(
            instance_config=self.instance_config,
            docker_commands=container_commands,
            image_link=image_link,
            bucket_name=bucket_name,
            sync_project_cmd=sync_project_cmd,
            public_key_value=public_key_value,
            service_account_email=self._credentials.service_account_email,
            output=output,
        )
    output.write('')

    # print information about the volumes
    output.write('Volumes:\n%s\n' % render_volumes_info_table(self.instance_config.volume_mounts,
                                                              self.instance_config.volumes))

    # create stack
    if not dry_run:
        stack_manager.create_stack(template, output=output)