def wait_for_instance(
        aws_svc, instance_id, timeout=600, state='running'):
    """ Poll an instance until it reaches the requested state.

    The instance is re-fetched through aws_svc.get_instance() roughly
    every 2 seconds until timeout seconds have elapsed.

    :return: The Instance object, once its state equals the given state
    :raises InstanceError if the wait times out, if the instance reports
        the 'error' state, or if it is found 'terminated' while waiting
        for any other state
    """
    log.debug(
        'Waiting for %s, timeout=%d, state=%s',
        instance_id, timeout, state)

    deadline = Deadline(timeout)
    while not deadline.is_expired():
        instance = aws_svc.get_instance(instance_id)
        log.debug('Instance %s state=%s', instance.id, instance.state)

        if instance.state == state:
            return instance

        # Pick the failure message, if the instance landed in a state
        # that it cannot recover from.
        failure = None
        if instance.state == 'error':
            failure = 'Instance %s is in an error state. Cannot proceed.'
        elif state != 'terminated' and instance.state == 'terminated':
            failure = 'Instance %s was unexpectedly terminated.'
        if failure is not None:
            raise InstanceError(failure % instance_id)

        sleep(2)

    raise InstanceError(
        'Timed out waiting for %s to be in the %s state' %
        (instance_id, state)
    )
def wait_for_encryption(enc_svc,
                        progress_timeout=ENCRYPTION_PROGRESS_TIMEOUT):
    """ Poll the encryptor service until encryption succeeds.

    The status is fetched every 10 seconds.  Progress is logged at most
    once a minute.  A failed state is logged and handed to
    _handle_failure_code().

    :raises EncryptionError if neither the state nor percent_complete
        advances for progress_timeout seconds, or if fetching the status
        fails 10 times in a row
    """
    max_failures = 10
    failures = 0
    last_report_time = time.time()
    stall_deadline = Deadline(progress_timeout)
    best_percent = 0
    previous_state = ''

    while failures < max_failures:
        try:
            status = enc_svc.get_status()
        except Exception as e:
            log.warn("Failed getting encryption status: %s", e)
            log.warn("Retrying. . .")
            failures += 1
            sleep(10)
            continue
        else:
            # Only consecutive failures count against the limit.
            failures = 0

        state = status['state']
        percent_complete = status['percent_complete']
        log.debug('state=%s, percent_complete=%d', state, percent_complete)

        # Give up if nothing has changed since the deadline was last reset.
        if stall_deadline.is_expired():
            raise EncryptionError(
                'Waited for encryption progress for longer than %s seconds' %
                progress_timeout
            )

        if percent_complete > best_percent or state != previous_state:
            best_percent = percent_complete
            previous_state = state
            stall_deadline = Deadline(progress_timeout)

        # Report progress, but no more often than once a minute.
        now = time.time()
        if now - last_report_time >= 60:
            if state == ENCRYPT_INITIALIZING:
                log.info('Encryption process is initializing')
            else:
                label = (
                    'Download from cloud storage'
                    if state == ENCRYPT_DOWNLOADING else 'Encryption'
                )
                log.info('%s is %d%% complete', label, percent_complete)
            last_report_time = now

        if state == ENCRYPT_SUCCESSFUL:
            log.info('Encrypted root drive created.')
            return
        if state == ENCRYPT_FAILED:
            log.error('Encryption status: %s', json.dumps(status))
            _handle_failure_code(status.get('failure_code'))

        sleep(10)

    # Too many consecutive status failures.  Assume that the server
    # has crashed.
    raise EncryptionError('Encryption service unavailable')
def _wait_for_instance(
        aws_svc, instance_id, timeout=300, state='running'):
    """ Wait for up to timeout seconds for an instance to be in the
    given state.  Sleep for 2 seconds between checks.

    The first lookup goes through _safe_get_instance(), so that a
    just-launched instance which AWS does not report yet is tolerated.

    :return: The Instance object
    :raises Exception if a timeout occurs or the instance goes into the
        'error' state
    """
    log.debug(
        'Waiting for %s, timeout=%d, state=%s',
        instance_id, timeout, state)

    # Wait for AWS eventual consistency to catch up.
    instance = _safe_get_instance(aws_svc, instance_id)
    deadline = Deadline(timeout)
    while not deadline.is_expired():
        log.debug('Instance %s state=%s', instance.id, instance.state)
        if instance.state == state:
            return instance
        if instance.state == 'error':
            # The message was previously raised with its %s placeholder
            # left unformatted.
            raise Exception(
                'Instance %s is in an error state. Cannot proceed.' %
                instance_id
            )
        _sleep(2)
        instance = aws_svc.get_instance(instance_id)
    raise Exception(
        'Timed out waiting for %s to be in the %s state' %
        (instance_id, state)
    )
def wait_for_encryption(enc_svc,
                        progress_timeout=ENCRYPTION_PROGRESS_TIMEOUT):
    """ Poll the encryptor service until encryption succeeds.

    The status is fetched every 10 seconds and the completion percentage
    is logged at most once a minute.

    :raises UnsupportedGuestError if the service reports a failure with
        the unsupported-guest failure code
    :raises EncryptionError if the service reports any other failure, if
        percent_complete does not advance for progress_timeout seconds,
        or if fetching the status fails 10 times in a row
    """
    max_failures = 10
    failures = 0
    last_report_time = time.time()
    stall_deadline = Deadline(progress_timeout)
    best_percent = 0

    while failures < max_failures:
        try:
            status = enc_svc.get_status()
        except Exception as e:
            log.warn("Failed getting encryption status: %s", e)
            failures += 1
            sleep(10)
            continue
        else:
            # Only consecutive failures count against the limit.
            failures = 0

        state = status['state']
        percent_complete = status['percent_complete']
        log.debug('state=%s, percent_complete=%d', state, percent_complete)

        # Give up if the percentage has not moved since the deadline
        # was last reset.
        if stall_deadline.is_expired():
            raise EncryptionError(
                'Waited for encryption progress for longer than %s seconds' %
                progress_timeout
            )
        if percent_complete > best_percent:
            best_percent = percent_complete
            stall_deadline = Deadline(progress_timeout)

        # Report progress, but no more often than once a minute.
        now = time.time()
        if now - last_report_time >= 60:
            log.info('Encryption is %d%% complete', percent_complete)
            last_report_time = now

        if state == encryptor_service.ENCRYPT_SUCCESSFUL:
            log.info('Encrypted root drive created.')
            return
        if state == encryptor_service.ENCRYPT_FAILED:
            failure_code = status.get('failure_code')
            log.debug('failure_code=%s', failure_code)
            unsupported = encryptor_service.FAILURE_CODE_UNSUPPORTED_GUEST
            if failure_code == unsupported:
                raise UnsupportedGuestError(
                    'The specified AMI uses an unsupported operating system')
            raise EncryptionError('Encryption failed')

        sleep(10)

    # Too many consecutive status failures.  Assume that the server
    # has crashed.
    raise EncryptionError('Encryption service unavailable')
def _wait_for_security_group(aws_svc, sg_id):
    """ Fetch a security group, retrying while AWS reports that it
    does not exist.

    :return: whatever aws_svc.get_security_group() returns
    :raises EC2ResponseError for any error code other than
        InvalidGroup.NotFound
    :raises Exception if the group is still not found after
        EVENTUAL_CONSISTENCY_TIMEOUT seconds
    """
    log.debug('Waiting for security group %s', sg_id)
    deadline = Deadline(EVENTUAL_CONSISTENCY_TIMEOUT)
    while not deadline.is_expired():
        try:
            group = aws_svc.get_security_group(sg_id)
        except EC2ResponseError as e:
            if e.error_code != 'InvalidGroup.NotFound':
                raise
            # Not visible yet.  Pause, then look again.
            _sleep(2)
        else:
            return group
    raise Exception('Timed out waiting for security group ' + sg_id)
def wait_for_volume(
        aws_svc, volume, timeout=300, state='available'):
    """ Poll a volume until its status matches the given state.

    The volume is re-fetched by id roughly every 2 seconds.

    :return: the refreshed Volume object
    :raises InstanceError if the status does not match within timeout
        seconds
    """
    log.debug(
        'Waiting for %s, timeout=%d, state=%s',
        volume.id, timeout, state)

    deadline = Deadline(timeout)
    while True:
        if deadline.is_expired():
            break
        # Rebind to the freshly fetched object, so that the status check
        # and the timeout message both use the latest data.
        volume = aws_svc.get_volume(volume.id)
        if volume.status == state:
            return volume
        sleep(2)

    raise InstanceError(
        'Timed out waiting for %s to be in the %s state' %
        (volume.id, state)
    )
def do_encryption(gce_svc,
                  enc_svc_cls,
                  zone,
                  encryptor,
                  encryptor_image,
                  instance_name,
                  instance_config,
                  encrypted_image_disk,
                  network,
                  status_port=ENCRYPTOR_STATUS_PORT):
    """ Launch the encryptor instance and wait for encryption to finish.

    The encryptor is started with two disks attached, looked up by the
    names instance_name and encrypted_image_disk.  Once encryption has
    completed, the encryptor instance is deleted, with retries on
    transient HTTP and socket errors.

    If waiting for the encryptor fails, an attempt is made to write the
    instance's serial console to a file, and the original exception is
    re-raised.  The encryptor instance is not deleted in that case.
    """
    metadata = gce_metadata_from_userdata(instance_config.make_userdata())
    log.info('Launching encryptor instance')
    gce_svc.run_instance(zone=zone,
                         name=encryptor,
                         image=encryptor_image,
                         network=network,
                         disks=[
                             gce_svc.get_disk(zone, instance_name),
                             gce_svc.get_disk(zone, encrypted_image_disk)
                         ],
                         metadata=metadata)

    try:
        enc_svc = enc_svc_cls([gce_svc.get_instance_ip(encryptor, zone)],
                              port=status_port)
        wait_for_encryptor_up(enc_svc, Deadline(600))
        wait_for_encryption(enc_svc)
    except Exception:
        f = gce_svc.write_serial_console_file(zone, encryptor)
        if f:
            log.info('Encryption failed. Writing console to %s' % f)
        # Bare raise keeps the original traceback.  "raise e" would
        # restart it from this line on Python 2.
        raise
    retry(function=gce_svc.delete_instance,
          on=[httplib.BadStatusLine, socket.error, errors.HttpError])(
        zone, encryptor)
def _write_console_output(aws_svc, instance_id, timeout=GET_CONSOLE_TIMEOUT):
    """ Write the console output of an instance to a temporary file.

    The console output is requested every 5 seconds, until it is
    non-empty or timeout seconds have elapsed.  Errors while fetching or
    writing the output are logged at debug level and retried, since this
    is a best-effort diagnostic aid.

    :return: the (already closed) NamedTemporaryFile object that the
        output was written to, or None if no output was available in
        time.  The file is created with delete=False, so it stays on
        disk.
    """
    deadline = Deadline(timeout)
    while not deadline.is_expired():
        try:
            console_output = aws_svc.get_console_output(instance_id)
            if console_output.output:
                prefix = instance_id + '-'
                with tempfile.NamedTemporaryFile(
                        prefix=prefix, suffix='.log', delete=False) as t:
                    t.write(console_output.output)
                return t
        except Exception as e:
            # Keep retrying, but don't swallow KeyboardInterrupt or
            # SystemExit, and leave a trace of what went wrong.
            log.debug(
                'Failed getting console output from %s: %s',
                instance_id, e)

        log.info('Waiting on console output from %s' % instance_id)
        _sleep(5)

    log.warn('Timed out waiting for console output from %s' % instance_id)
    return None
def _safe_get_instance(aws_svc, instance_id):
    """ Look up an instance, tolerating AWS eventual consistency lag.

    A lookup that fails with InvalidInstanceID.NotFound is retried after
    a 2 second pause.  Any other EC2ResponseError propagates.

    :raises Exception once EVENTUAL_CONSISTENCY_TIMEOUT seconds have
        elapsed.  The deadline is checked after every lookup attempt.
    """
    deadline = Deadline(EVENTUAL_CONSISTENCY_TIMEOUT)
    found = None
    while found is None:
        try:
            found = aws_svc.get_instance(instance_id)
        except EC2ResponseError as e:
            if e.error_code != 'InvalidInstanceID.NotFound':
                raise
            log.debug('Instance was not found. Sleeping.')
            _sleep(2)
        if deadline.is_expired():
            raise Exception('Invalid instance id: ' + instance_id)
    return found
def wait_for_volume(aws_svc, volume_id, timeout=600.0, state='available'):
    """ Poll a volume until its status matches the given state.

    The pause between polls starts at half a second and doubles after
    every unsuccessful check.  The pause has no upper bound.

    :return the Volume object
    :raise VolumeError if the timeout is exceeded
    """
    log.debug(
        'Waiting for %s, timeout=%.02f, state=%s',
        volume_id, timeout, state)

    deadline = Deadline(timeout)
    delay = 0.5
    while True:
        if deadline.is_expired():
            break
        volume = aws_svc.get_volume(volume_id)
        if volume.status == state:
            return volume
        util.sleep(delay)
        delay = delay * 2

    raise VolumeError(
        'Timed out waiting for %s to be in the %s state' %
        (volume_id, state)
    )
def update_gce_image(gce_svc, enc_svc_cls, image_id, encryptor_image,
                     encrypted_image_name, zone, instance_config,
                     keep_encryptor=False, image_file=None,
                     image_bucket=None, network=None,
                     status_port=ENCRYPTOR_STATUS_PORT):
    """ Pair an encrypted guest snapshot with a new encryptor image.

    A disk is created from image_id and re-snapshotted under
    encrypted_image_name.  An updater instance is then launched from the
    encryptor image in 'updater' solo mode.  Once it reports completion,
    the instance is deleted and a new image named encrypted_image_name
    is created from its disk.

    If encryptor_image is not given, the latest one is retrieved via
    gce_svc.get_latest_encryptor_image().  A caller-supplied encryptor
    image is always kept (keep_encryptor is forced to True).

    instance_config.brkt_config is modified: 'solo_mode' is set to
    'updater'.

    On any failure the serial console of the updater is written to a
    file when possible, the snapshot created here is deleted, and the
    exception is re-raised.  gce_svc.cleanup() runs in all cases.

    :return: encrypted_image_name
    """
    snap_created = False
    # Bound before the try block, so that the failure handler can tell
    # whether the updater name was assigned, instead of hitting a
    # NameError that would mask the real exception.
    updater = None
    try:
        # create image from file in GCS bucket
        log.info('Retrieving encryptor image from GCS bucket')
        if not encryptor_image:
            encryptor_image = gce_svc.get_latest_encryptor_image(
                zone, image_bucket, image_file=image_file)
        else:
            # Keep user provided encryptor image
            keep_encryptor = True

        instance_name = 'brkt-updater-' + gce_svc.get_session_id()
        updater = instance_name + '-metavisor'
        encrypted_image_disk = instance_name + '-guest'

        # Create disk from encrypted guest snapshot. This disk
        # won't be altered. It will be re-snapshotted and paired
        # with the new encryptor image.
        gce_svc.disk_from_snapshot(zone, image_id, encrypted_image_disk)
        gce_svc.wait_for_disk(zone, encrypted_image_disk)
        log.info("Creating snapshot of encrypted image disk")
        gce_svc.create_snapshot(
            zone, encrypted_image_disk, encrypted_image_name)
        snap_created = True

        log.info("Launching encrypted updater")
        instance_config.brkt_config['solo_mode'] = 'updater'
        user_data = gce_metadata_from_userdata(
            instance_config.make_userdata())
        gce_svc.run_instance(zone,
                             updater,
                             encryptor_image,
                             network=network,
                             disks=[],
                             metadata=user_data)
        enc_svc = enc_svc_cls([gce_svc.get_instance_ip(updater, zone)],
                              port=status_port)

        # wait for updater to finish and guest root disk
        wait_for_encryptor_up(enc_svc, Deadline(600))
        wait_for_encryption(enc_svc)

        # delete updater instance
        log.info('Deleting updater instance')
        gce_svc.delete_instance(zone, updater)

        # wait for updater root disk
        gce_svc.wait_for_detach(zone, updater)

        # create image from mv root disk and snapshot
        # encrypted guest root disk
        log.info("Creating updated metavisor image")
        gce_svc.create_gce_image_from_disk(
            zone, encrypted_image_name, updater)
        gce_svc.wait_image(encrypted_image_name)
        gce_svc.wait_snapshot(encrypted_image_name)
    except:
        # Bare except is deliberate: clean up on any exit, including
        # KeyboardInterrupt, and always re-raise below.
        if updater:
            f = gce_svc.write_serial_console_file(zone, updater)
            if f:
                log.info('Update failed. Writing console to %s' % f)
        log.info("Update failed. Cleaning up")
        if snap_created:
            gce_svc.delete_snapshot(encrypted_image_name)
        # NOTE(review): cleanup() runs again in the finally clause.
        # Kept as-is; confirm whether the second pass is needed.
        gce_svc.cleanup(zone, encryptor_image, keep_encryptor)
        raise
    finally:
        gce_svc.cleanup(zone, encryptor_image, keep_encryptor)
    return encrypted_image_name
def update_ami(aws_svc, encrypted_ami, updater_ami, encrypted_ami_name,
               subnet_id=None, security_group_ids=None,
               enc_svc_class=encryptor_service.EncryptorService,
               guest_instance_type='m3.medium',
               updater_instance_type='m3.medium',
               instance_config=None,
               status_port=encryptor_service.ENCRYPTOR_STATUS_PORT):
    """ Build a new AMI that combines the guest volumes of an encrypted
    AMI with the boot volume(s) of a freshly launched updater instance.

    An instance of encrypted_ami and an instance of updater_ami are
    launched.  Once the updater service reports completion, both
    instances are stopped, the old boot volume(s) of the guest are
    detached and deleted, the updater's boot volume is attached to the
    guest in their place, and an image is created from the guest
    instance.

    Side effects visible to the caller:
    * instance_config.brkt_config gets 'solo_mode' and 'status_port' set.
    * The updater's private IP address is appended to the NO_PROXY
      environment variable.

    All instances, the detached boot volume and any temporary security
    group created here are passed to clean_up(), whether or not the
    update succeeded.

    :param aws_svc: service object through which all AWS calls are made
    :param encrypted_ami: id of the encrypted guest image to update
    :param updater_ami: id of the image that the updater is launched from
    :param encrypted_ami_name: name passed to create_image() for the
        new AMI
    :param subnet_id: optional subnet, used for both instances
    :param security_group_ids: security groups for the updater.  If
        empty, a temporary group is created that allows access to
        status_port.
    :param enc_svc_class: class used to talk to the updater's status
        service.  It is constructed with the host IPs and the port.
    :param guest_instance_type: instance type of the guest instance
    :param updater_instance_type: instance type of the updater instance
    :param instance_config: InstanceConfig for the launched instances.
        A new one is created if None.
    :param status_port: port of the updater's status service
    :return: the value returned by aws_svc.create_image() (presumably
        the id of the new AMI)
    """
    # Everything below is tracked for cleanup in the finally clause.
    encrypted_guest = None
    updater = None
    mv_root_id = None
    temp_sg_id = None
    if instance_config is None:
        instance_config = InstanceConfig()

    try:
        guest_image = aws_svc.get_image(encrypted_ami)

        # Step 1. Launch encrypted guest AMI
        # Use 'updater' mode to avoid chain loading the guest
        # automatically. We just want this AMI/instance up as the
        # base to create a new AMI and preserve license
        # information embedded in the guest AMI
        log.info("Launching encrypted guest/updater")

        instance_config.brkt_config['solo_mode'] = 'updater'
        instance_config.brkt_config['status_port'] = status_port

        # The guest receives the brkt config as plain JSON user data.
        encrypted_guest = aws_svc.run_instance(
            encrypted_ami,
            instance_type=guest_instance_type,
            ebs_optimized=False,
            subnet_id=subnet_id,
            user_data=json.dumps(instance_config.brkt_config))
        aws_svc.create_tags(
            encrypted_guest.id,
            name=NAME_GUEST_CREATOR,
            description=DESCRIPTION_GUEST_CREATOR % {'image_id': encrypted_ami}
        )
        # Run updater in same zone as guest so we can swap volumes

        # The updater receives the full, gzipped user data.
        user_data = instance_config.make_userdata()
        compressed_user_data = gzip_user_data(user_data)

        # If the user didn't specify a security group, create a temporary
        # security group that allows brkt-cli to get status from the updater.
        run_instance = aws_svc.run_instance
        if not security_group_ids:
            vpc_id = None
            if subnet_id:
                subnet = aws_svc.get_subnet(subnet_id)
                vpc_id = subnet.vpc_id
            temp_sg_id = create_encryptor_security_group(
                aws_svc, vpc_id=vpc_id, status_port=status_port).id
            security_group_ids = [temp_sg_id]

            # Wrap with a retry, to handle eventual consistency issues with
            # the newly-created group.
            run_instance = aws_svc.retry(
                aws_svc.run_instance,
                error_code_regexp='InvalidGroup\.NotFound'
            )

        # placement is copied from the guest, so that both instances end
        # up in the same place and the volumes can be swapped.
        updater = run_instance(
            updater_ami,
            instance_type=updater_instance_type,
            user_data=compressed_user_data,
            ebs_optimized=False,
            subnet_id=subnet_id,
            placement=encrypted_guest.placement,
            security_group_ids=security_group_ids)
        aws_svc.create_tags(
            updater.id,
            name=NAME_METAVISOR_UPDATER,
            description=DESCRIPTION_METAVISOR_UPDATER,
        )
        wait_for_instance(aws_svc, encrypted_guest.id, state="running")
        log.info("Launched guest: %s Updater: %s" %
                 (encrypted_guest.id, updater.id)
                 )

        # Step 2. Wait for the updater to finish and stop the instances
        aws_svc.stop_instance(encrypted_guest.id)

        updater = wait_for_instance(aws_svc, updater.id, state="running")
        # Collect every address that the updater may be reachable on.
        host_ips = []
        if updater.ip_address:
            host_ips.append(updater.ip_address)
        if updater.private_ip_address:
            host_ips.append(updater.private_ip_address)
            # Extend NO_PROXY with the private address.
            # NOTE(review): presumably so that status requests bypass a
            # configured proxy -- confirm.
            log.info('Adding %s to NO_PROXY environment variable' %
                     updater.private_ip_address)
            if os.environ.get('NO_PROXY'):
                os.environ['NO_PROXY'] += "," + \
                    updater.private_ip_address
            else:
                os.environ['NO_PROXY'] = updater.private_ip_address

        enc_svc = enc_svc_class(host_ips, port=status_port)
        log.info('Waiting for updater service on %s (port %s on %s)',
                 updater.id, enc_svc.port, ', '.join(host_ips))
        wait_for_encryptor_up(enc_svc, Deadline(600))
        try:
            wait_for_encryption(enc_svc)
        except Exception as e:
            # Stop the updater instance, to make the console log available.
            encrypt_ami.stop_and_wait(aws_svc, updater.id)

            log_exception_console(aws_svc, e, updater.id)
            raise

        # Both instances must be stopped before their volumes are moved.
        # The refreshed objects carry the block device mappings used
        # below.
        aws_svc.stop_instance(updater.id)
        encrypted_guest = wait_for_instance(
            aws_svc, encrypted_guest.id, state="stopped")
        updater = wait_for_instance(aws_svc, updater.id, state="stopped")

        guest_bdm = encrypted_guest.block_device_mapping
        updater_bdm = updater.block_device_mapping

        # Step 3. Detach old BSD drive(s) and delete from encrypted guest
        # d_list holds the guest devices that get replaced.  Paravirtual
        # images use three fixed device names; other images use only the
        # root device.
        if guest_image.virtualization_type == 'paravirtual':
            d_list = ['/dev/sda1', '/dev/sda2', '/dev/sda3']
        else:
            d_list = [encrypted_guest.root_device_name]
        for d in d_list:
            log.info("Detaching old metavisor disk: %s from %s" %
                     (guest_bdm[d].volume_id, encrypted_guest.id))
            aws_svc.detach_volume(guest_bdm[d].volume_id,
                                  instance_id=encrypted_guest.id,
                                  force=True
                                  )
            aws_svc.delete_volume(guest_bdm[d].volume_id)

        # Step 4. Snapshot MV volume(s)
        log.info("Creating snapshots")
        if guest_image.virtualization_type == 'paravirtual':
            # /dev/sda2 and /dev/sda3 of the updater are snapshotted and
            # referenced by snapshot in the new block device mapping.
            description = DESCRIPTION_SNAPSHOT % {'image_id': updater.id}
            snap_root = aws_svc.create_snapshot(
                updater_bdm['/dev/sda2'].volume_id,
                name=NAME_METAVISOR_ROOT_SNAPSHOT,
                description=description
            )
            snap_log = aws_svc.create_snapshot(
                updater_bdm['/dev/sda3'].volume_id,
                name=NAME_METAVISOR_LOG_SNAPSHOT,
                description=description
            )
            wait_for_snapshots(aws_svc, snap_root.id, snap_log.id)
            dev_root = EBSBlockDeviceType(volume_type='gp2',
                                          snapshot_id=snap_root.id,
                                          delete_on_termination=True)
            dev_log = EBSBlockDeviceType(volume_type='gp2',
                                         snapshot_id=snap_log.id,
                                         delete_on_termination=True)
            guest_bdm['/dev/sda2'] = dev_root
            guest_bdm['/dev/sda3'] = dev_log
            # Use updater as base instance for create_image
            boot_snap_name = NAME_METAVISOR_GRUB_SNAPSHOT
            root_device_name = updater.root_device_name
            guest_root = '/dev/sda5'
            d_list.append(guest_root)
        else:
            # Use guest_instance as base instance for create_image
            boot_snap_name = NAME_METAVISOR_ROOT_SNAPSHOT
            root_device_name = guest_image.root_device_name
            guest_root = '/dev/sdf'
            d_list.append(guest_root)

        # Preserve volume type for any additional attached volumes
        # (every device that is not in d_list, which by now also
        # contains the guest root device).
        for d in guest_bdm.keys():
            if d not in d_list:
                log.debug("Preserving volume type for disk %s", d)
                vol_id = guest_bdm[d].volume_id
                vol = aws_svc.get_volume(vol_id)
                guest_bdm[d].volume_type = vol.type

        # Step 5. Move new MV boot disk to base instance
        log.info("Detach boot volume from %s" % (updater.id,))
        # Recorded before detaching, so that the finally clause can clean
        # the volume up if a later step fails.
        mv_root_id = updater_bdm['/dev/sda1'].volume_id
        aws_svc.detach_volume(mv_root_id,
                              instance_id=updater.id,
                              force=True
                              )

        # Step 6. Attach new boot disk to guest instance
        log.info("Attaching new metavisor boot disk: %s to %s" %
                 (mv_root_id, encrypted_guest.id)
                 )
        aws_svc.attach_volume(mv_root_id, encrypted_guest.id, root_device_name)
        encrypted_guest = encrypt_ami.wait_for_volume_attached(
            aws_svc, encrypted_guest.id, root_device_name)
        # Take the mapping entry of the newly attached boot volume from
        # the refreshed instance.
        guest_bdm[root_device_name] = \
            encrypted_guest.block_device_mapping[root_device_name]
        guest_bdm[root_device_name].delete_on_termination = True
        guest_bdm[root_device_name].volume_type = 'gp2'
        # The guest root keeps the volume type that it currently has.
        guest_root_vol_id = guest_bdm[guest_root].volume_id
        guest_root_vol = aws_svc.get_volume(guest_root_vol_id)
        guest_bdm[guest_root].volume_type = guest_root_vol.type

        # Step 7. Create new AMI. Preserve billing/license info
        log.info("Creating new AMI")
        ami = aws_svc.create_image(
            encrypted_guest.id,
            encrypted_ami_name,
            description=guest_image.description,
            no_reboot=True,
            block_device_mapping=guest_bdm
        )
        wait_for_image(aws_svc, ami)
        image = aws_svc.get_image(ami, retry=True)
        # Name the snapshots that back the new image, then tag the image
        # itself.
        aws_svc.create_tags(
            image.block_device_mapping[root_device_name].snapshot_id,
            name=boot_snap_name,
        )
        aws_svc.create_tags(
            image.block_device_mapping[guest_root].snapshot_id,
            name=NAME_ENCRYPTED_ROOT_SNAPSHOT,
        )
        aws_svc.create_tags(ami)
        return ami
    finally:
        # Runs on success and on failure: hand every resource that was
        # created so far to clean_up().
        instance_ids = set()
        volume_ids = set()
        sg_ids = set()

        if encrypted_guest:
            instance_ids.add(encrypted_guest.id)
        if updater:
            instance_ids.add(updater.id)
        if mv_root_id:
            volume_ids.add(mv_root_id)
        if temp_sg_id:
            sg_ids.add(temp_sg_id)

        clean_up(aws_svc,
                 instance_ids=instance_ids,
                 volume_ids=volume_ids,
                 security_group_ids=sg_ids)
def wait_for_encryption(enc_svc,
                        progress_timeout=ENCRYPTION_PROGRESS_TIMEOUT):
    """ Poll the encryptor service until encryption succeeds.

    The status is fetched every 10 seconds.  Progress is logged at most
    once a minute.

    :raises UnsupportedGuestError, AWSPermissionsError or
        InvalidNtpServerError if the service reports a failure with the
        corresponding failure code
    :raises EncryptionError if the service reports any other failure, if
        neither the state nor percent_complete advances for
        progress_timeout seconds, or if fetching the status fails 10
        times in a row
    """
    max_failures = 10
    failures = 0
    last_report_time = time.time()
    stall_deadline = Deadline(progress_timeout)
    best_percent = 0
    previous_state = ''

    while failures < max_failures:
        try:
            status = enc_svc.get_status()
        except Exception as e:
            log.warn("Failed getting encryption status: %s", e)
            log.warn("Retrying. . .")
            failures += 1
            sleep(10)
            continue
        else:
            # Only consecutive failures count against the limit.
            failures = 0

        state = status['state']
        percent_complete = status['percent_complete']
        log.debug('state=%s, percent_complete=%d', state, percent_complete)

        # Give up if nothing has changed since the deadline was last reset.
        if stall_deadline.is_expired():
            raise EncryptionError(
                'Waited for encryption progress for longer than %s seconds' %
                progress_timeout)

        if percent_complete > best_percent or state != previous_state:
            best_percent = percent_complete
            previous_state = state
            stall_deadline = Deadline(progress_timeout)

        # Report progress, but no more often than once a minute.
        now = time.time()
        if now - last_report_time >= 60:
            if state == ENCRYPT_INITIALIZING:
                log.info('Encryption process is initializing')
            else:
                label = (
                    'Download from S3'
                    if state == ENCRYPT_DOWNLOADING else 'Encryption'
                )
                log.info('%s is %d%% complete', label, percent_complete)
            last_report_time = now

        if state == ENCRYPT_SUCCESSFUL:
            log.info('Encrypted root drive created.')
            return

        if state == ENCRYPT_FAILED:
            log.debug('Encryption failed with status %s', status)
            failure_code = status.get('failure_code')

            # Failure codes that map to a dedicated exception type,
            # checked in this order.
            known_failures = (
                (FAILURE_CODE_UNSUPPORTED_GUEST,
                 UnsupportedGuestError,
                 'The specified AMI uses an unsupported operating system'),
                (FAILURE_CODE_AWS_PERMISSIONS,
                 AWSPermissionsError,
                 'The specified IAM profile has insufficient permissions'),
                (FAILURE_CODE_INVALID_NTP_SERVERS,
                 InvalidNtpServerError,
                 'Invalid NTP servers provided.'),
            )
            for code, error_class, message in known_failures:
                if failure_code == code:
                    raise error_class(message)

            msg = 'Encryption failed'
            if failure_code:
                msg += ' with code %s' % failure_code
            raise EncryptionError(msg)

        sleep(10)

    # Too many consecutive status failures.  Assume that the server
    # has crashed.
    raise EncryptionError('Encryption service unavailable')
def wait_for_encryption(enc_svc,
                        progress_timeout=ENCRYPTION_PROGRESS_TIMEOUT):
    """ Poll the encryptor service until encryption succeeds.

    The status is fetched every 10 seconds.  Progress is logged at most
    once a minute.

    :raises UnsupportedGuestError or AWSPermissionsError if the service
        reports a failure with the corresponding failure code
    :raises EncryptionError if the service reports any other failure, if
        neither the state nor percent_complete advances for
        progress_timeout seconds, or if fetching the status fails 10
        times in a row
    """
    max_failures = 10
    failures = 0
    last_report_time = time.time()
    stall_deadline = Deadline(progress_timeout)
    best_percent = 0
    previous_state = ''

    while failures < max_failures:
        try:
            status = enc_svc.get_status()
        except Exception as e:
            log.warn("Failed getting encryption status: %s", e)
            log.warn("Retrying. . .")
            failures += 1
            sleep(10)
            continue
        else:
            # Only consecutive failures count against the limit.
            failures = 0

        state = status['state']
        percent_complete = status['percent_complete']
        log.debug('state=%s, percent_complete=%d', state, percent_complete)

        # Give up if nothing has changed since the deadline was last reset.
        if stall_deadline.is_expired():
            raise EncryptionError(
                'Waited for encryption progress for longer than %s seconds' %
                progress_timeout
            )

        if percent_complete > best_percent or state != previous_state:
            best_percent = percent_complete
            previous_state = state
            stall_deadline = Deadline(progress_timeout)

        # Report progress, but no more often than once a minute.
        now = time.time()
        if now - last_report_time >= 60:
            if state == encryptor_service.ENCRYPT_INITIALIZING:
                log.info('Encryption process is initializing')
            elif state == encryptor_service.ENCRYPT_DOWNLOADING:
                log.info(
                    '%s is %d%% complete',
                    'Download from S3', percent_complete)
            else:
                log.info(
                    '%s is %d%% complete', 'Encryption', percent_complete)
            last_report_time = now

        if state == encryptor_service.ENCRYPT_SUCCESSFUL:
            log.info('Encrypted root drive created.')
            return

        if state == encryptor_service.ENCRYPT_FAILED:
            failure_code = status.get('failure_code')
            log.debug('failure_code=%s', failure_code)

            unsupported = encryptor_service.FAILURE_CODE_UNSUPPORTED_GUEST
            if failure_code == unsupported:
                raise UnsupportedGuestError(
                    'The specified AMI uses an unsupported operating system')
            if failure_code == encryptor_service.FAILURE_CODE_AWS_PERMISSIONS:
                raise AWSPermissionsError(
                    'The specified IAM profile has insufficient permissions')
            raise EncryptionError('Encryption failed')

        sleep(10)

    # Too many consecutive status failures.  Assume that the server
    # has crashed.
    raise EncryptionError('Encryption service unavailable')