def test_is_safe_to_kill(mock_get_hosts_past_maintenance_start, mock_is_host_drained):
    """Check is_safe_to_kill for every drained / past-maintenance-start combination.

    Only the combination "not drained and not past maintenance start" is
    expected to give a falsy answer.
    """
    host = "blah"
    # (is_host_drained result, hosts past maintenance start, expected verdict)
    scenarios = (
        (False, [], False),
        (False, [host], True),
        (True, [host], True),
        (True, [], True),
    )
    for drained, hosts_past_start, expected in scenarios:
        mock_is_host_drained.return_value = drained
        mock_get_hosts_past_maintenance_start.return_value = hosts_past_start
        verdict = paasta_maintenance.is_safe_to_kill(host)
        # Compare truthiness only, like a bare `assert` / `assert not` would.
        assert bool(verdict) is expected
def wait_and_terminate(self, slave, drain_timeout, dry_run, region=None):
    """Wait for a slave to be drained and then terminate its EC2 instance.

    ``is_safe_to_kill`` is polled every 5 seconds. As soon as it reports the
    host as safe (or ``dry_run`` is set) the instance is terminated. If
    ``TimeoutError`` is raised while waiting inside
    ``Timeout(seconds=drain_timeout + 300)``, the instance is terminated
    anyway.

    If the slave has no instance ID a warning is logged and nothing else
    happens.

    :param slave: slave object exposing ``instance_id``, ``hostname``, ``ip``
        and ``pid`` attributes
    :param drain_timeout: how long to wait before terminating even if not drained
    :param dry_run: Don't drain or make changes to spot fleet if True; the
        value is also forwarded to EC2 as ``DryRun``
    :param region: region to connect to ec2
    :raises ClientError: any EC2 error other than ``DryRunOperation``
    """
    # Read the ID once, before any waiting. Without one there is nothing to
    # terminate, so return instead of spinning until the timeout fires.
    instance_id = slave.instance_id
    if not instance_id:
        self.log.warning(
            "Didn't find instance ID for slave: {}. Skipping terminating"
            .format(slave.pid),
        )
        return

    ec2_client = boto3.client('ec2', region_name=region)

    def terminate_instance():
        # Shared by the normal and the timed-out path so both always pass
        # InstanceIds as a list, which is what the EC2 client expects.
        try:
            ec2_client.terminate_instances(
                InstanceIds=[instance_id], DryRun=dry_run)
        except ClientError as e:
            # Per the EC2 API docs a permitted dry run is answered with a
            # DryRunOperation error; anything else is a real failure.
            if e.response['Error'].get('Code') != 'DryRunOperation':
                raise

    try:
        # This loop should always finish because the maintenance window should
        # make is_safe_to_kill true. Just in case though we set a timeout and
        # terminate anyway.
        with Timeout(seconds=drain_timeout + 300):
            while True:
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave.hostname) or dry_run:
                    self.log.info(
                        "TERMINATING: {} (Hostname = {}, IP = {})".format(
                            instance_id,
                            slave.hostname,
                            slave.ip,
                        ))
                    terminate_instance()
                    break
                self.log.info("Instance {}: NOT ready to kill".format(
                    instance_id))
                self.log.debug("Waiting 5 seconds and then checking again")
                time.sleep(5)
    except TimeoutError:
        self.log.error(
            "Timed out after {} waiting to drain {}, now terminating anyway"
            .format(
                drain_timeout,
                slave.pid,
            ))
        terminate_instance()
def test_is_safe_to_kill(
    mock_get_hosts_past_maintenance_start,
    mock_is_host_drained,
):
    """Exercise is_safe_to_kill over the four drained / maintenance combinations."""

    def safe_to_kill(drained, past_maintenance_start):
        # Program both mocks, then ask the function under test about 'blah'.
        mock_is_host_drained.return_value = drained
        mock_get_hosts_past_maintenance_start.return_value = past_maintenance_start
        return paasta_maintenance.is_safe_to_kill('blah')

    # Neither drained nor past the maintenance start: must not be killable.
    assert not safe_to_kill(drained=False, past_maintenance_start=[])

    # Either condition (or both) is enough.
    assert safe_to_kill(drained=False, past_maintenance_start=['blah'])
    assert safe_to_kill(drained=True, past_maintenance_start=['blah'])
    assert safe_to_kill(drained=True, past_maintenance_start=[])
def wait_and_terminate(slave, drain_timeout, dry_run, region=None):
    """Wait for a slave to be drained and then terminate its EC2 instance.

    ``is_safe_to_kill`` is polled every 5 seconds. As soon as it reports the
    host as safe (or ``dry_run`` is set) the instance is terminated. If
    ``TimeoutError`` is raised while waiting inside
    ``Timeout(seconds=drain_timeout + 300)``, the instance is terminated
    anyway.

    If the slave has no instance ID a warning is logged and nothing else
    happens.

    :param slave: dict of slave to kill; the keys ``instance_id``,
        ``hostname``, ``ip`` and ``pid`` are read
    :param drain_timeout: how long to wait before terminating even if not drained
    :param dry_run: Don't drain or make changes to spot fleet if True; the
        value is also forwarded to EC2 as ``DryRun``
    :param region: region to connect to ec2
    :raises ClientError: any EC2 error other than ``DryRunOperation``
    """
    # Read the ID once, before any waiting. Without one there is nothing to
    # terminate, so return instead of spinning until the timeout fires.
    instance_id = slave['instance_id']
    if not instance_id:
        log.warning("Didn't find instance ID for slave: {0}. Skipping terminating".format(slave['pid']))
        return

    ec2_client = boto3.client('ec2', region_name=region)

    def terminate_instance():
        # Shared by the normal and the timed-out path so both always pass
        # InstanceIds as a list, which is what the EC2 client expects.
        try:
            ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        except ClientError as e:
            # Per the EC2 API docs a permitted dry run is answered with a
            # DryRunOperation error; anything else is a real failure.
            if e.response['Error'].get('Code') != 'DryRunOperation':
                raise

    try:
        # This loop should always finish because the maintenance window should
        # make is_safe_to_kill true. Just in case though we set a timeout and
        # terminate anyway.
        with Timeout(seconds=drain_timeout + 300):
            while True:
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave['hostname']) or dry_run:
                    log.info("TERMINATING: {0} (Hostname = {1}, IP = {2})".format(
                        instance_id,
                        slave['hostname'],
                        slave['ip'],
                    ))
                    terminate_instance()
                    break
                log.info("Instance {0}: NOT ready to kill".format(instance_id))
                log.debug("Waiting 5 seconds and then checking again")
                time.sleep(5)
    except TimeoutError:
        log.error("Timed out after {0} waiting to drain {1}, now terminating anyway".format(drain_timeout, slave['pid']))
        terminate_instance()
def wait_and_terminate(slave, drain_timeout, dry_run, region=None):
    """Waits for slave to be drained and then terminates its EC2 instance.

    The host is checked with ``is_safe_to_kill`` every 5 seconds and the
    instance is terminated once that check passes or ``dry_run`` is set. The
    wait runs inside ``Timeout(seconds=drain_timeout + 300)``; if
    ``TimeoutError`` is raised the instance is terminated regardless.

    A slave without an instance ID is only logged, never terminated.

    :param slave: dict of slave to kill; the keys ``instance_id``,
        ``hostname``, ``ip`` and ``pid`` are read
    :param drain_timeout: how long to wait before terminating even if not drained
    :param dry_run: Don't drain or make changes to spot fleet if True; the
        value is also forwarded to EC2 as ``DryRun``
    :param region: region to connect to ec2
    :raises ClientError: any EC2 error other than ``DryRunOperation``
    """
    # Look the ID up a single time. A missing ID used to send the polling
    # loop into a sleep-less `continue` spin; bail out right away instead.
    instance_id = slave['instance_id']
    if not instance_id:
        log.warning("Didn't find instance ID for slave: {0}. Skipping terminating".format(slave['pid']))
        return

    ec2_client = boto3.client('ec2', region_name=region)

    def terminate_instance():
        # One code path for both terminations, so InstanceIds is always the
        # list the EC2 client expects (the timeout path used a bare string).
        try:
            ec2_client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        except ClientError as e:
            # Per the EC2 API docs a permitted dry run is answered with a
            # DryRunOperation error; re-raise everything else.
            if e.response['Error'].get('Code') != 'DryRunOperation':
                raise

    try:
        # This loop should always finish because the maintenance window should
        # make is_safe_to_kill true. Just in case though we set a timeout and
        # terminate anyway.
        with Timeout(seconds=drain_timeout + 300):
            while True:
                # Check if no tasks are running or we have reached the maintenance window
                if is_safe_to_kill(slave['hostname']) or dry_run:
                    log.info("TERMINATING: {0} (Hostname = {1}, IP = {2})".format(
                        instance_id,
                        slave['hostname'],
                        slave['ip'],
                    ))
                    terminate_instance()
                    break
                log.info("Instance {0}: NOT ready to kill".format(instance_id))
                log.debug("Waiting 5 seconds and then checking again")
                time.sleep(5)
    except TimeoutError:
        log.error("Timed out after {0} waiting to drain {1}, now terminating anyway".format(drain_timeout, slave['pid']))
        terminate_instance()
def can_kill(self, hostname, should_drain, dry_run, timer):
    """Decide whether the slave on ``hostname`` may be terminated right now.

    :param hostname: host name handed to ``is_safe_to_kill``
    :param should_drain: when false the host is never reported as killable
        here; the caller keeps waiting until the timer expires
    :param dry_run: when true, answer True immediately without touching the timer
    :param timer: object providing ``ready()``, ``left()`` and ``start()``
    :returns: True for a dry run or when ``is_safe_to_kill`` approves the
        host, False otherwise
    :raises TimeoutError: when ``timer.ready()`` is true; ``timer.start()`` is
        called before raising
    """
    if dry_run:
        return True

    if timer.ready():
        self.log.warning(
            "Timer expired before slave ready to kill, proceding to terminate anyways"
        )
        # Restart the timer before signalling the expiry to the caller.
        timer.start()
        raise TimeoutError

    if should_drain:
        if not is_safe_to_kill(hostname):
            return False
        remaining = timer.left()
        self.log.info("Slave %s is ready to kill, with %s left on timer" % (hostname, remaining))
        timer.start()
        return True

    # Not draining: only the timer expiring (handled above) ends the wait.
    self.log.info("Not draining, waiting %s longer before killing" % timer.left())
    return False