def test_sort_by_fitness_calls_all_sorting_funcs():
    with mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_system_instance_health",
        autospec=True,
    ) as mock_sort_by_system_instance_health, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_upcoming_events", autospec=True
    ) as mock_sort_by_upcoming_events, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_running_batch_count",
        autospec=True,
    ) as mock_sort_by_running_batch_count, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_total_tasks", autospec=True
    ) as mock_sort_by_total_tasks:
        instances = []
        ec2_fitness.sort_by_ec2_fitness(instances)
        assert mock_sort_by_total_tasks.called
        assert mock_sort_by_running_batch_count.called
        assert mock_sort_by_upcoming_events.called
        assert mock_sort_by_system_instance_health.called
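# A minimal sketch of what sort_by_ec2_fitness could look like for the test
# above to pass. The test only verifies that each helper is called; this
# nesting order is an assumption based on the kill preference documented in
# the next test (the helpers themselves are sketched after it).
def sort_by_ec2_fitness(instances):
    # Assuming each helper is a stable sort, the outermost criterion
    # (system/instance health) dominates the final ordering.
    return sort_by_system_instance_health(
        sort_by_upcoming_events(
            sort_by_running_batch_count(sort_by_total_tasks(instances))
        )
    )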
def test_sort_by_fitness():
    mock_slave_1 = Mock(name="slave1")
    mock_slave_1.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_1.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "impaired"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_2 = Mock(name="slave2")
    mock_slave_2.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_2.instance_status = {
        "Events": [
            {
                "Code": "instance-reboot",
                "Description": "foo",
                "NotBefore": datetime(2015, 1, 1),
                "NotAfter": datetime(2015, 1, 1),
            }
        ],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_3 = Mock(name="slave3")
    mock_slave_3.task_counts = SlaveTaskCount(count=2, slave=Mock(), batch_count=3)
    mock_slave_3.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_4 = Mock(name="slave4")
    mock_slave_4.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_4.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_5 = Mock(name="slave5")
    mock_slave_5.task_counts = SlaveTaskCount(count=1, slave=Mock(), batch_count=1)
    mock_slave_5.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    ret = ec2_fitness.sort_by_ec2_fitness(
        [mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5]
    )
    # we expect this order for the following reasons:
    # mock_slave_1 is impaired and so should be killed asap
    # mock_slave_2 has an upcoming maintenance event
    # mock_slave_5 and mock_slave_4 have the fewest batch tasks, and so should be
    # killed before mock_slave_3 (we can't drain batch tasks, so try to save them)
    # mock_slave_5 has fewer tasks than mock_slave_4, and so is a better candidate
    # for killing
    assert ret == [mock_slave_3, mock_slave_4, mock_slave_5, mock_slave_2, mock_slave_1]
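# One hedged guess at the four sorting helpers, written so that chaining them
# as sketched earlier reproduces exactly the order asserted in this test. The
# field names come from the test fixtures; everything else is an assumption,
# not the library's actual implementation.
def sort_by_total_tasks(instances):
    # Most tasks first: emptier slaves are better kill candidates.
    return sorted(instances, key=lambda i: i.task_counts.count, reverse=True)


def sort_by_running_batch_count(instances):
    # Batch tasks can't be drained, so slaves running more of them sort
    # further from the kill end.
    return sorted(instances, key=lambda i: i.task_counts.batch_count, reverse=True)


def sort_by_upcoming_events(instances):
    # Slaves with scheduled maintenance events sort towards the kill end.
    return sorted(instances, key=lambda i: len(i.instance_status["Events"]))


def sort_by_system_instance_health(instances):
    # Impaired slaves sort last of all, i.e. they get killed first.
    def is_healthy(instance):
        return (
            instance.instance_status["SystemStatus"]["Status"] == "ok"
            and instance.instance_status["InstanceStatus"]["Status"] == "ok"
        )

    return sorted(instances, key=is_healthy, reverse=True)


# Because sorted() is stable, applying these from total tasks outwards to
# health yields [slave_3, slave_4, slave_5, slave_2, slave_1] for the
# fixtures above.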
def downscale_aws_resource(self, filtered_slaves, current_capacity, target_capacity):
    killed_slaves = 0
    while True:
        # Reverse the fitness ordering so the least fit slave is considered first
        filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(filtered_slaves)[::-1]
        if len(filtered_sorted_slaves) == 0:
            self.log.info("ALL slaves killed so moving on to next resource!")
            break
        self.log.info("Resource slave kill preference: {}".format(
            [slave.hostname for slave in filtered_sorted_slaves],
        ))
        slave_to_kill = filtered_sorted_slaves.pop(0)
        instance_capacity = slave_to_kill.instance_weight
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            self.log.info(
                "Terminating instance {} with weight {} would take us below our target of {},"
                " so this is as close to our target as we can get".format(
                    slave_to_kill.instance_id,
                    slave_to_kill.instance_weight,
                    target_capacity,
                ),
            )
            if self.resource['type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                self.log.info(
                    "This is an SFR so we must kill at least one slave to prevent the autoscaler "
                    "getting stuck whilst scaling down gradually",
                )
            else:
                break
        try:
            self.gracefully_terminate_slave(
                slave_to_kill=slave_to_kill,
                current_capacity=current_capacity,
                new_capacity=new_capacity,
            )
            killed_slaves += 1
        except HTTPError:
            # Something went wrong draining the host, so try the next one
            continue
        except FailSetResourceCapacity:
            break
        current_capacity = new_capacity
        # Refresh task counts for the remaining slaves before the next pass
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            task_counts = get_mesos_task_count_by_slave(
                mesos_state,
                slaves_list=[
                    {'task_counts': slave.task_counts}
                    for slave in filtered_sorted_slaves
                ],
            )
            for i, slave in enumerate(filtered_sorted_slaves):
                slave.task_counts = task_counts[i]['task_counts']
        filtered_slaves = filtered_sorted_slaves
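# A toy, self-contained walk-through of the capacity arithmetic in the loop
# above; all weights and capacities are made up. The real method additionally
# refreshes task counts each pass and has the spot-fleet carve-out that kills
# at least one slave.
def plan_downscale(weights, current_capacity, target_capacity):
    """Keep killing the next slave in kill-preference order while doing so
    stays at or above the target capacity."""
    killed = []
    for weight in weights:
        new_capacity = current_capacity - weight
        if new_capacity < target_capacity:
            break  # terminating this slave would overshoot the target
        killed.append(weight)
        current_capacity = new_capacity
    return killed, current_capacity


# With a target of 6, only the first instance (weight 3) can be killed:
# 10 - 3 = 7 >= 6, but 7 - 2 = 5 < 6.
assert plan_downscale([3, 2, 2], 10, 6) == ([3], 7)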
def test_sort_by_fitness():
    mock_slave_1 = Mock(name='slave1')
    mock_slave_1.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_1.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'impaired'},
        'InstanceStatus': {'Status': 'ok'},
    }
    mock_slave_2 = Mock(name='slave2')
    mock_slave_2.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_2.instance_status = {
        'Events': [
            {
                'Code': 'instance-reboot',
                'Description': 'foo',
                'NotBefore': datetime(2015, 1, 1),
                'NotAfter': datetime(2015, 1, 1),
            },
        ],
        'SystemStatus': {'Status': 'ok'},
        'InstanceStatus': {'Status': 'ok'},
    }
    mock_slave_3 = Mock(name='slave3')
    mock_slave_3.task_counts = SlaveTaskCount(
        count=2,
        slave=Mock(),
        batch_count=3,
    )
    mock_slave_3.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok'},
        'InstanceStatus': {'Status': 'ok'},
    }
    mock_slave_4 = Mock(name='slave4')
    mock_slave_4.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_4.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok'},
        'InstanceStatus': {'Status': 'ok'},
    }
    mock_slave_5 = Mock(name='slave5')
    mock_slave_5.task_counts = SlaveTaskCount(
        count=1,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_5.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok'},
        'InstanceStatus': {'Status': 'ok'},
    }
    ret = ec2_fitness.sort_by_ec2_fitness(
        [mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5],
    )
    # we expect this order for the following reasons:
    # mock_slave_1 is impaired and so should be killed asap
    # mock_slave_2 has an upcoming maintenance event
    # mock_slave_5 and mock_slave_4 have the fewest batch (chronos) tasks, and so
    # should be killed before mock_slave_3 (we can't drain batch tasks, so try to
    # save them)
    # mock_slave_5 has fewer tasks than mock_slave_4, and so is a better candidate
    # for killing
    assert ret == [mock_slave_3, mock_slave_4, mock_slave_5, mock_slave_2, mock_slave_1]
def test_sort_by_fitness():
    mock_slave_1 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'impaired'},
            'InstanceStatus': {'Status': 'ok'},
        },
    )
    mock_slave_2 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [
                {
                    'Code': 'instance-reboot',
                    'Description': 'foo',
                    'NotBefore': datetime(2015, 1, 1),
                    'NotAfter': datetime(2015, 1, 1),
                },
            ],
            'SystemStatus': {'Status': 'ok'},
            'InstanceStatus': {'Status': 'ok'},
        },
    )
    mock_slave_3 = Mock(
        task_counts=SlaveTaskCount(
            count=2,
            slave=Mock(),
            chronos_count=3,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok'},
            'InstanceStatus': {'Status': 'ok'},
        },
    )
    mock_slave_4 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok'},
            'InstanceStatus': {'Status': 'ok'},
        },
    )
    mock_slave_5 = Mock(
        task_counts=SlaveTaskCount(
            count=1,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok'},
            'InstanceStatus': {'Status': 'ok'},
        },
    )
    ret = ec2_fitness.sort_by_ec2_fitness(
        [mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5],
    )
    assert ret == [mock_slave_5, mock_slave_4, mock_slave_3, mock_slave_2, mock_slave_1]
async def downscale_aws_resource(self, filtered_slaves, current_capacity, target_capacity):
    self.log.info("downscale_aws_resource for %s" % filtered_slaves)
    killed_slaves = 0
    terminate_tasks = {}
    self.capacity = current_capacity
    timer = Timer(300)
    while True:
        filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(filtered_slaves)[::-1]
        if len(filtered_sorted_slaves) == 0:
            self.log.info("ALL slaves killed so moving on to next resource!")
            break
        self.log.info("Resource slave kill preference: {}".format(
            [slave.hostname for slave in filtered_sorted_slaves],
        ))
        slave_to_kill = filtered_sorted_slaves.pop(0)
        instance_capacity = slave_to_kill.instance_weight
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            self.log.info(
                "Terminating instance {} with weight {} would take us below our target of {},"
                " so this is as close to our target as we can get".format(
                    slave_to_kill.instance_id,
                    slave_to_kill.instance_weight,
                    target_capacity,
                ),
            )
            if self.resource['type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                self.log.info(
                    "This is an SFR so we must kill at least one slave to prevent the autoscaler "
                    "getting stuck whilst scaling down gradually",
                )
                if new_capacity < 1 and self.sfr['SpotFleetRequestState'] == 'active':
                    self.log.info(
                        "Can't set target capacity to less than 1 for SFRs. No further "
                        "action for this SFR",
                    )
                    break
            else:
                break
        capacity_diff = new_capacity - current_capacity
        self.log.info("Starting async kill for %s" % slave_to_kill.hostname)
        # ensure_future wraps gracefully_terminate_slave in a Task. The Task does
        # not start running here; it runs once this coroutine yields control (at
        # the awaits below), so the terminations for all slaves proceed
        # concurrently while this loop keeps picking the next slave to kill
        terminate_tasks[slave_to_kill.hostname] = asyncio.ensure_future(
            self.gracefully_terminate_slave(
                slave_to_kill=slave_to_kill,
                capacity_diff=capacity_diff,
                timer=timer,
            ),
        )
        killed_slaves += 1
        current_capacity = new_capacity
        filtered_slaves = filtered_sorted_slaves
    # Now we wait for each termination task to actually finish...
    for hostname, task in terminate_tasks.items():
        try:
            await task
        except (HTTPError, FailSetResourceCapacity):
            continue
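# A minimal, runnable illustration of the ensure_future pattern used above;
# terminate() is a stand-in for gracefully_terminate_slave, and the hostnames
# and delays are made up.
import asyncio


async def terminate(hostname, delay):
    # Stand-in for a slow drain-and-terminate; the await is where other
    # scheduled terminations get a chance to run.
    await asyncio.sleep(delay)
    return hostname


async def main():
    # Schedule every termination up front without waiting for any of them
    terminate_tasks = {
        host: asyncio.ensure_future(terminate(host, delay))
        for host, delay in [("slave-a", 0.2), ("slave-b", 0.1)]
    }
    # ...then await them one by one, as the method above does; both sleeps
    # overlap, so this finishes in ~0.2s rather than ~0.3s
    for host, task in terminate_tasks.items():
        print(host, "terminated:", await task)


asyncio.run(main())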