def test_get_mesos_task_count_by_slave():
    """get_mesos_task_count_by_slave returns per-slave total and chronos-only
    task counts, filtered by pool or taken from an explicit slaves_list."""
    # contextlib.nested was removed in Python 3; a single mock.patch context
    # manager is equivalent for one patch.
    with mock.patch(
        'paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks', autospec=True,
    ) as mock_get_running_tasks_from_active_frameworks:
        mock_chronos = mock.Mock()
        mock_chronos.name = 'chronos'
        mock_marathon = mock.Mock()
        mock_marathon.name = 'marathon'
        # slave1 runs one chronos + one marathon task; slave2 runs two marathon tasks
        mock_task1 = mock.Mock()
        mock_task1.slave = {'id': 'slave1'}
        mock_task1.framework = mock_chronos
        mock_task2 = mock.Mock()
        mock_task2.slave = {'id': 'slave1'}
        mock_task2.framework = mock_marathon
        mock_task3 = mock.Mock()
        mock_task3.slave = {'id': 'slave2'}
        mock_task3.framework = mock_marathon
        mock_task4 = mock.Mock()
        mock_task4.slave = {'id': 'slave2'}
        mock_task4.framework = mock_marathon
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_running_tasks_from_active_frameworks.return_value = mock_tasks
        mock_slave_1 = {'id': 'slave1', 'attributes': {'pool': 'default'}, 'hostname': 'host1'}
        mock_slave_2 = {'id': 'slave2', 'attributes': {'pool': 'default'}, 'hostname': 'host2'}
        mock_slave_3 = {'id': 'slave3', 'attributes': {'pool': 'another'}, 'hostname': 'host3'}
        mock_mesos_state = {'slaves': [mock_slave_1, mock_slave_2, mock_slave_3]}

        # dicts are not orderable on Python 3, so order results by the slave id
        # embedded in each entry instead of sorting the dicts themselves
        def sort_key(slave_count):
            return slave_count['task_counts'].slave['id']

        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool='default')
        mock_get_running_tasks_from_active_frameworks.assert_called_with('')
        expected = [{'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1)},
                    {'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)}]
        assert len(ret) == len(expected) and sorted(ret, key=sort_key) == sorted(expected, key=sort_key)
        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool=None)
        mock_get_running_tasks_from_active_frameworks.assert_called_with('')
        expected = [{'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1)},
                    {'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)},
                    {'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)}]
        assert len(ret) == len(expected) and sorted(ret, key=sort_key) == sorted(expected, key=sort_key)
        # test slaves_list override: move one of slave1's tasks onto slave2
        mock_task2 = mock.Mock()
        mock_task2.slave = {'id': 'slave2'}
        mock_task2.framework = mock_marathon
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_running_tasks_from_active_frameworks.return_value = mock_tasks
        ret = mesos_tools.get_mesos_task_count_by_slave(
            mock_mesos_state, slaves_list=[mock_slave_1, mock_slave_2, mock_slave_3])
        expected = [{'id': 'slave1', 'attributes': {'pool': 'default'}, 'hostname': 'host1',
                     'task_counts': mesos_tools.SlaveTaskCount(count=1, chronos_count=1, slave=mock_slave_1)},
                    {'id': 'slave2', 'attributes': {'pool': 'default'}, 'hostname': 'host2',
                     'task_counts': mesos_tools.SlaveTaskCount(count=3, chronos_count=0, slave=mock_slave_2)},
                    {'id': 'slave3', 'attributes': {'pool': 'another'}, 'hostname': 'host3',
                     'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)}]
        assert len(ret) == len(expected) and sorted(ret, key=sort_key) == sorted(expected, key=sort_key)
def scale_resource(self, current_capacity, target_capacity):
    """Scales an AWS resource based on current and target capacity

    If scaling up we just set target capacity and let AWS take care of the rest.
    If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs), then kill them once they are running 0 tasks or once
    a timeout is reached.

    :param current_capacity: integer current resource capacity
    :param target_capacity: target resource capacity
    """
    target_capacity = int(target_capacity)
    delta = target_capacity - current_capacity
    if delta == 0:
        self.log.info("Already at target capacity: {}".format(target_capacity))
        return
    if delta > 0:
        # Scaling up is trivial: just raise the capacity and let AWS provision.
        self.log.info("Increasing resource capacity to: {}".format(target_capacity))
        self.set_capacity(target_capacity)
        return
    # delta < 0: scaling down. Find candidate slaves and sanity-check that they
    # carry enough combined instance weight to cover the requested decrease.
    mesos_state = get_mesos_master().state_summary()
    slaves_list = get_mesos_task_count_by_slave(mesos_state, pool=self.resource['pool'])
    filtered_slaves = self.filter_aws_slaves(slaves_list)
    killable_capacity = sum(slave.instance_weight for slave in filtered_slaves)
    amount_to_decrease = -delta
    if amount_to_decrease > killable_capacity:
        self.log.error(
            "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
        )
        return
    self.downscale_aws_resource(
        filtered_slaves=filtered_slaves,
        current_capacity=current_capacity,
        target_capacity=target_capacity,
    )
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until reaching target_capacity.

    :param resource: SFR resource dict (passed through to gracefully_terminate_slave)
    :param filtered_slaves: candidate slaves considered for termination
    :param current_capacity: current SFR capacity
    :param target_capacity: capacity we are scaling down towards
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: don't actually drain or terminate anything if True
    """
    while True:
        # Re-sort every pass: task counts for the survivors are refreshed at
        # the bottom of the loop after each kill.
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            break
        log.info("SFR slave kill preference: {0}".format(
            [slave['hostname'] for slave in filtered_sorted_slaves]))
        # reverse + pop() removes the most preferred victim from the list,
        # leaving the remaining candidates in filtered_sorted_slaves
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            # Killing this instance would undershoot the target, so stop here.
            log.info(
                "Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                " close to our target as we can get".format(
                    slave_to_kill['instance_id'],
                    slave_to_kill['instance_weight'],
                    target_capacity))
            break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        # Refresh task counts for the remaining candidates before the next pass.
        mesos_state = get_mesos_master().state_summary()
        filtered_slaves = get_mesos_task_count_by_slave(
            mesos_state, slaves_list=filtered_sorted_slaves)
def is_host_drained(hostname):
    """Report whether a host has fully drained.

    A host counts as drained when it is currently draining AND runs zero tasks.

    :param hostname: hostname to check
    :returns: True or False
    """
    mesos_state = get_mesos_state_summary_from_leader()
    task_counts = get_mesos_task_count_by_slave(mesos_state)
    # Hosts absent from the task-count mapping are treated as running no tasks.
    slave_task_count = task_counts[hostname].count if hostname in task_counts else 0
    return is_host_draining(hostname=hostname) and slave_task_count == 0
def downscale_aws_resource(self, filtered_slaves, current_capacity, target_capacity):
    """Drain and terminate slaves one at a time until capacity reaches target_capacity.

    :param filtered_slaves: candidate slave objects considered for termination
    :param current_capacity: current resource capacity
    :param target_capacity: capacity we are scaling down towards
    """
    killed_slaves = 0
    while True:
        # Least-fit slaves first: sort by ec2 fitness, then reverse so we pop
        # the best kill candidate from the front each pass.
        filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(
            filtered_slaves)[::-1]
        if len(filtered_sorted_slaves) == 0:
            self.log.info(
                "ALL slaves killed so moving on to next resource!")
            break
        self.log.info("Resource slave kill preference: {}".format(
            [slave.hostname for slave in filtered_sorted_slaves]))
        slave_to_kill = filtered_sorted_slaves.pop(0)
        instance_capacity = slave_to_kill.instance_weight
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            self.log.info(
                "Terminating instance {} with weight {} would take us below our target of {},"
                " so this is as close to our target as we can get".format(
                    slave_to_kill.instance_id,
                    slave_to_kill.instance_weight,
                    target_capacity))
            # For spot fleet requests we still kill one slave even though it
            # undershoots the target, otherwise the autoscaler could get stuck
            # never making progress when scaling down gradually.
            if self.resource[
                    'type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                self.log.info(
                    "This is a SFR so we must kill at least one slave to prevent the autoscaler "
                    "getting stuck whilst scaling down gradually")
            else:
                break
        try:
            self.gracefully_terminate_slave(
                slave_to_kill=slave_to_kill,
                current_capacity=current_capacity,
                new_capacity=new_capacity)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetResourceCapacity:
            break
        current_capacity = new_capacity
        # Refresh task counts on the surviving candidates (mutates the slave
        # objects in place) before the next pass.
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            task_counts = get_mesos_task_count_by_slave(
                mesos_state,
                slaves_list=[{
                    'task_counts': slave.task_counts
                } for slave in filtered_sorted_slaves])
            for i, slave in enumerate(filtered_sorted_slaves):
                slave.task_counts = task_counts[i]['task_counts']
        filtered_slaves = filtered_sorted_slaves
def test_get_mesos_task_count_by_slave():
    """get_mesos_task_count_by_slave returns a mapping of hostname to
    SlaveTaskCount (total and chronos-only counts), optionally pool-filtered."""
    # contextlib.nested was removed in Python 3; a single mock.patch context
    # manager is equivalent, and autospec=True validates the call signature.
    with mock.patch(
        'paasta_tools.mesos_tools.get_running_tasks_from_active_frameworks', autospec=True,
    ) as mock_get_running_tasks_from_active_frameworks:
        mock_chronos = mock.Mock()
        mock_chronos.name = 'chronos'
        mock_marathon = mock.Mock()
        mock_marathon.name = 'marathon'
        # slave1 runs one chronos + one marathon task; slave2 runs two marathon tasks
        mock_task1 = mock.Mock()
        mock_task1.slave = {'id': 'slave1'}
        mock_task1.framework = mock_chronos
        mock_task2 = mock.Mock()
        mock_task2.slave = {'id': 'slave1'}
        mock_task2.framework = mock_marathon
        mock_task3 = mock.Mock()
        mock_task3.slave = {'id': 'slave2'}
        mock_task3.framework = mock_marathon
        mock_task4 = mock.Mock()
        mock_task4.slave = {'id': 'slave2'}
        mock_task4.framework = mock_marathon
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_running_tasks_from_active_frameworks.return_value = mock_tasks
        mock_slave_1 = {'id': 'slave1', 'attributes': {'pool': 'default'}, 'hostname': 'host1'}
        mock_slave_2 = {'id': 'slave2', 'attributes': {'pool': 'default'}, 'hostname': 'host2'}
        mock_slave_3 = {'id': 'slave3', 'attributes': {'pool': 'another'}, 'hostname': 'host3'}
        mock_mesos_state = {'slaves': [mock_slave_1, mock_slave_2, mock_slave_3]}
        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool='default')
        mock_get_running_tasks_from_active_frameworks.assert_called_with('')
        assert ret == {'host1': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1),
                       'host2': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)}
        # pool=None disables pool filtering, so slave3 appears with zero tasks
        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool=None)
        mock_get_running_tasks_from_active_frameworks.assert_called_with('')
        assert ret == {'host1': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1),
                       'host2': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2),
                       'host3': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)}
def sort_slaves_to_kill(mesos_state, pool='default'):
    """Pick the best slaves to kill.

    Orders the slaves in the pool by preference for termination: slaves
    running the fewest chronos tasks come first, ties broken by fewest total
    tasks.

    :param mesos_state: mesos_state dict
    :param pool: pool of slaves to consider
    :returns: list of slaves
    """
    task_counts = get_mesos_task_count_by_slave(mesos_state, pool=pool)
    if not task_counts:
        return []
    ordered = sorted(task_counts.values(), key=lambda sc: (sc.chronos_count, sc.count))
    return [sc.slave for sc in ordered]
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity.

    If scaling up we just set target capacity and let AWS take care of the
    rest. If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs), then kill them once they are running 0 tasks or once
    a timeout is reached.

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: Don't drain or make changes to spot fleet if True
    """
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    sfr_id = resource['id']
    delta = target_capacity - current_capacity
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    if delta > 0:
        # Scaling up: raise the capacity and let AWS provision the instances.
        log.info(
            "Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run,
                                        region=resource['region'])
        return
    # delta < 0: scaling down. Collect killable slaves in this pool and check
    # that their combined instance weight covers the requested decrease.
    mesos_state = get_mesos_master().state_summary()
    slaves_list = get_mesos_task_count_by_slave(mesos_state,
                                                pool=resource['pool'])
    filtered_slaves = filter_sfr_slaves(slaves_list, resource)
    killable_capacity = sum(
        slave['instance_weight'] for slave in filtered_slaves)
    if -delta > killable_capacity:
        log.error(
            "Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!"
        )
        return
    downscale_spot_fleet_request(resource=resource,
                                 filtered_slaves=filtered_slaves,
                                 current_capacity=current_capacity,
                                 target_capacity=target_capacity,
                                 pool_settings=pool_settings,
                                 dry_run=dry_run)
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until reaching target_capacity.

    Always kills at least one slave even if doing so undershoots the target,
    so the autoscaler cannot get stuck while scaling down gradually.

    :param resource: SFR resource dict (passed through to gracefully_terminate_slave)
    :param filtered_slaves: candidate slaves considered for termination
    :param current_capacity: current SFR capacity
    :param target_capacity: capacity we are scaling down towards
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: don't actually drain or terminate anything if True
    """
    killed_slaves = 0
    while True:
        # Re-sort every pass: task counts for the survivors are refreshed at
        # the bottom of the loop after each kill.
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            log.info("ALL slaves killed so moving on to next pool!")
            break
        log.info("SFR slave kill preference: {0}".format([slave['hostname'] for slave in filtered_sorted_slaves]))
        # reverse + pop() removes the most preferred victim from the list
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            # If nothing was killed yet, fall through and kill this slave
            # anyway rather than stalling the scale-down.
            if killed_slaves == 0:
                log.info("This is a SFR so we must kill at least one slave to prevent the autoscaler "
                         "getting stuck whilst scaling down gradually")
            else:
                break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        # Refresh task counts for the remaining candidates before the next pass.
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            filtered_slaves = get_mesos_task_count_by_slave(mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            filtered_slaves = filtered_sorted_slaves
def downscale_spot_fleet_request(resource, filtered_slaves, current_capacity, target_capacity, pool_settings, dry_run):
    """Terminate spot fleet slaves one at a time until reaching target_capacity.

    For an SFR in the 'cancelled_running' state, kills at least one slave even
    if doing so undershoots the target, so the cancelled fleet doesn't linger.

    :param resource: SFR resource dict (passed through to gracefully_terminate_slave)
    :param filtered_slaves: candidate slaves considered for termination
    :param current_capacity: current SFR capacity
    :param target_capacity: capacity we are scaling down towards
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: don't actually drain or terminate anything if True
    """
    killed_slaves = 0
    while True:
        # Re-sort every pass: task counts for the survivors are refreshed at
        # the bottom of the loop after each kill.
        filtered_sorted_slaves = sort_slaves_to_kill(filtered_slaves)
        if len(filtered_sorted_slaves) == 0:
            log.info("ALL slaves killed so moving on to next pool!")
            break
        log.info("SFR slave kill preference: {0}".format([slave['hostname'] for slave in filtered_sorted_slaves]))
        # reverse + pop() removes the most preferred victim from the list
        filtered_sorted_slaves.reverse()
        slave_to_kill = filtered_sorted_slaves.pop()
        instance_capacity = slave_to_kill['instance_weight']
        new_capacity = current_capacity - instance_capacity
        if new_capacity < target_capacity:
            log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                     " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                 slave_to_kill['instance_weight'],
                                                                 target_capacity))
            # Cancelled-but-running SFRs must shed at least one slave: fall
            # through and kill this one anyway rather than stalling.
            if resource['sfr']['SpotFleetRequestState'] == 'cancelled_running' and killed_slaves == 0:
                log.info("This is a cancelled SFR so we must kill at least one slave to prevent it lingering")
            else:
                break
        try:
            gracefully_terminate_slave(resource=resource,
                                       slave_to_kill=slave_to_kill,
                                       pool_settings=pool_settings,
                                       current_capacity=current_capacity,
                                       new_capacity=new_capacity,
                                       dry_run=dry_run)
            killed_slaves += 1
        except HTTPError:
            # Something wrong draining host so try next host
            continue
        except FailSetSpotCapacity:
            break
        current_capacity = new_capacity
        # Refresh task counts for the remaining candidates before the next pass.
        mesos_state = get_mesos_master().state_summary()
        if filtered_sorted_slaves:
            filtered_slaves = get_mesos_task_count_by_slave(mesos_state, slaves_list=filtered_sorted_slaves)
        else:
            filtered_slaves = filtered_sorted_slaves
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached
    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param pool_settings: pool settings dict with timeout settings
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    # Coerce to int so the delta arithmetic compares like types; the sibling
    # version of this function in this file does the same (presumably the
    # reported SFR capacity can arrive as a non-int — TODO confirm upstream).
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        # Scaling up: raise the capacity and let AWS provision the instances.
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
        return
    elif delta < 0:
        # Scaling down: collect killable slaves in this pool and check their
        # combined instance weight covers the requested decrease.
        mesos_state = get_mesos_master().state_summary()
        slaves_list = get_mesos_task_count_by_slave(mesos_state, pool=resource['pool'])
        filtered_slaves = filter_sfr_slaves(slaves_list, resource)
        killable_capacity = sum([slave['instance_weight'] for slave in filtered_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        downscale_spot_fleet_request(resource=resource,
                                     filtered_slaves=filtered_slaves,
                                     current_capacity=current_capacity,
                                     target_capacity=target_capacity,
                                     pool_settings=pool_settings,
                                     dry_run=dry_run)
def test_get_mesos_task_count_by_slave():
    """Exercises get_mesos_task_count_by_slave: per-slave total and chronos
    task counts filtered by pool, the slaves_list override, and graceful
    handling of SlaveDoesNotExist from a task's slave lookup."""
    with mock.patch('paasta_tools.mesos_tools.get_all_running_tasks', autospec=True) as mock_get_all_running_tasks:
        # Two frameworks so chronos_count can differ from the total count.
        mock_chronos = mock.Mock()
        mock_chronos.name = 'chronos'
        mock_marathon = mock.Mock()
        mock_marathon.name = 'marathon'
        # slave1 runs one chronos + one marathon task; slave2 runs two marathon tasks.
        mock_task1 = mock.Mock()
        mock_task1.slave = {'id': 'slave1'}
        mock_task1.framework = mock_chronos
        mock_task2 = mock.Mock()
        mock_task2.slave = {'id': 'slave1'}
        mock_task2.framework = mock_marathon
        mock_task3 = mock.Mock()
        mock_task3.slave = {'id': 'slave2'}
        mock_task3.framework = mock_marathon
        mock_task4 = mock.Mock()
        mock_task4.slave = {'id': 'slave2'}
        mock_task4.framework = mock_marathon
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_all_running_tasks.return_value = mock_tasks
        mock_slave_1 = {
            'id': 'slave1',
            'attributes': {
                'pool': 'default'
            },
            'hostname': 'host1'
        }
        mock_slave_2 = {
            'id': 'slave2',
            'attributes': {
                'pool': 'default'
            },
            'hostname': 'host2'
        }
        mock_slave_3 = {
            'id': 'slave3',
            'attributes': {
                'pool': 'another'
            },
            'hostname': 'host3'
        }
        mock_mesos_state = {
            'slaves': [mock_slave_1, mock_slave_2, mock_slave_3]
        }
        # Pool filter: only the 'default' pool slaves appear in the result.
        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool='default')
        assert mock_get_all_running_tasks.called
        expected = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)
            },
        ]
        assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts(expected)
        # pool=None disables pool filtering, so slave3 appears with zero tasks.
        ret = mesos_tools.get_mesos_task_count_by_slave(mock_mesos_state, pool=None)
        assert mock_get_all_running_tasks.called
        expected = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=1, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)
            },
        ]
        assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts(expected)
        # test slaves_list override
        # Move one of slave1's tasks onto slave2, then pass an explicit
        # slaves_list instead of relying on the pool filter.
        mock_task2 = mock.Mock()
        mock_task2.slave = {'id': 'slave2'}
        mock_task2.framework = mock_marathon
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_all_running_tasks.return_value = mock_tasks
        mock_slaves_list = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_2)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)
            },
        ]
        ret = mesos_tools.get_mesos_task_count_by_slave(
            mock_mesos_state,
            slaves_list=mock_slaves_list,
        )
        expected = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=1, chronos_count=1, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=3, chronos_count=0, slave=mock_slave_2)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)
            },
        ]
        assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts(expected)
        # test SlaveDoesNotExist exception handling
        mock_task2.__getitem__ = mock.Mock(side_effect="fakeid")
        mock_task2.slave = mock.Mock()
        mock_task2.slave.__getitem__ = mock.Mock()
        mock_task2.slave.__getitem__.side_effect = mesos.exceptions.SlaveDoesNotExist
        # we expect to handle this SlaveDoesNotExist exception gracefully, and continue on to handle other tasks
        mock_tasks = [mock_task1, mock_task2, mock_task3, mock_task4]
        mock_get_all_running_tasks.return_value = mock_tasks
        mock_slaves_list = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_2)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)
            },
        ]
        ret = mesos_tools.get_mesos_task_count_by_slave(
            mock_mesos_state,
            slaves_list=mock_slaves_list,
        )
        # we expect mock_slave_2 to only count 2 tasks, as one of them returned a SlaveDoesNotExist exception
        expected = [
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=1, chronos_count=1, slave=mock_slave_1)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=2, chronos_count=0, slave=mock_slave_2)
            },
            {
                'task_counts': mesos_tools.SlaveTaskCount(count=0, chronos_count=0, slave=mock_slave_3)
            },
        ]
        assert len(ret) == len(expected) and utils.sort_dicts(ret) == utils.sort_dicts(expected)