Example #1
0
def test_sort_by_fitness_calls_all_sorting_funcs():
    with mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_system_instance_health",
        autospec=True,
    ) as mock_sort_by_system_instance_health, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_upcoming_events", autospec=True
    ) as mock_sort_by_upcoming_events, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_running_batch_count",
        autospec=True,
    ) as mock_sort_by_running_batch_count, mock.patch(
        "paasta_tools.autoscaling.ec2_fitness.sort_by_total_tasks", autospec=True
    ) as mock_sort_by_total_tasks:
        instances = []
        ec2_fitness.sort_by_ec2_fitness(instances)
        assert mock_sort_by_total_tasks.called
        assert mock_sort_by_running_batch_count.called
        assert mock_sort_by_upcoming_events.called
        assert mock_sort_by_system_instance_health.called
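A minimal sketch of how sort_by_ec2_fitness could chain the four sorters the test above
patches (the actual paasta_tools implementation may differ; this only illustrates the idea of
chaining stable sorts from the least to the most important criterion, so the last sorter applied
dominates the final order):

def sort_by_ec2_fitness(instances):
    # assumed: each sorter takes a list of instances and returns a stably sorted copy
    instances = sort_by_total_tasks(instances)            # least important criterion first
    instances = sort_by_running_batch_count(instances)
    instances = sort_by_upcoming_events(instances)
    instances = sort_by_system_instance_health(instances)  # most important criterion last
    return instances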
Example #2
0
def test_sort_by_fitness():
    mock_slave_1 = Mock(name="slave1")
    mock_slave_1.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_1.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "impaired"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_2 = Mock(name="slave2")
    mock_slave_2.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_2.instance_status = {
        "Events": [
            {
                "Code": "instance-reboot",
                "Description": "foo",
                "NotBefore": datetime(2015, 1, 1),
                "NotAfter": datetime(2015, 1, 1),
            }
        ],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_3 = Mock(name="slave3")
    mock_slave_3.task_counts = SlaveTaskCount(count=2, slave=Mock(), batch_count=3)
    mock_slave_3.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_4 = Mock(name="slave4")
    mock_slave_4.task_counts = SlaveTaskCount(count=3, slave=Mock(), batch_count=1)
    mock_slave_4.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    mock_slave_5 = Mock(name="slave5")
    mock_slave_5.task_counts = SlaveTaskCount(count=1, slave=Mock(), batch_count=1)
    mock_slave_5.instance_status = {
        "Events": [],
        "SystemStatus": {"Status": "ok"},
        "InstanceStatus": {"Status": "ok"},
    }
    ret = ec2_fitness.sort_by_ec2_fitness(
        [mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5]
    )

    # we expect this order for the following reasons:
    # mock_slave_1 is impaired and so should be killed asap
    # mock_slave_2 has an upcoming event
    # mock_slave_5 and mock_slave_4 have the fewest batch tasks, and so should be killed before
    # mock_slave_3 (we can't drain batch tasks, so try to save them)
    # mock_slave_5 has fewer tasks than mock_slave_4, and so is a better candidate for killing
    assert ret == [mock_slave_3, mock_slave_4, mock_slave_5, mock_slave_2, mock_slave_1]
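# Hedged illustration (not the actual paasta_tools implementation, which composes separate
# sorters as in Example #1): the ordering asserted above can be reproduced with a single
# composite sort key in which a healthy status, no scheduled events, a high batch count and a
# high task count all make a slave "fitter", i.e. sort earlier and get killed later.
def _example_fitness_key(slave):
    impaired = slave.instance_status["SystemStatus"]["Status"] != "ok"
    has_events = len(slave.instance_status["Events"]) > 0
    # False sorts before True, so healthy, event-free slaves come first;
    # counts are negated so larger batch/task counts also sort first.
    return (
        impaired,
        has_events,
        -slave.task_counts.batch_count,
        -slave.task_counts.count,
    )

# sorted(slaves, key=_example_fitness_key) yields
# [mock_slave_3, mock_slave_4, mock_slave_5, mock_slave_2, mock_slave_1]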
    def downscale_aws_resource(self, filtered_slaves, current_capacity,
                               target_capacity):
        killed_slaves = 0
        while True:
            filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(
                filtered_slaves)[::-1]
            if len(filtered_sorted_slaves) == 0:
                self.log.info(
                    "ALL slaves killed so moving on to next resource!")
                break
            self.log.info("Resource slave kill preference: {}".format(
                [slave.hostname for slave in filtered_sorted_slaves]))
            slave_to_kill = filtered_sorted_slaves.pop(0)
            instance_capacity = slave_to_kill.instance_weight
            new_capacity = current_capacity - instance_capacity
            if new_capacity < target_capacity:
                self.log.info(
                    "Terminating instance {} with weight {} would take us below our target of {},"
                    " so this is as close to our target as we can get".format(
                        slave_to_kill.instance_id,
                        slave_to_kill.instance_weight, target_capacity))
                if self.resource['type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                    self.log.info(
                        "This is a SFR so we must kill at least one slave to prevent the autoscaler "
                        "getting stuck whilst scaling down gradually")
                else:
                    break
            try:
                self.gracefully_terminate_slave(
                    slave_to_kill=slave_to_kill,
                    current_capacity=current_capacity,
                    new_capacity=new_capacity)
                killed_slaves += 1
            except HTTPError:
                # Something went wrong draining the host, so try the next one
                continue
            except FailSetResourceCapacity:
                break

            current_capacity = new_capacity
            mesos_state = get_mesos_master().state_summary()
            if filtered_sorted_slaves:
                task_counts = get_mesos_task_count_by_slave(
                    mesos_state,
                    slaves_list=[{
                        'task_counts': slave.task_counts
                    } for slave in filtered_sorted_slaves])
                for i, slave in enumerate(filtered_sorted_slaves):
                    slave.task_counts = task_counts[i]['task_counts']
            filtered_slaves = filtered_sorted_slaves
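A tiny illustration of how the loop above turns the fitness ordering into a kill order (slave
names are stand-ins for the mocks in the test): sort_by_ec2_fitness returns the fittest slaves
first, so reversing the list and popping from the front terminates the least fit slave first.

fitness_order = ["slave3", "slave4", "slave5", "slave2", "slave1"]  # fittest first
kill_order = fitness_order[::-1]  # least fit first
assert kill_order.pop(0) == "slave1"  # the impaired slave is terminated first
assert kill_order.pop(0) == "slave2"  # then the slave with an upcoming event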
Example #4
0
def test_sort_by_fitness():
    mock_slave_1 = Mock(name='slave1')
    mock_slave_1.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_1.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'impaired', },
        'InstanceStatus': {'Status': 'ok', },
    }
    mock_slave_2 = Mock(name='slave2')
    mock_slave_2.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_2.instance_status = {
        'Events': [
            {
                'Code': 'instance-reboot',
                'Description': 'foo',
                'NotBefore': datetime(2015, 1, 1),
                'NotAfter': datetime(2015, 1, 1),
            },
        ],
        'SystemStatus': {'Status': 'ok', },
        'InstanceStatus': {'Status': 'ok', },
    }
    mock_slave_3 = Mock(name='slave3')
    mock_slave_3.task_counts = SlaveTaskCount(
        count=2,
        slave=Mock(),
        batch_count=3,
    )
    mock_slave_3.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok', },
        'InstanceStatus': {'Status': 'ok', },
    }
    mock_slave_4 = Mock(name='slave4')
    mock_slave_4.task_counts = SlaveTaskCount(
        count=3,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_4.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok', },
        'InstanceStatus': {'Status': 'ok', },
    }
    mock_slave_5 = Mock(name='slave5')
    mock_slave_5.task_counts = SlaveTaskCount(
        count=1,
        slave=Mock(),
        batch_count=1,
    )
    mock_slave_5.instance_status = {
        'Events': [],
        'SystemStatus': {'Status': 'ok', },
        'InstanceStatus': {'Status': 'ok', },
    }
    ret = ec2_fitness.sort_by_ec2_fitness([mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5])

    # we expect this order for the following reasons:
    # mock_slave_1 is impaired and so should be killed asap
    # mock_slave_2 has an upcoming event
    # mock_slave_5 and mock_slave_4 have the fewest chronos tasks, and so should be killed before
    # mock_slave_3 (we can't drain chronos tasks, so try to save them)
    # mock_slave_5 has fewer tasks than mock_slave_4, and so is a better candidate for killing
    assert ret == [mock_slave_3, mock_slave_4, mock_slave_5, mock_slave_2, mock_slave_1]
Example #5
0
def test_sort_by_fitness():
    mock_slave_1 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'impaired', },
            'InstanceStatus': {'Status': 'ok', }
        },
    )
    mock_slave_2 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [
                {
                    'Code': 'instance-reboot',
                    'Description': 'foo',
                    'NotBefore': datetime(2015, 1, 1),
                    'NotAfter': datetime(2015, 1, 1)
                },
            ],
            'SystemStatus': {'Status': 'ok', },
            'InstanceStatus': {'Status': 'ok', }
        },
    )
    mock_slave_3 = Mock(
        task_counts=SlaveTaskCount(
            count=2,
            slave=Mock(),
            chronos_count=3,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok', },
            'InstanceStatus': {'Status': 'ok', }
        },
    )
    mock_slave_4 = Mock(
        task_counts=SlaveTaskCount(
            count=3,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok', },
            'InstanceStatus': {'Status': 'ok', }
        },
    )
    mock_slave_5 = Mock(
        task_counts=SlaveTaskCount(
            count=1,
            slave=Mock(),
            chronos_count=1,
        ),
        instance_status={
            'Events': [],
            'SystemStatus': {'Status': 'ok', },
            'InstanceStatus': {'Status': 'ok', }
        },
    )
    ret = ec2_fitness.sort_by_ec2_fitness([mock_slave_1, mock_slave_2, mock_slave_3, mock_slave_4, mock_slave_5])
    assert ret == [mock_slave_5, mock_slave_4, mock_slave_3, mock_slave_2, mock_slave_1]
    async def downscale_aws_resource(self, filtered_slaves, current_capacity,
                                     target_capacity):
        self.log.info("downscale_aws_resource for %s" % filtered_slaves)
        killed_slaves = 0
        terminate_tasks = {}
        self.capacity = current_capacity
        timer = Timer(300)
        while True:
            filtered_sorted_slaves = ec2_fitness.sort_by_ec2_fitness(
                filtered_slaves)[::-1]
            if len(filtered_sorted_slaves) == 0:
                self.log.info(
                    "ALL slaves killed so moving on to next resource!")
                break
            self.log.info("Resource slave kill preference: {}".format(
                [slave.hostname for slave in filtered_sorted_slaves]))
            slave_to_kill = filtered_sorted_slaves.pop(0)
            instance_capacity = slave_to_kill.instance_weight
            new_capacity = current_capacity - instance_capacity
            if new_capacity < target_capacity:
                self.log.info(
                    "Terminating instance {} with weight {} would take us below our target of {},"
                    " so this is as close to our target as we can get".format(
                        slave_to_kill.instance_id,
                        slave_to_kill.instance_weight,
                        target_capacity,
                    ))
                if self.resource['type'] == 'aws_spot_fleet_request' and killed_slaves == 0:
                    self.log.info(
                        "This is a SFR so we must kill at least one slave to prevent the autoscaler "
                        "getting stuck whilst scaling down gradually",
                    )
                    if new_capacity < 1 and self.sfr['SpotFleetRequestState'] == 'active':
                        self.log.info(
                            "Can't set target capacity to less than 1 for SFRs. No further "
                            "action for this SFR",
                        )
                        break
                else:
                    break

            capacity_diff = new_capacity - current_capacity
            self.log.info("Starting async kill for %s" %
                          slave_to_kill.hostname)
            # ensure_future schedules gracefully_terminate_slave as a task on the event loop;
            # it runs concurrently whenever this coroutine yields (e.g. on a sleep), so this
            # loop can continue and we start killing the next slave
            terminate_tasks[slave_to_kill.hostname] = asyncio.ensure_future(
                self.gracefully_terminate_slave(
                    slave_to_kill=slave_to_kill,
                    capacity_diff=capacity_diff,
                    timer=timer,
                ), )
            killed_slaves += 1

            current_capacity = new_capacity
            filtered_slaves = filtered_sorted_slaves

        # Now we wait for each task to actually finish...
        for hostname, task in terminate_tasks.items():
            try:
                await task
            except (HTTPError, FailSetResourceCapacity):
                continue
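A standalone sketch (hypothetical names, not paasta_tools code) of the ensure_future pattern
used above: each termination coroutine is wrapped in a task that runs whenever the scheduling
coroutine yields to the event loop, so terminations proceed concurrently while the loop keeps
picking the next slave, and the results are awaited afterwards.

import asyncio

async def terminate(hostname):
    await asyncio.sleep(0.1)  # stands in for draining and terminating the instance
    return hostname

async def downscale():
    tasks = {}
    for hostname in ["host1", "host2", "host3"]:
        # schedule the coroutine as a task; it starts running once we yield to the loop below
        tasks[hostname] = asyncio.ensure_future(terminate(hostname))
    # now wait for each termination to actually finish
    for hostname, task in tasks.items():
        print(hostname, await task)

asyncio.run(downscale())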