def test_drain(mock_build_maintenance_schedule_payload, mock_get_schedule_client):
    fake_schedule = {"fake_schedule": "fake_value"}
    mock_build_maintenance_schedule_payload.return_value = fake_schedule
    drain(hostnames=["some-host"], start="some-start", duration="some-duration")
    assert mock_build_maintenance_schedule_payload.call_count == 1
    expected_args = mock.call(["some-host"], "some-start", "some-duration", drain=True)
    assert mock_build_maintenance_schedule_payload.call_args == expected_args
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    expected_args = mock.call(method="POST", endpoint="", data=json.dumps(fake_schedule))
    assert mock_get_schedule_client.return_value.call_args == expected_args
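The patch decorators that supply the two mock arguments above are not shown in the example. A minimal sketch of what they might look like, assuming both helpers live in paasta_tools.paasta_maintenance (the module referenced in Example #4; the exact patch targets are an assumption here):

@mock.patch('paasta_tools.paasta_maintenance.get_schedule_client', autospec=True)
@mock.patch('paasta_tools.paasta_maintenance.build_maintenance_schedule_payload', autospec=True)
def test_drain(mock_build_maintenance_schedule_payload, mock_get_schedule_client):
    # mock.patch decorators are applied bottom-up, so the innermost patch
    # (build_maintenance_schedule_payload) is passed as the first argument.
    ...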
Example #2
def scale_aws_spot_fleet_request(resource, delta, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param delta: integer change in number of servers
    :param target_capacity: target number of instances
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    sfr_id = resource['id']
    ec2_client = boto3.client('ec2')
    if delta == 0:
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
            return
    elif delta < 0:
        number_to_kill = delta * -1
        sfr_ips = get_sfr_instance_ips(sfr_id)
        log.debug("IPs in SFR: {0}".format(sfr_ips))
        sfr_sorted_slaves = [slave for slave in sorted_slaves if slave_pid_to_ip(slave['pid']) in sfr_ips]
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        if number_to_kill > len(sfr_sorted_slaves):
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        slaves_to_kill = sfr_sorted_slaves[0:number_to_kill]
        log.info("Set to kill: {0}".format([slave['pid'] for slave in slaves_to_kill]))
        instances_to_kill = {}
        for slave in slaves_to_kill:
            ip = slave_pid_to_ip(slave['pid'])
            instances_to_kill[slave['pid']] = {'ip': ip,
                                               'instance_id': get_instance_id_from_ip(ip)}
        # The start time of the maintenance window is the point at which
        # we give up waiting for the instances to drain and mark them for termination anyway
        start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
        # Set the duration to 600 seconds (10 minutes); if we haven't cleaned up and
        # terminated by then, mesos should put the slave back into the pool
        duration = 600
        log.info("Draining {0}".format(instances_to_kill))
        log.info("Decreasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            # sort (by IP) so the drain call is deterministic and easier to test;
            # plain dicts aren't orderable, so give sorted() an explicit key
            drain([instance['ip'] for instance in sorted(instances_to_kill.values(), key=lambda i: i['ip'])], start, duration)
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
        log.info("Waiting for instances to drain before we terminate")
        wait_and_terminate(instances_to_kill, dry_run)
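slave_pid_to_ip is used above to match each Mesos slave's pid against the instance IPs returned by get_sfr_instance_ips. A Mesos slave pid looks like "slave(1)@10.40.5.6:5051", so a minimal sketch of the helper (a hypothetical reconstruction, not necessarily the paasta_tools implementation) could be:

def slave_pid_to_ip(pid):
    # e.g. "slave(1)@10.40.5.6:5051" -> "10.40.5.6"
    return pid.split('@')[-1].split(':')[0]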
def test_drain(
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    fake_schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = fake_schedule
    drain(hostnames=['some-host'], start='some-start', duration='some-duration')
    assert mock_build_maintenance_schedule_payload.call_count == 1
    expected_args = mock.call(['some-host'], 'some-start', 'some-duration', drain=True)
    assert mock_build_maintenance_schedule_payload.call_args == expected_args
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    expected_args = mock.call(method="POST", endpoint="", data=json.dumps(fake_schedule))
    assert mock_get_schedule_client.return_value.call_args == expected_args
Example #4
def mark_host_at_risk(context, host):
    start = paasta_maintenance.datetime_to_nanoseconds(paasta_maintenance.now())
    duration = paasta_maintenance.parse_timedelta('1h')
    config = {
        'master': '%s' % get_service_connection_string('mesosmaster'),
        'scheme': 'http',
        'response_timeout': 5,
    }
    with mock.patch(
        'paasta_tools.paasta_maintenance.load_credentials', autospec=True,
    ) as mock_load_credentials, mock.patch.object(
        mesos.cli.master, 'CFG', config,
    ):
        mock_load_credentials.side_effect = paasta_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
        paasta_maintenance.drain([host], start, duration)
        context.at_risk_host = host
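The Mesos maintenance API takes times in nanoseconds (compare the explicit * 1000000000 conversions in Example #7), which is why the start and duration here go through datetime_to_nanoseconds and parse_timedelta. A rough sketch of what those two helpers might do, assuming parse_timedelta also returns nanoseconds (both bodies are hypothetical, not the paasta_tools implementations):

import calendar

def datetime_to_nanoseconds(dt):
    # hypothetical: UTC datetime -> Unix epoch time in nanoseconds
    return calendar.timegm(dt.timetuple()) * 1000000000

def parse_timedelta(value):
    # hypothetical: "1h" -> 3600 * 10**9 nanoseconds
    units = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    return int(value[:-1]) * units[value[-1]] * 1000000000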
def test_drain(
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    fake_schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = fake_schedule
    drain(hostnames=['some-host'],
          start='some-start',
          duration='some-duration')
    assert mock_build_maintenance_schedule_payload.call_count == 1
    expected_args = mock.call(['some-host'],
                              'some-start',
                              'some-duration',
                              drain=True)
    assert mock_build_maintenance_schedule_payload.call_args == expected_args
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    expected_args = mock.call(method="POST",
                              endpoint="",
                              data=json.dumps(fake_schedule))
    assert mock_get_schedule_client.return_value.call_args == expected_args
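Pieced together from the assertions in the test_drain examples, a minimal sketch of what drain itself presumably does (reconstructed from the test expectations, not copied from the paasta_tools source):

def drain(hostnames, start, duration):
    """Schedule a maintenance window, with draining, for the given hosts."""
    payload = build_maintenance_schedule_payload(hostnames, start, duration, drain=True)
    client_fn = get_schedule_client()
    return client_fn(method="POST", endpoint="", data=json.dumps(payload))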
Example #6
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run)
        return
    elif delta < 0:
        sfr_sorted_slaves = filter_sfr_slaves(sorted_slaves, resource['sfr'])
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        killable_capacity = sum([slave['instance_weight'] for slave in sfr_sorted_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        sfr_sorted_slaves.reverse()
        while True:
            if len(sfr_sorted_slaves) == 0:
                break
            slave_to_kill = sfr_sorted_slaves.pop()
            instance_capacity = int(slave_to_kill['instance_weight'])
            new_capacity = current_capacity - instance_capacity
            if new_capacity < target_capacity:
                log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                         " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                     slave_to_kill['instance_weight'],
                                                                     target_capacity))
                break
            # The start time of the maintenance window is the point at which
            # we give up waiting for the instance to drain and mark it for termination anyway
            start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
            # Set the duration to 600 seconds (10 minutes); if we haven't cleaned up and
            # terminated by then, mesos should put the slave back into the pool
            duration = 600
            log.info("Draining {0}".format(slave_to_kill['pid']))
            if not dry_run:
                try:
                    drain([slave_to_kill['ip']], start, duration)
                except HTTPError as e:
                    log.error("Failed to trigger drain on {0}: {1}\n Trying next host".format(slave_to_kill['ip'], e))
                    continue
            log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
            if not set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run):
                log.error("Couldn't update spot fleet, stopping autoscaler")
                break
            log.info("Waiting for instance to drain before we terminate")
            try:
                wait_and_terminate(slave_to_kill, dry_run)
            except ClientError as e:
                log.error("Failure when terminating: {0}: {1}".format(slave['pid'], e))
                log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
                if not set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run):
                    log.error("Couldn't update spot fleet, stopping autoscaler")
                    break
                continue
            current_capacity = new_capacity
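Examples #6 and #7 delegate the AWS call to a set_spot_fleet_request_capacity helper and treat a falsy return value as failure. A plausible sketch, modeled on the direct ec2_client.modify_spot_fleet_request call in Example #2 and reusing the module-level log seen above (the helper body itself is an assumption):

import boto3
from botocore.exceptions import ClientError

def set_spot_fleet_request_capacity(sfr_id, capacity, dry_run, region=None):
    """Ask AWS for a new spot fleet target capacity; return True on success."""
    if dry_run:
        return True
    ec2_client = boto3.client('ec2', region_name=region)
    try:
        ec2_client.modify_spot_fleet_request(
            SpotFleetRequestId=sfr_id,
            TargetCapacity=capacity,
            ExcessCapacityTerminationPolicy='noTermination',
        )
    except ClientError as e:
        log.error("Failed to update spot fleet {0}: {1}".format(sfr_id, e))
        return False
    return True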
Example #7
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, sorted_slaves, pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity
    If scaling up we just set target capacity and let AWS take care of the rest
    If scaling down we pick the slaves we'd prefer to kill, put them in maintenance
    mode and drain them (via paasta_maintenance and setup_marathon_jobs). We then kill
    them once they are running 0 tasks or once a timeout is reached

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param sorted_slaves: list of slaves by order to kill
    :param pool_settings: dict of pool-level settings (e.g. drain_timeout)
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
        return
    elif delta < 0:
        sfr_sorted_slaves = filter_sfr_slaves(sorted_slaves, resource)
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        killable_capacity = sum([slave['instance_weight'] for slave in sfr_sorted_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        sfr_sorted_slaves.reverse()
        while True:
            if len(sfr_sorted_slaves) == 0:
                break
            slave_to_kill = sfr_sorted_slaves.pop()
            # Instance weights can be floats but the target has to be an integer
            # because AWS...
            instance_capacity = slave_to_kill['instance_weight']
            new_capacity = int(round(current_capacity - instance_capacity))
            if new_capacity < target_capacity:
                log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                         " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                     slave_to_kill['instance_weight'],
                                                                     target_capacity))
                break
            drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
            # The start time of the maintenance window is the point at which
            # we give up waiting for the instance to drain and mark it for termination anyway
            start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
            # Set the duration to 600 seconds; this is fairly arbitrary as mesos doesn't actually
            # do anything at the end of the maintenance window.
            duration = 600 * 1000000000  # nanoseconds
            log.info("Draining {0}".format(slave_to_kill['pid']))
            if not dry_run:
                try:
                    drain_host_string = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
                    drain([drain_host_string], start, duration)
                except HTTPError as e:
                    log.error("Failed to start drain "
                              "on {0}: {1}\n Trying next host".format(slave_to_kill['hostname'], e))
                    continue
            log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
            if not set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run, region=resource['region']):
                log.error("Couldn't update spot fleet, stopping autoscaler")
                log.info("Undraining {0}".format(slave_to_kill['pid']))
                if not dry_run:
                    undrain([drain_host_string])
                break
            log.info("Waiting for instance to drain before we terminate")
            try:
                wait_and_terminate(slave_to_kill, drain_timeout, dry_run, region=resource['region'])
            except ClientError as e:
                log.error("Failure when terminating: {0}: {1}".format(slave['pid'], e))
                log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
                if not set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run, region=resource['region']):
                    log.error("Couldn't update spot fleet, stopping autoscaler")
                    break
                continue
            finally:
                log.info("Undraining {0}".format(slave_to_kill['pid']))
                if not dry_run:
                    undrain([drain_host_string])
            current_capacity = new_capacity
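For completeness, a rough sketch of the wait_and_terminate step called in Example #7, assuming the same module-level boto3, time, and log objects as above. The polling helper get_running_task_count is hypothetical, as is the whole body; only the signature and the idea of terminating the drained instance via EC2 are taken from the examples:

def wait_and_terminate(slave, drain_timeout, dry_run, region=None):
    """Wait for the slave to drain (or for the timeout to pass), then terminate it."""
    ec2_client = boto3.client('ec2', region_name=region)
    deadline = time.time() + drain_timeout
    while time.time() < deadline:
        if get_running_task_count(slave['hostname']) == 0:  # hypothetical helper
            break
        time.sleep(60)
    log.info("Terminating instance {0}".format(slave['instance_id']))
    if not dry_run:
        ec2_client.terminate_instances(InstanceIds=[slave['instance_id']])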