Ejemplo n.º 1
0
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """Run drain() against mocked collaborators and check every call it makes.

    Also checks that drain() still reaches the schedule client when
    reserving resources raises HTTPError.
    """
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    drain(
        hostnames=['some-host'],
        start='some-start',
        duration='some-duration',
    )

    # The payload builder is called once, in drain mode.
    payload_call = mock.call(
        ['some-host'],
        'some-start',
        'some-duration',
        drain=True,
    )
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == payload_call

    # Resources are reserved once, for the same host list.
    reserve_call = mock.call(['some-host'])
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == reserve_call

    # The schedule client is fetched once and invoked once with the
    # JSON-encoded payload.
    client_call = mock.call(
        method="POST",
        endpoint="",
        data=json.dumps(schedule_payload),
    )
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    assert mock_get_schedule_client.return_value.call_args == client_call

    # A failing reservation must not stop drain() from fetching the client
    # a second time.
    mock_reserve_all_resources.side_effect = HTTPError()
    drain(
        hostnames=['some-host'],
        start='some-start',
        duration='some-duration',
    )
    assert mock_get_schedule_client.call_count == 2
Ejemplo n.º 2
0
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_operator_api,
):
    """Run drain() against mocked collaborators and check every call it makes.

    Also checks that drain() still reaches the operator API when reserving
    resources raises HTTPError.
    """
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    drain(
        hostnames=['some-host'],
        start='some-start',
        duration='some-duration',
    )

    # The payload builder is called once, in drain mode.
    payload_call = mock.call(
        ['some-host'],
        'some-start',
        'some-duration',
        drain=True,
    )
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == payload_call

    # Resources are reserved once, for the same host list.
    reserve_call = mock.call(['some-host'])
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == reserve_call

    # The operator API client is built once and invoked once with the
    # un-encoded payload.
    api_call = mock.call(data=schedule_payload)
    assert mock_operator_api.call_count == 1
    assert mock_operator_api.return_value.call_count == 1
    assert mock_operator_api.return_value.call_args == api_call

    # A failing reservation must not stop drain() from building the client
    # a second time.
    mock_reserve_all_resources.side_effect = HTTPError()
    drain(
        hostnames=['some-host'],
        start='some-start',
        duration='some-duration',
    )
    assert mock_operator_api.call_count == 2
Ejemplo n.º 3
0
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings,
                               current_capacity, new_capacity, dry_run):
    """Drain a slave, shrink its spot fleet request, then terminate it.

    Steps, in order: open a maintenance (drain) window for the host, set the
    spot fleet request's target capacity to ``new_capacity``, then wait for
    the drain and terminate the instance.  The host is undrained afterwards,
    and also when lowering the capacity fails.  When ``dry_run`` is true the
    drain and undrain calls are skipped; the flag is also forwarded to the
    capacity and termination helpers.

    :param resource: mapping providing the spot fleet request 'id' and 'region'
    :param slave_to_kill: mapping providing the slave's 'pid', 'hostname', 'ip'
    :param pool_settings: mapping that may carry a 'drain_timeout' override
    :param current_capacity: capacity to restore if termination fails
    :param new_capacity: target capacity; floored to an int before use
    :param dry_run: skip drain/undrain and forward the flag to the helpers
    :raises HTTPError: re-raised when starting the drain fails
    :raises FailSetSpotCapacity: re-raised when the capacity cannot be lowered
    """
    nanos_per_second = 1000000000
    fleet_request_id = resource['id']
    timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The maintenance window opens at the moment we stop waiting for the
    # instance to drain and mark it for termination regardless.
    window_start = int(time.time() + timeout) * nanos_per_second
    # 600 seconds.  The length is fairly arbitrary: mesos doesn't actually do
    # anything when the maintenance window ends.
    window_length = 600 * nanos_per_second

    def undrain_slave():
        # Always log, but only touch the maintenance schedule on real runs
        # (host_spec is only bound when the drain was actually attempted).
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([host_spec])

    log.info("Draining {0}".format(slave_to_kill['pid']))
    if not dry_run:
        try:
            host_spec = "{0}|{1}".format(slave_to_kill['hostname'],
                                         slave_to_kill['ip'])
            drain([host_spec], window_start, window_length)
        except HTTPError as err:
            failure_message = (
                "Failed to start drain "
                "on {0}: {1}\n Trying next host"
            ).format(slave_to_kill['hostname'], err)
            log.error(failure_message)
            raise

    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(
        current_capacity, new_capacity))
    # Instance weights may be floats, but AWS only accepts an integer for the
    # target capacity in its API call.
    target_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(
            fleet_request_id,
            target_capacity,
            dry_run,
            region=resource['region'],
        )
    except FailSetSpotCapacity:
        log.error("Couldn't update spot fleet, stopping autoscaler")
        undrain_slave()
        raise

    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(
            slave_to_kill,
            timeout,
            dry_run,
            region=resource['region'],
        )
    except ClientError as err:
        # Termination failed: log it and put the fleet back to its old size.
        log.error("Failure when terminating: {0}: {1}".format(
            slave_to_kill['pid'], err))
        log.error(
            "Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(
            fleet_request_id,
            current_capacity,
            dry_run,
            region=resource['region'],
        )
    finally:
        undrain_slave()
Ejemplo n.º 4
0
def paasta_maintenance():
    """Run the maintenance action selected on the command line.

    Parses the CLI arguments, configures logging from the verbosity count,
    dispatches on ``args.action`` and prints the outcome.

    :returns: ``None`` when hostnames are required but were not given;
        otherwise the value that was printed -- the string "Done" for
        drain/undrain/down/up, or whatever the selected status/check call
        returned.
    :raises NotImplementedError: if the action is not one handled here.
    """
    args = parse_args()

    # Map the -v count onto a logging level: 0 -> WARNING, 1 -> INFO, 2+ -> DEBUG.
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    action = args.action
    hostnames = args.hostname

    # Every action except 'status' insists on at least one hostname.
    if not (action == 'status' or hostnames):
        paasta_print("You must specify one or more hostnames")
        return

    start = args.start
    duration = args.duration

    # Actions that take the whole host list and share their name with a
    # mesos_maintenance function.
    host_list_actions = ('undrain', 'down', 'up')
    # Checks that look only at the first hostname and share their name with
    # a mesos_maintenance function.
    first_host_checks = (
        'is_host_drained',
        'is_host_down',
        'is_host_draining',
        'is_host_past_maintenance_start',
        'is_host_past_maintenance_end',
    )

    ret = "Done"
    if action == 'drain':
        mesos_maintenance.drain(hostnames, start, duration)
    elif action in host_list_actions:
        getattr(mesos_maintenance, action)(hostnames)
    elif action == 'status':
        ret = mesos_maintenance.friendly_status()
    elif action == 'cluster_status':
        ret = mesos_maintenance.status()
    elif action == 'schedule':
        ret = mesos_maintenance.schedule()
    elif action == 'is_safe_to_drain':
        ret = is_safe_to_drain(hostnames[0])
    elif action == 'is_safe_to_kill':
        ret = is_safe_to_kill(hostnames[0])
    elif action in first_host_checks:
        ret = getattr(mesos_maintenance, action)(hostnames[0])
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)

    paasta_print(ret)
    return ret
Ejemplo n.º 5
0
def paasta_maintenance():
    """Run the maintenance action selected on the command line.

    Parses the CLI arguments, configures logging from the verbosity count,
    dispatches on ``args.action`` and prints the outcome.

    :returns: ``None`` when hostnames are required but were not given;
        otherwise the value that was printed -- the string "Done" for
        drain/undrain/down/up, the %-formatted status or schedule, or
        whatever the selected check call returned.
    :raises NotImplementedError: if the action is not one handled here.
    """
    args = parse_args()

    # Map the -v count onto a logging level: 0 -> WARNING, 1 -> INFO, 2+ -> DEBUG.
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    action = args.action
    hostnames = args.hostname

    # Every action except 'status' insists on at least one hostname.
    if not (action == 'status' or hostnames):
        paasta_print("You must specify one or more hostnames")
        return

    start = args.start
    duration = args.duration

    # Actions that take the whole host list and share their name with a
    # mesos_maintenance function.
    host_list_actions = ('undrain', 'down', 'up')
    # Checks that look only at the first hostname and share their name with
    # a mesos_maintenance function.
    first_host_checks = (
        'is_host_drained',
        'is_host_down',
        'is_host_draining',
        'is_host_past_maintenance_start',
        'is_host_past_maintenance_end',
    )

    ret = "Done"
    if action == 'drain':
        mesos_maintenance.drain(hostnames, start, duration)
    elif action in host_list_actions:
        getattr(mesos_maintenance, action)(hostnames)
    elif action == 'status':
        ret = "%s" % mesos_maintenance.status()
    elif action == 'schedule':
        ret = "%s" % mesos_maintenance.schedule()
    elif action == 'is_safe_to_drain':
        ret = is_safe_to_drain(hostnames[0])
    elif action == 'is_safe_to_kill':
        ret = is_safe_to_kill(hostnames[0])
    elif action in first_host_checks:
        ret = getattr(mesos_maintenance, action)(hostnames[0])
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)

    paasta_print(ret)
    return ret
Ejemplo n.º 6
0
 def gracefully_terminate_slave(self, slave_to_kill, current_capacity,
                                new_capacity):
     """Drain a slave, lower the resource capacity, then terminate it.

     Steps, in order: open a maintenance (drain) window for the host when
     ``self.should_drain`` allows it, set the capacity to ``new_capacity``
     (floored to an int), then wait for the drain and terminate.  The host
     is undrained again if lowering the capacity fails or if termination
     raises ClientError; in the latter case the capacity is first put back
     to ``current_capacity`` and the error is not re-raised.

     :param slave_to_kill: object exposing ``pid``, ``hostname`` and ``ip``
     :param current_capacity: capacity to restore if termination fails
     :param new_capacity: target capacity; floored to an int before use
     :raises HTTPError: re-raised when starting the drain fails
     :raises FailSetResourceCapacity: re-raised when the capacity cannot
         be lowered
     """
     nanos_per_second = 1000000000
     timeout = self.pool_settings.get('drain_timeout',
                                      DEFAULT_DRAIN_TIMEOUT)
     # The maintenance window opens at the moment we stop waiting for the
     # instance to drain and mark it for termination regardless.
     window_start = int(time.time() + timeout) * nanos_per_second
     # 600 seconds.  The length is fairly arbitrary: mesos doesn't actually
     # do anything when the maintenance window ends.
     window_length = 600 * nanos_per_second

     def undrain_slave():
         # Always log, but only undrain hosts that were actually drained
         # (host_spec is only bound in that case).
         self.log.info("Undraining {}".format(slave_to_kill.pid))
         if draining_enabled:
             undrain([host_spec])

     self.log.info("Draining {}".format(slave_to_kill.pid))
     draining_enabled = self.should_drain(slave_to_kill)
     if draining_enabled:
         try:
             host_spec = "{}|{}".format(slave_to_kill.hostname,
                                        slave_to_kill.ip)
             drain([host_spec], window_start, window_length)
         except HTTPError as err:
             failure_message = (
                 "Failed to start drain "
                 "on {}: {}\n Trying next host"
             ).format(slave_to_kill.hostname, err)
             self.log.error(failure_message)
             raise

     self.log.info("Decreasing resource from {} to: {}".format(
         current_capacity, new_capacity))
     # Instance weights may be floats, but AWS only accepts an integer for
     # the target capacity in its API call.
     target_capacity = int(floor(new_capacity))
     try:
         self.set_capacity(target_capacity)
     except FailSetResourceCapacity:
         self.log.error(
             "Couldn't update resource capacity, stopping autoscaler")
         undrain_slave()
         raise

     self.log.info("Waiting for instance to drain before we terminate")
     try:
         self.wait_and_terminate(
             slave=slave_to_kill,
             drain_timeout=timeout,
             dry_run=self.dry_run,
             region=self.resource['region'],
             should_drain=draining_enabled,
         )
     except ClientError as err:
         # Termination failed: log it, restore the old capacity and put
         # the host back into service.
         self.log.error("Failure when terminating: {}: {}".format(
             slave_to_kill.pid, err))
         self.log.error("Setting resource capacity back to {}".format(
             current_capacity))
         self.set_capacity(current_capacity)
         undrain_slave()
Ejemplo n.º 7
0
def mark_host_at_risk(context, host):
    """Drain ``host`` for a one-hour window starting now and record it.

    ``get_principal`` and ``get_secret`` in ``paasta_tools.mesos_maintenance``
    are patched for the duration of the drain call so that they return the
    credentials loaded from ``/etc/mesos-slave-secret``.

    :param context: test context object; ``at_risk_host`` is set on it
    :param host: host to hand to ``mesos_maintenance.drain``
    """
    start = mesos_maintenance.datetime_to_nanoseconds(mesos_maintenance.now())
    duration = mesos_maintenance.parse_timedelta("1h")
    # A multi-manager ``with`` replaces contextlib.nested, which was
    # deprecated in Python 2.7 and does not exist in Python 3.
    with mock.patch(
        "paasta_tools.mesos_maintenance.get_principal", autospec=True,
    ) as mock_get_principal, mock.patch(
        "paasta_tools.mesos_maintenance.get_secret", autospec=True,
    ) as mock_get_secret:
        credentials = mesos_maintenance.load_credentials(mesos_secrets="/etc/mesos-slave-secret")
        mock_get_principal.return_value = credentials.principal
        mock_get_secret.return_value = credentials.secret
        mesos_maintenance.drain([host], start, duration)
        context.at_risk_host = host
Ejemplo n.º 8
0
def mark_host_at_risk(context, host):
    """Drain ``host`` for a one-hour window starting now and record it.

    ``get_principal`` and ``get_secret`` in ``paasta_tools.mesos_maintenance``
    are patched for the duration of the drain call so that they return the
    credentials loaded from ``/etc/mesos-slave-secret``.

    :param context: test context object; ``at_risk_host`` is set on it
    :param host: host to hand to ``mesos_maintenance.drain``
    """
    window_start = mesos_maintenance.datetime_to_nanoseconds(mesos_maintenance.now())
    window_length = mesos_maintenance.parse_timedelta('1h')
    with mock.patch('paasta_tools.mesos_maintenance.get_principal', autospec=True) as mock_get_principal:
        with mock.patch('paasta_tools.mesos_maintenance.get_secret', autospec=True) as mock_get_secret:
            # Load the credentials while both patches are active, then feed
            # them back through the patched getters.
            slave_credentials = mesos_maintenance.load_credentials(
                mesos_secrets='/etc/mesos-slave-secret',
            )
            mock_get_principal.return_value = slave_credentials.principal
            mock_get_secret.return_value = slave_credentials.secret
            mesos_maintenance.drain([host], window_start, window_length)
            context.at_risk_host = host
Ejemplo n.º 9
0
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings, current_capacity, new_capacity, dry_run):
    """Drain a slave, shrink its spot fleet request, then terminate it.

    Steps, in order: open a maintenance (drain) window for the host, set the
    spot fleet request's target capacity to ``new_capacity``, then wait for
    the drain and terminate the instance.  The host is undrained afterwards,
    and also when lowering the capacity fails.  When ``dry_run`` is true the
    drain and undrain calls are skipped; the flag is also forwarded to the
    capacity and termination helpers.

    :param resource: mapping providing the spot fleet request 'id' and 'region'
    :param slave_to_kill: mapping providing the slave's 'pid', 'hostname', 'ip'
    :param pool_settings: mapping that may carry a 'drain_timeout' override
    :param current_capacity: capacity to restore if termination fails
    :param new_capacity: target capacity; floored to an int before use
    :param dry_run: skip drain/undrain and forward the flag to the helpers
    :raises HTTPError: re-raised when starting the drain fails
    :raises FailSetSpotCapacity: re-raised when the capacity cannot be lowered
    """
    nanos_per_second = 1000000000
    fleet_request_id = resource['id']
    timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The maintenance window opens at the moment we stop waiting for the
    # instance to drain and mark it for termination regardless.
    window_start = int(time.time() + timeout) * nanos_per_second
    # 600 seconds.  The length is fairly arbitrary: mesos doesn't actually do
    # anything when the maintenance window ends.
    window_length = 600 * nanos_per_second

    def undrain_slave():
        # Always log, but only touch the maintenance schedule on real runs
        # (host_spec is only bound when the drain was actually attempted).
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([host_spec])

    log.info("Draining {0}".format(slave_to_kill['pid']))
    if not dry_run:
        try:
            host_spec = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
            drain([host_spec], window_start, window_length)
        except HTTPError as err:
            failure_template = "Failed to start drain " "on {0}: {1}\n Trying next host"
            log.error(failure_template.format(slave_to_kill['hostname'], err))
            raise

    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
    # Instance weights may be floats, but AWS only accepts an integer for the
    # target capacity in its API call.
    target_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(fleet_request_id, target_capacity, dry_run, region=resource['region'])
    except FailSetSpotCapacity:
        log.error("Couldn't update spot fleet, stopping autoscaler")
        undrain_slave()
        raise

    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(slave_to_kill, timeout, dry_run, region=resource['region'])
    except ClientError as err:
        # Termination failed: log it and put the fleet back to its old size.
        log.error("Failure when terminating: {0}: {1}".format(slave_to_kill['pid'], err))
        log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(fleet_request_id, current_capacity, dry_run, region=resource['region'])
    finally:
        undrain_slave()
Ejemplo n.º 10
0
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """Run drain() against mocked collaborators and check every call it makes."""
    schedule_payload = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    drain(
        hostnames=['some-host'],
        start='some-start',
        duration='some-duration',
    )

    # The payload builder is called once, in drain mode.
    payload_call = mock.call(
        ['some-host'],
        'some-start',
        'some-duration',
        drain=True,
    )
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == payload_call

    # Resources are reserved once, for the same host list.
    reserve_call = mock.call(['some-host'])
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == reserve_call

    # The schedule client is fetched once and invoked once with the
    # JSON-encoded payload.
    client_call = mock.call(
        method="POST",
        endpoint="",
        data=json.dumps(schedule_payload),
    )
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    assert mock_get_schedule_client.return_value.call_args == client_call
Ejemplo n.º 11
0
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_operator_api,
):
    """Run drain() against mocked collaborators and check every call it makes.

    Covers three cases: the normal path, a reservation that raises
    HTTPError, and a drain with ``reserve_resources=False``.
    """
    schedule_payload = {"fake_schedule": "fake_value"}
    mock_build_maintenance_schedule_payload.return_value = schedule_payload

    drain(
        hostnames=["some-host"],
        start="some-start",
        duration="some-duration",
    )

    # The payload builder is called once, in drain mode.
    payload_call = mock.call(
        ["some-host"],
        "some-start",
        "some-duration",
        drain=True,
    )
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == payload_call

    # Resources are reserved once, for the same host list.
    reserve_call = mock.call(["some-host"])
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == reserve_call

    # The operator API client is built once and invoked once with the
    # un-encoded payload.
    api_call = mock.call(data=schedule_payload)
    assert mock_operator_api.call_count == 1
    assert mock_operator_api.return_value.call_count == 1
    assert mock_operator_api.return_value.call_args == api_call

    # A failing reservation must not stop drain() from building the client
    # a second time.
    mock_reserve_all_resources.side_effect = HTTPError()
    drain(
        hostnames=["some-host"],
        start="some-start",
        duration="some-duration",
    )
    assert mock_operator_api.call_count == 2

    # With reserve_resources=False the reservation step is skipped, but the
    # schedule is still sent.
    mock_reserve_all_resources.reset_mock()
    mock_operator_api.reset_mock()
    drain(
        hostnames=["some-host"],
        start="some-start",
        duration="some-duration",
        reserve_resources=False,
    )
    assert mock_reserve_all_resources.call_count == 0
    assert mock_operator_api.return_value.call_count == 1
Ejemplo n.º 12
0
 async def gracefully_terminate_slave(self, slave_to_kill, capacity_diff,
                                      timer):
     """Drain a slave, adjust the resource capacity, then terminate it.

     This coroutine can be suspended at an ``await``, and another coroutine
     may change the capacity in the meantime.  The target is therefore
     recomputed from ``self.capacity`` and ``capacity_diff`` on every
     ``set_capacity`` call rather than being computed once up front.

     The host is undrained again if changing the capacity fails or if
     termination raises ClientError; in the latter case ``capacity_diff`` is
     first subtracted back out and the error is not re-raised.

     :param slave_to_kill: object exposing ``pid``, ``hostname`` and ``ip``
     :param capacity_diff: amount added to ``self.capacity`` for the new
         target (and subtracted again on termination failure)
     :param timer: forwarded unchanged to ``wait_and_terminate``
     :raises HTTPError: re-raised when starting the drain fails
     :raises FailSetResourceCapacity: re-raised when the capacity cannot
         be changed
     """
     nanos_per_second = 1000000000
     timeout = self.pool_settings.get('drain_timeout',
                                      DEFAULT_DRAIN_TIMEOUT)
     # The maintenance window opens at the moment we stop waiting for the
     # instance to drain and mark it for termination regardless.
     window_start = int(time.time() + timeout) * nanos_per_second
     # 600 seconds.  The length is fairly arbitrary: mesos doesn't actually
     # do anything when the maintenance window ends.
     window_length = 600 * nanos_per_second

     def undrain_slave():
         # Always log, but only undrain hosts that were actually drained
         # (host_spec is only bound in that case).
         self.log.info("Undraining {}".format(slave_to_kill.pid))
         if draining_enabled:
             undrain([host_spec])

     self.log.info("Draining {}".format(slave_to_kill.pid))
     draining_enabled = self.should_drain(slave_to_kill)
     if draining_enabled:
         try:
             host_spec = "{}|{}".format(slave_to_kill.hostname,
                                        slave_to_kill.ip)
             drain([host_spec], window_start, window_length)
         except HTTPError as err:
             failure_message = (
                 "Failed to start drain "
                 "on {}: {}\n Trying next host"
             ).format(slave_to_kill.hostname, err)
             self.log.error(failure_message)
             raise

     self.log.info("Decreasing resource from {} to: {}".format(
         self.capacity, self.capacity + capacity_diff))
     # NOTE(review): no integer rounding is applied to the target here;
     # presumably set_capacity copes with fractional values -- confirm.
     try:
         self.set_capacity(self.capacity + capacity_diff)
     except FailSetResourceCapacity:
         self.log.error(
             "Couldn't update resource capacity, stopping autoscaler")
         undrain_slave()
         raise

     self.log.info("Waiting for instance to drain before we terminate")
     try:
         await self.wait_and_terminate(
             slave=slave_to_kill,
             drain_timeout=timeout,
             dry_run=self.dry_run,
             timer=timer,
             region=self.resource['region'],
             should_drain=draining_enabled,
         )
     except ClientError as err:
         # Termination failed: log it, take the diff back out of the
         # capacity and put the host back into service.
         self.log.error("Failure when terminating: {}: {}".format(
             slave_to_kill.pid, err))
         self.log.error(
             "Setting resource capacity back to {}".format(self.capacity -
                                                           capacity_diff))
         self.set_capacity(self.capacity - capacity_diff)
         undrain_slave()