def test_drain(mock_build_maintenance_schedule_payload, mock_get_schedule_client):
    """drain() should build a drain-mode maintenance schedule and POST it once."""
    fake_schedule = {"fake_schedule": "fake_value"}
    mock_build_maintenance_schedule_payload.return_value = fake_schedule

    drain(hostnames=["some-host"], start="some-start", duration="some-duration")

    # The payload builder runs exactly once, with drain=True.
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ["some-host"], "some-start", "some-duration", drain=True,
    )

    # The schedule client is fetched once and POSTed the serialized schedule.
    assert mock_get_schedule_client.call_count == 1
    fake_client = mock_get_schedule_client.return_value
    assert fake_client.call_count == 1
    assert fake_client.call_args == mock.call(
        method="POST", endpoint="", data=json.dumps(fake_schedule),
    )
def scale_aws_spot_fleet_request(resource, delta, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity

    If scaling up we just set target capacity and let AWS take care of the rest.
    If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs). We then kill them once they are running 0 tasks or
    once a timeout is reached.

    :param resource: resource to scale
    :param delta: integer change in number of servers
    :param target_capacity: target number of instances
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    sfr_id = resource['id']
    ec2_client = boto3.client('ec2')
    if delta == 0:
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            # noTermination: AWS must not kill instances on its own when we resize.
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
        return
    elif delta < 0:
        number_to_kill = delta * -1
        sfr_ips = get_sfr_instance_ips(sfr_id)
        log.debug("IPs in SFR: {0}".format(sfr_ips))
        # Only consider slaves that actually belong to this spot fleet request.
        sfr_sorted_slaves = [slave for slave in sorted_slaves if slave_pid_to_ip(slave['pid']) in sfr_ips]
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        if number_to_kill > len(sfr_sorted_slaves):
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        slaves_to_kill = sfr_sorted_slaves[0:number_to_kill]
        log.info("Set to kill: {0}".format([slave['pid'] for slave in slaves_to_kill]))
        instances_to_kill = {}
        for slave in slaves_to_kill:
            ip = slave_pid_to_ip(slave['pid'])
            instances_to_kill[slave['pid']] = {'ip': ip, 'instance_id': get_instance_id_from_ip(ip)}
        # The start time of the maintenance window is the point at which
        # we give up waiting for the instance to drain and mark it for termination anyway
        start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
        # Set the duration to ten minutes; if we haven't cleaned up and terminated by then
        # mesos should put the slave back into the pool
        duration = 600
        log.info("Draining {0}".format(instances_to_kill))
        log.info("Decreasing spot fleet capacity to: {0}".format(target_capacity))
        if not dry_run:
            # Sort by IP for a deterministic order (makes testing easier; sorting
            # the raw dicts is unorderable on Python 3 and arbitrary on Python 2).
            drain(
                [instance['ip'] for instance in sorted(instances_to_kill.values(), key=lambda inst: inst['ip'])],
                start,
                duration,
            )
            ec2_client.modify_spot_fleet_request(SpotFleetRequestId=sfr_id, TargetCapacity=target_capacity,
                                                 ExcessCapacityTerminationPolicy='noTermination')
        log.info("Waiting for instances to drain before we terminate")
        wait_and_terminate(instances_to_kill, dry_run)
def test_drain(
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """Calling drain() builds one drain schedule payload and POSTs it once."""
    fake_schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = fake_schedule

    drain(hostnames=['some-host'], start='some-start', duration='some-duration')

    # Payload built once, for the requested hosts/window, in drain mode.
    expected_payload_call = mock.call(['some-host'], 'some-start', 'some-duration', drain=True)
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == expected_payload_call

    # Client fetched once; the serialized schedule is POSTed to the bare endpoint.
    expected_post_call = mock.call(method="POST", endpoint="", data=json.dumps(fake_schedule))
    assert mock_get_schedule_client.call_count == 1
    schedule_client = mock_get_schedule_client.return_value
    assert schedule_client.call_count == 1
    assert schedule_client.call_args == expected_post_call
def mark_host_at_risk(context, host):
    """Drain *host* via paasta_maintenance and remember it on the behave context."""
    drain_start = paasta_maintenance.datetime_to_nanoseconds(paasta_maintenance.now())
    drain_duration = paasta_maintenance.parse_timedelta('1h')
    mesos_config = {
        'master': '%s' % get_service_connection_string('mesosmaster'),
        'scheme': 'http',
        'response_timeout': 5,
    }
    patches = contextlib.nested(
        mock.patch('paasta_tools.paasta_maintenance.load_credentials', autospec=True),
        mock.patch.object(mesos.cli.master, 'CFG', mesos_config),
    )
    with patches as (patched_load_credentials, _):
        # NOTE(review): load_credentials is already patched at this point, so this
        # call goes through the mock — confirm that is the intended behavior.
        patched_load_credentials.side_effect = paasta_maintenance.load_credentials(
            mesos_secrets='/etc/mesos-slave-secret',
        )
        paasta_maintenance.drain([host], drain_start, drain_duration)
        context.at_risk_host = host
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, sorted_slaves, dry_run):
    """Scales a spot fleet request by delta to reach target capacity

    If scaling up we just set target capacity and let AWS take care of the rest.
    If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs). We then kill them once they are running 0 tasks or
    once a timeout is reached.

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param sorted_slaves: list of slaves by order to kill
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run)
        return
    elif delta < 0:
        sfr_sorted_slaves = filter_sfr_slaves(sorted_slaves, resource['sfr'])
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        killable_capacity = sum([slave['instance_weight'] for slave in sfr_sorted_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        # Reverse so pop() takes slaves in kill-preference order.
        sfr_sorted_slaves.reverse()
        while True:
            if len(sfr_sorted_slaves) == 0:
                break
            slave_to_kill = sfr_sorted_slaves.pop()
            instance_capacity = int(slave_to_kill['instance_weight'])
            new_capacity = current_capacity - instance_capacity
            if current_capacity - instance_capacity < target_capacity:
                log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                         " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                     slave_to_kill['instance_weight'],
                                                                     target_capacity))
                break
            # The start time of the maintenance window is the point at which
            # we give up waiting for the instance to drain and mark it for termination anyway
            start = int(time.time() + CLUSTER_DRAIN_TIMEOUT)
            # Set the duration to ten minutes; if we haven't cleaned up and terminated
            # by then mesos should put the slave back into the pool
            duration = 600
            log.info("Draining {0}".format(slave_to_kill['pid']))
            if not dry_run:
                try:
                    drain([slave_to_kill['ip']], start, duration)
                except HTTPError as e:
                    log.error("Failed to trigger drain on {0}: {1}\n Trying next host".format(slave_to_kill['ip'], e))
                    continue
            log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
            if not set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run):
                log.error("Couldn't update spot fleet, stopping autoscaler")
                break
            log.info("Waiting for instance to drain before we terminate")
            try:
                wait_and_terminate(slave_to_kill, dry_run)
            except ClientError as e:
                # BUGFIX: was slave['pid'] — `slave` is not defined here
                # (comprehension variables don't leak on Python 3).
                log.error("Failure when terminating: {0}: {1}".format(slave_to_kill['pid'], e))
                log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
                if not set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run):
                    log.error("Couldn't update spot fleet, stopping autoscaler")
                    break
                continue
            current_capacity = new_capacity
def scale_aws_spot_fleet_request(resource, current_capacity, target_capacity, sorted_slaves, pool_settings, dry_run):
    """Scales a spot fleet request by delta to reach target capacity

    If scaling up we just set target capacity and let AWS take care of the rest.
    If scaling down we pick the slaves we'd prefer to kill, put them in
    maintenance mode and drain them (via paasta_maintenance and
    setup_marathon_jobs). We then kill them once they are running 0 tasks or
    once a timeout is reached.

    :param resource: resource to scale
    :param current_capacity: integer current SFR capacity
    :param target_capacity: target SFR capacity
    :param sorted_slaves: list of slaves by order to kill
    :param pool_settings: dict of pool settings (e.g. 'drain_timeout')
    :param dry_run: Don't drain or make changes to spot fleet if True"""
    target_capacity = int(target_capacity)
    current_capacity = int(current_capacity)
    delta = target_capacity - current_capacity
    sfr_id = resource['id']
    if delta == 0:
        log.info("Already at target capacity: {0}".format(target_capacity))
        return
    elif delta > 0:
        log.info("Increasing spot fleet capacity to: {0}".format(target_capacity))
        set_spot_fleet_request_capacity(sfr_id, target_capacity, dry_run, region=resource['region'])
        return
    elif delta < 0:
        sfr_sorted_slaves = filter_sfr_slaves(sorted_slaves, resource)
        log.info("SFR slave kill preference: {0}".format([slave['pid'] for slave in sfr_sorted_slaves]))
        killable_capacity = sum([slave['instance_weight'] for slave in sfr_sorted_slaves])
        amount_to_decrease = delta * -1
        if amount_to_decrease > killable_capacity:
            log.error("Didn't find enough candidates to kill. This shouldn't happen so let's not kill anything!")
            return
        # Reverse so pop() takes slaves in kill-preference order.
        sfr_sorted_slaves.reverse()
        while True:
            if len(sfr_sorted_slaves) == 0:
                break
            slave_to_kill = sfr_sorted_slaves.pop()
            # Instance weights can be floats but the target has to be an integer
            # because AWS...
            instance_capacity = slave_to_kill['instance_weight']
            new_capacity = int(round(current_capacity - instance_capacity))
            if new_capacity < target_capacity:
                log.info("Terminating instance {0} with weight {1} would take us below our target of {2}, so this is as"
                         " close to our target as we can get".format(slave_to_kill['instance_id'],
                                                                     slave_to_kill['instance_weight'],
                                                                     target_capacity))
                break
            drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
            # The start time of the maintenance window is the point at which
            # we give up waiting for the instance to drain and mark it for termination anyway
            start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
            # Set the duration to ten minutes; this is fairly arbitrary as mesos doesn't
            # actually do anything at the end of the maintenance window.
            duration = 600 * 1000000000  # nanoseconds
            log.info("Draining {0}".format(slave_to_kill['pid']))
            if not dry_run:
                try:
                    drain_host_string = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
                    drain([drain_host_string], start, duration)
                except HTTPError as e:
                    log.error("Failed to start drain "
                              "on {0}: {1}\n Trying next host".format(slave_to_kill['hostname'], e))
                    continue
            log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
            if not set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run, region=resource['region']):
                log.error("Couldn't update spot fleet, stopping autoscaler")
                log.info("Undraining {0}".format(slave_to_kill['pid']))
                if not dry_run:
                    undrain([drain_host_string])
                break
            log.info("Waiting for instance to drain before we terminate")
            try:
                wait_and_terminate(slave_to_kill, drain_timeout, dry_run, region=resource['region'])
            except ClientError as e:
                # BUGFIX: was slave['pid'] — `slave` is not defined here
                # (comprehension variables don't leak on Python 3).
                log.error("Failure when terminating: {0}: {1}".format(slave_to_kill['pid'], e))
                log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
                if not set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run, region=resource['region']):
                    log.error("Couldn't update spot fleet, stopping autoscaler")
                    break
                continue
            finally:
                # Always lift the maintenance window, whether termination
                # succeeded, failed, or we're bailing out of the loop.
                log.info("Undraining {0}".format(slave_to_kill['pid']))
                if not dry_run:
                    undrain([drain_host_string])
            current_capacity = new_capacity