Example #1
0
def test_agent_failure(dcos_launchpad, cluster, vip_apps):
    """Terminate all running agents and check the VIP apps answer again.

    The running instances of both agent auto-scaling groups are terminated,
    the previous agents are reported to Mesos as down, and the cluster object
    is pointed at the replacement instances before the apps are re-checked.
    """
    public_group = 'PublicSlaveServerGroup'
    private_group = 'SlaveServerGroup'

    @retry_boto_rate_limits
    def get_running_agents(group_name):
        # Keep only the group's instances that report the 'running' state.
        running = []
        for instance in dcos_launchpad.get_auto_scaling_instances(group_name):
            if instance.state['Name'] == 'running':
                running.append(instance)
        return running

    def wait_for_vip_apps(first_app_timeout):
        # The second app is only given 10 once the first one has answered.
        test_util.helpers.wait_for_pong(vip_apps[0][1], first_app_timeout)
        test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    # The apps must be reachable before anything is torn down.
    wait_for_vip_apps(120)

    doomed_instances = get_running_agents(public_group) + get_running_agents(private_group)
    instance_ids = [instance.instance_id for instance in doomed_instances]

    # The agents belong to auto-scaling groups, so replacements are started
    # automatically once these instances are gone.
    dcos_launchpad.boto_wrapper.client('ec2').terminate_instances(InstanceIds=instance_ids)
    terminated_waiter = dcos_launchpad.boto_wrapper.client('ec2').get_waiter('instance_terminated')
    wait_until_terminated = retry_boto_rate_limits(terminated_waiter.wait)
    wait_until_terminated(InstanceIds=instance_ids)

    # Report the old machines to mesos as "down" and not coming back, so that
    # their work gets rescheduled.
    machine_ids = []
    for address in cluster.all_slaves:
        machine_ids.append({'hostname': address, 'ip': address})
    maintenance_window = {
        'machine_ids': machine_ids,
        'unavailability': {'start': {'nanoseconds': 0}},
    }
    schedule_response = cluster.post(
        '/mesos/maintenance/schedule',
        json={'windows': [maintenance_window]})
    schedule_response.raise_for_status()
    down_response = cluster.post('/mesos/machine/down', json=machine_ids)
    down_response.raise_for_status()

    # Wait for the replacement instances, private group first.
    test_util.helpers.wait_for_len(
        partial(get_running_agents, private_group), len(cluster.slaves), 600)
    test_util.helpers.wait_for_len(
        partial(get_running_agents, public_group), len(cluster.public_slaves), 600)

    # Point the cluster object at the replacement agents.
    private_ips = [instance.private_ip_address
                   for instance in get_running_agents(private_group)]
    cluster.slaves = sorted(private_ips)
    public_ips = [instance.private_ip_address
                  for instance in get_running_agents(public_group)]
    cluster.public_slaves = sorted(public_ips)
    cluster.all_slaves = sorted(cluster.slaves + cluster.public_slaves)

    # Check that everything else is still working ...
    cluster.wait_for_dcos()
    # ... and finally that the apps are running somewhere with their VIPs
    # again; marathon gets five minutes to deploy both of them.
    wait_for_vip_apps(300)
Example #2
0
def test_agent_failure(dcos_launchpad, cluster, vip_apps):
    """Check that the VIP apps come back after every running agent is terminated.

    Steps: confirm the apps respond, terminate the running instances of both
    agent groups, mark the old agents as down in Mesos, wait for replacement
    instances, refresh the agent lists on ``cluster``, then confirm the apps
    respond again.
    """
    @retry_boto_rate_limits
    def get_running_agents(group_name):
        def is_running(instance):
            return instance.state['Name'] == 'running'

        group_instances = dcos_launchpad.get_auto_scaling_instances(group_name)
        return list(filter(is_running, group_instances))

    def running_private_ips(group_name):
        # Sorted private addresses of the group's currently running instances.
        return sorted(
            agent.private_ip_address
            for agent in get_running_agents(group_name))

    # make sure the app works before starting
    test_util.helpers.wait_for_pong(vip_apps[0][1], 120)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    running_before = (get_running_agents('PublicSlaveServerGroup') +
                      get_running_agents('SlaveServerGroup'))
    agent_ids = [agent.instance_id for agent in running_before]

    # Agents are in autoscaling groups, so they will automatically be replaced
    dcos_launchpad.boto_wrapper.client('ec2').terminate_instances(
        InstanceIds=agent_ids)
    termination_waiter = dcos_launchpad.boto_wrapper.client('ec2').get_waiter(
        'instance_terminated')
    retry_boto_rate_limits(termination_waiter.wait)(InstanceIds=agent_ids)

    # Tell mesos the machines are "down" and not coming up so things get
    # rescheduled.
    down_machines = [
        {'hostname': host, 'ip': host} for host in cluster.all_slaves
    ]
    unavailable_from_now = {'start': {'nanoseconds': 0}}
    schedule_payload = {
        'windows': [{
            'machine_ids': down_machines,
            'unavailability': unavailable_from_now,
        }]
    }
    cluster.post(
        '/mesos/maintenance/schedule',
        json=schedule_payload).raise_for_status()
    cluster.post(
        '/mesos/machine/down',
        json=down_machines).raise_for_status()

    # Wait for replacements
    fetch_private_agents = partial(get_running_agents, 'SlaveServerGroup')
    test_util.helpers.wait_for_len(
        fetch_private_agents, len(cluster.slaves), 600)
    fetch_public_agents = partial(get_running_agents, 'PublicSlaveServerGroup')
    test_util.helpers.wait_for_len(
        fetch_public_agents, len(cluster.public_slaves), 600)

    # Reset the cluster to have the replacement agents
    cluster.slaves = running_private_ips('SlaveServerGroup')
    cluster.public_slaves = running_private_ips('PublicSlaveServerGroup')
    cluster.all_slaves = sorted(cluster.slaves + cluster.public_slaves)

    # verify that everything else is still working
    cluster.wait_for_dcos()

    # finally verify that the app is again running somewhere with its VIPs;
    # marathon gets five minutes to deploy both the apps
    test_util.helpers.wait_for_pong(vip_apps[0][1], 300)
    test_util.helpers.wait_for_pong(vip_apps[1][1], 10)
def test_agent_failure(dcos_stack, boto_wrapper, dcos_api_session, vip_apps):
    """Terminate all running agents and check the VIP apps recover.

    The running public and private agent instances are terminated, the old
    agents are reported to Mesos as down, and the test polls until both agent
    sets have as many running instances as the session previously listed.
    The session's agent lists are then refreshed and the apps re-checked.
    """
    # Reading attributes of AWS Resource objects triggers a client describe
    # call, so every helper touching them is wrapped to survive the rate
    # limits that CI inevitably runs into.
    @retry_boto_rate_limits
    def get_running_instances(instance_iter):
        running = []
        for instance in instance_iter:
            if instance.state['Name'] == 'running':
                running.append(instance)
        return running

    @retry_boto_rate_limits
    def get_instance_ids(instance_iter):
        return [instance.instance_id for instance in instance_iter]

    @retry_boto_rate_limits
    def get_private_ips(instance_iter):
        addresses = [
            instance.private_ip_address
            for instance in get_running_instances(instance_iter)
        ]
        addresses.sort()
        return addresses

    def wait_for_vip_apps(first_app_timeout):
        test_util.helpers.wait_for_pong(vip_apps[0][1], first_app_timeout)
        test_util.helpers.wait_for_pong(vip_apps[1][1], 10)

    # The apps have to be reachable before the agents are taken away.
    wait_for_vip_apps(120)

    running_public = get_running_instances(dcos_stack.public_agent_instances)
    running_private = get_running_instances(dcos_stack.private_agent_instances)
    agent_ids = get_instance_ids(running_public + running_private)

    # Agents live in auto-scaling groups, so they are replaced automatically.
    boto_wrapper.client('ec2').terminate_instances(InstanceIds=agent_ids)
    termination_waiter = boto_wrapper.client('ec2').get_waiter('instance_terminated')
    wait_until_terminated = retry_boto_rate_limits(termination_waiter.wait)
    wait_until_terminated(InstanceIds=agent_ids)

    # Report the machines to mesos as "down" and not coming back so that
    # things get rescheduled.
    down_hosts = []
    for address in dcos_api_session.all_slaves:
        down_hosts.append({'hostname': address, 'ip': address})
    maintenance_window = {
        'machine_ids': down_hosts,
        'unavailability': {'start': {'nanoseconds': 0}},
    }
    schedule_response = dcos_api_session.post(
        '/mesos/maintenance/schedule',
        json={'windows': [maintenance_window]})
    schedule_response.raise_for_status()
    down_response = dcos_api_session.post('/mesos/machine/down', json=down_hosts)
    down_response.raise_for_status()

    public_agent_count = len(dcos_api_session.public_slaves)
    private_agent_count = len(dcos_api_session.slaves)

    # Retry settings, in milliseconds as passed to retrying.retry below.
    poll_interval_ms = 60 * 1000
    give_up_after_ms = 900 * 1000

    @retrying.retry(
        wait_fixed=poll_interval_ms,
        retry_on_result=lambda outcome: outcome is False,
        stop_max_delay=give_up_after_ms)
    def wait_for_agents_to_refresh():
        # Returning False requests another attempt; falling off the end
        # (None) means both agent lists were refreshed.
        public_agents = get_running_instances(dcos_stack.public_agent_instances)
        if len(public_agents) != public_agent_count:
            log.info('Waiting for {} public agents. Current: {}'.format(
                public_agent_count, len(public_agents)))
            return False
        dcos_api_session.public_slaves = get_private_ips(public_agents)

        private_agents = get_running_instances(dcos_stack.private_agent_instances)
        if len(private_agents) != private_agent_count:
            log.info('Waiting for {} private agents. Current: {}'.format(
                private_agent_count, len(private_agents)))
            return False
        dcos_api_session.slaves = get_private_ips(private_agents)

        combined = dcos_api_session.slaves + dcos_api_session.public_slaves
        dcos_api_session.all_slaves = sorted(combined)

    wait_for_agents_to_refresh()

    # Check that everything else is still working ...
    dcos_api_session.wait_for_dcos()
    # ... and finally that the apps run somewhere with their VIPs again;
    # marathon gets five minutes to deploy both of them.
    wait_for_vip_apps(300)