Example #1
def _test_log_rotation_external_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join((str(expect-v)
                                         for v in range(6,-2,-2))))
    last_value_1 = '[{}]'.format(','.join((str(expect-1-v)
                                         for v in range(6,-2,-2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)
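    # With expect = 2000 and range(6, -2, -2) == [6, 4, 2, 0], the awaited
    # final windows are '[1994,1996,1998,2000]' and '[1993,1995,1997,1999]',
    # each prefixed with a 4-byte big-endian length header, presumably to
    # match the framed records the sink collects.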

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources],
                          ports[sources:sources+3],
                          zip(ports[-(2*(workers-1)):][::2],
                              ports[-(2*(workers-1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in
                           input_ports])
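        # For sources=1 and workers=2 this requests 6 free ports: ports[0] is
        # the TCP source, ports[1:4] unpack to control_port, data_port and
        # external_port (the external port receives the rotate-log trigger
        # below), and the last 2*(workers-1) ports are zipped into one pair
        # per additional worker.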

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'
                                .format(host, external_port))

        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert(res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Await the expected final values at the sink before stopping
        # runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)


        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file = out_file,
                                expect = expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert(res.success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()
            print '---'
            print runners[1].name
            print runners[1].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate that the targeted worker (worker1) underwent log rotation
        r = runners[1]
        stdout = r.get_output()
        try:
            assert(re.search(log_rotated_pattern, stdout, re.M | re.S)
                   is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n'
                                 % (1, r.name, log_rotated_pattern, stdout))
        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert(re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed recovery as expected. Worker '
                                 'output is '
                                 'included below.\nSTDOUT\n---\n%s'
                                 % (len(runners)-1, runners[-1].name,
                                    stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
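

# Minimal usage sketch (hypothetical): the helper above is typically wrapped
# by a thin test entry point that supplies the command line of the worker
# binary under test. The command string below is a placeholder, not taken
# from this file.
def test_log_rotation_external_trigger_recovery():
    command = './path/to/app_binary'  # placeholder worker command
    _test_log_rotation_external_trigger_recovery(command)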
Example #2
def _autoscale_sequence(command, ops=[], cycles=1, initial=None):
    host = '127.0.0.1'
    sources = 1

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        lowest = lowest_point(ops*cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert(initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert(initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05
    sender_timeout = 30 # Counted from when Sender is stopped
    runner_join_timeout = 30

    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    setup_resilience_path(res_dir)

    steps = []

    runners = []
    try:
        try:
            # Create sink, metrics, reader, sender
            sink = Sink(host)
            metrics = Metrics(host)
            lowercase2 = [a+b for a in lowercase for b in lowercase]
            char_cycle = cycle(lowercase2)
            expected = Counter()
            def count_sent(s):
                expected[s] += 1

            reader = Reader(iter_generator(
                items=char_cycle, to_string=lambda s: pack('>2sI', s, 1),
                on_next=count_sent))
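            # Each item is a two-letter key packed with a fixed count of 1,
            # e.g. pack('>2sI', 'aa', 1) == 'aa\x00\x00\x00\x01' (6 bytes);
            # `expected` tracks how many times each key was sent.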

            # Start sink and metrics, and get their connection info
            sink.start()
            sink_host, sink_port = sink.get_connection_info()
            outputs = '{}:{}'.format(sink_host, sink_port)

            metrics.start()
            metrics_host, metrics_port = metrics.get_connection_info()
            time.sleep(0.05)

            num_ports = sources + 3 + 3 * workers
            ports = get_port_values(num=num_ports, host=host)
            (input_ports, worker_ports) = (
                ports[:sources],
                [ports[sources:][i:i+3] for i in xrange(0,
                    len(ports[sources:]), 3)])
            inputs = ','.join(['{}:{}'.format(host, p) for p in
                               input_ports])
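            # ports[:sources] feed the TCP source(s); the remaining ports are
            # grouped into triples, and worker_ports[0][2], the third port of
            # the first triple, is what the observability queries below
            # connect to (presumably the initializer's external port).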

            # Start the initial runners
            start_runners(runners, command, host, inputs, outputs,
                          metrics_port, res_dir, workers, worker_ports)

            # Verify cluster is processing messages
            obs = ObservabilityNotifier(cluster_status_query,
                (host, worker_ports[0][2]),
                tests=test_cluster_is_processing)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify that `workers` workers are active
            # Create a partial function
            partial_test_worker_count = partial(test_worker_count, workers)
            obs = ObservabilityNotifier(cluster_status_query,
                (host, worker_ports[0][2]),
                tests=partial_test_worker_count)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify initializer starts with partitions
            obs = ObservabilityNotifier(state_entity_query,
                (host, worker_ports[0][2]),
                 test_worker_has_state_entities)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # start sender
            sender = Sender(host, input_ports[0], reader, batch_size=batch_size,
                            interval=interval)
            sender.start()
            # Give the cluster 1 second to build up some state
            time.sleep(1)

            # Perform autoscale cycles
            for cyc in range(cycles):
                for joiners in ops:
                    # Verify cluster is processing before proceeding
                    obs = ObservabilityNotifier(cluster_status_query,
                        (host, worker_ports[0][2]),
                        tests=test_cluster_is_processing, timeout=30)
                    obs.start()
                    obs.join()
                    if obs.error:
                        raise obs.error

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # get partition data before autoscale operation begins
                    addresses = [(r.name, r.external) for r in runners
                                 if r.is_alive()]
                    responses = multi_states_query(addresses)
                    pre_partitions = joined_partition_query_data(responses)
                    steps.append(joiners)
                    joined = []
                    left = []

                    if joiners > 0:  # autoscale: grow
                        # create new workers and have them join
                        new_ports = get_port_values(num=(joiners * 3), host=host,
                                                    base_port=25000)
                        joiner_ports = [new_ports[i:i+3] for i in
                                        xrange(0, len(new_ports), 3)]
                        for i in range(joiners):
                            add_runner(runners, command, host, inputs, outputs,
                                       metrics_port,
                                       worker_ports[0][0], res_dir,
                                       joiners, *joiner_ports[i])
                            joined.append(runners[-1])

                    elif joiners < 0:  # autoscale: shrink
                        # choose the most recent, still-alive runners to leave
                        leavers = abs(joiners)
                        idx = 1
                        while len(left) < leavers and idx < len(runners):
                            if runners[-idx].is_alive():
                                left.append(runners[-idx])
                            idx += 1
                        if len(left) < leavers:
                            raise AutoscaleTestError("Not enough workers left to "
                                                     "shrink! {} requested but "
                                                     "only {} live non-initializer"
                                                     "workers found!"
                                                    .format(joiners, len(left)))

                        # Send the shrink command
                        resp = send_shrink_cmd(*runners[0].external,
                                               names=[r.name for r in left])
                        print("Sent a shrink command for {}".format(
                            [r.name for r in left]))
                        print("Response was: {}".format(resp))

                    else:  # Handle the 0 case as a noop
                        continue

                    # Wait until all live workers report 'ready'
                    wait_for_cluster_to_resume_processing(runners)

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # Test: at least some states moved, and no states from
                    #       pre are missing from the post

                    # record which workers joined or left during this operation
                    workers={'joining': [r.name for r in joined],
                             'leaving': [r.name for r in left]}
                    # use a pre_process function to recreate this data for
                    # retriable tests
                    def pre_process():
                        addresses = [(r.name, r.external) for r in runners
                                     if r.is_alive()]
                        responses = multi_states_query(addresses)
                        post_partitions = joined_partition_query_data(responses)
                        return (pre_partitions, post_partitions, workers)
                    # retry the test until it passes or a timeout elapses
                    try_until_timeout(test_migration, pre_process, timeout=120)

                    # Wait a second before the next operation, allowing some
                    # more data to go through the system
                    time.sleep(1)

            time.sleep(2)

            # Test for crashed workers
            test_crashed_workers(runners)

            # Test is done, so stop sender
            sender.stop()

            # wait until sender sends out its final batch and exits
            sender.join(sender_timeout)
            if sender.error:
                raise sender.error
            if sender.is_alive():
                sender.stop()
                raise TimeoutError('Sender did not complete in the expected '
                                   'period')

            print('Sender sent {} messages'.format(sum(expected.values())))

            # Use Sink value to determine when to stop runners and sink
            pack677 = '>I2sQ'
            await_values = [pack(pack677, calcsize(pack677)-4, c, v) for c, v in
                            expected.items()]
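            # calcsize('>I2sQ') == 14, so each awaited sink record is a 4-byte
            # length header (value 10) followed by the 2-byte key and an
            # 8-byte big-endian count.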
            stopper = SinkAwaitValue(sink, await_values, 30)
            stopper.start()
            stopper.join()
            if stopper.error:
                print('sink.data', len(sink.data))
                print('await_values', len(await_values))
                raise stopper.error

            # stop application workers
            for r in runners:
                r.stop()

            # Test for crashed workers
            test_crashed_workers(runners)

            # Stop sink
            sink.stop()

            # Stop metrics
            metrics.stop()

            # validate output
            phase_validate_output(runners, sink, expected)
        #except:
        #    # wait for user interaction to continue
        #    if os.environ.get('pause_for_user'):
        #        pause_for_user()
        #    raise
        finally:
            for r in runners:
                r.stop()
            # Wait on runners to finish waiting on their subprocesses to exit
            for r in runners:
                r.join(runner_join_timeout)
            alive = []
            for r in runners:
                if r.is_alive():
                    alive.append(r)
            for r in runners:
                ec = r.poll()
                if ec != 0:
                    print('Worker {!r} exited with return code {}'
                          .format(r.name, ec))
                    print('Its last 5 log lines were:')
                    print('\n'.join(r.get_output().splitlines()[-5:]))
                    print()
            if alive:
                alive_names = ', '.join((r.name for r in alive))
                outputs = runners_output_format(runners)
                for a in alive:
                    a.kill()
            clean_resilience_path(res_dir)
            if alive:
                raise PipelineTestError("Runners [{}] failed to exit cleanly after"
                                        " {} seconds.\n"
                                        "Runner outputs are attached below:"
                                        "\n===\n{}"
                                        .format(alive_names, runner_join_timeout,
                                                outputs))
    except Exception as err:
        if not hasattr(err, 'as_steps'):
            err.as_steps = steps
        if not hasattr(err, 'runners'):
            err.runners = runners
        raise
Example #3
def _test_restart(command):

    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    runner_data = []
    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=1,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.5)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        ## restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~10 seconds)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

    logging.debug("validating restarted worker stdout")
    # Validate worker actually underwent recovery
    pattern_restarting = "Restarting a listener ..."
    stdout = runner_data[2].stdout
    try:
        assert (re.search(pattern_restarting, stdout)
                is not None)
    except AssertionError:
        raise AssertionError('Worker does not appear to have reconnected '
                             'as expected. Worker output is '
                             'included below.\nSTDOUT\n---\n%s' % stdout)
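
# Note on the check above (an interpretation, not stated in the code): with
# two initial workers plus the restarted one, runner_data[2] should hold the
# restarted worker's captured output. The dots in pattern_restarting are
# regex wildcards, so re.search also matches the literal "..." in the line.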
Example #4
def _autoscale_run(command,
                   ops=[],
                   cycles=1,
                   initial=None,
                   runner_data=[],
                   as_steps=[]):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        lowest = lowest_point(ops * cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert (initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert (initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05

    lowercase2 = [a + b for a in lowercase for b in lowercase]
    char_cycle = cycle(lowercase2)
    expected = Counter()

    def count_sent(s):
        expected[s] += 1

    reader = Reader(
        iter_generator(items=char_cycle,
                       to_string=lambda s: pack('>2sI', s, 1),
                       on_next=count_sent))

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        reader,
                        batch_size=50,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)
        # wait for some data to go through the system
        time.sleep(1)

        # Perform autoscale cycles
        logging.debug("Starting autoscale cycles")
        for cyc in range(cycles):
            for joiners in ops:
                # Verify cluster is processing before proceeding
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert (not cluster.get_crashed_workers())

                # get partition data before autoscale operation begins
                logging.debug("Get partition data before autoscale event")
                pre_partitions = cluster.get_partition_data()
                as_steps.append(joiners)
                joined = []
                left = []

                if joiners > 0:  # autoscale: grow
                    # create new workers and have them join
                    logging.debug("grow by {}".format(joiners))
                    joined = cluster.grow(by=joiners)

                elif joiners < 0:  # autoscale: shrink
                    # choose the most recent, still-alive runners to leave
                    leavers = abs(joiners)
                    left = cluster.shrink(leavers)

                else:  # Handle the 0 case as a noop
                    continue

                # Wait until all live workers report 'ready'
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert (not cluster.get_crashed_workers())

                # Wait a second before the next operation, allowing some
                # more data to go through the system
                time.sleep(1)
                logging.debug("end of autoscale iteration")
            logging.debug("End of autoscale cycle")
        logging.debug("End of autoscale events. Entering final validation")
        time.sleep(2)

        # Test for crashed workers
        logging.debug("check for crashed")
        assert (not cluster.get_crashed_workers())

        # Test is done, so stop sender
        cluster.stop_senders()

        # wait until sender sends out its final batch and exits
        cluster.wait_for_sender()

        logging.info('Sender sent {} messages'.format(sum(expected.values())))

        # Use Sink value to determine when to stop runners and sink
        pack677 = '>I2sQ'
        await_values = [
            pack(pack677,
                 calcsize(pack677) - 4, c, v) for c, v in expected.items()
        ]
        cluster.sink_await(await_values, timeout=120)

        # validate output
        phase_validate_output(cluster.sinks[0].data, expected)
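

# Minimal usage sketch (hypothetical): grow the cluster by one worker, then
# shrink it by one, for two cycles. The command string is a placeholder for
# the application binary under test; runner_data and as_steps may be passed
# in so a surrounding test can inspect them if the run fails.
def test_autoscale_grow_then_shrink():
    _autoscale_run('./path/to/app_binary', ops=[1, -1], cycles=2)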
Example #5
def _test_restart(command):

    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3]
            for i in xrange(0, len(ports[sources:]), 3)
        ])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=1,
                        interval=0.05,
                        reconnect=True)
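        # With expect == 200, batch_size == 1 and interval == 0.05 the sender
        # needs roughly 200 * 0.05 ~= 10 seconds to drain its reader;
        # reconnect=True presumably lets it retry the connection if it drops
        # while the cluster is disrupted below.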
        sender.start()
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~10 seconds)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the last sent value expected at the worker
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            print 'sink data'
            print sink.data
            print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()

        # Validate worker actually underwent recovery
        pattern_restarting = "Restarting a listener ..."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern_restarting, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have reconnected '
                                 'as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Example #6
def _test_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources], ports[sources:sources + 3],
                          zip(ports[-(2 * (workers - 1)):][::2],
                              ports[-(2 * (workers - 1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers,
                      worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # simulate worker crash by doing a non-graceful shutdown
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Await the expected final values at the sink before stopping
        # runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[-1].get_output()
            print '---'
            print runners[-2].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)

    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)