Example #1
0
def _test_log_rotation_file_size_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3)
        ])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports, alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            print runners[-1].name
            print runners[-1].get_output()
            print '---'
            print runners[-2].name
            print runners[-2].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker underwent log rotation, but not initializer
        i, r = 1, runners[1]
        stdout = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing form the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (i, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Example #2
0
def _test_log_rotation_external_trigger_no_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3)
        ])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert (res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()
            print '---'
            print runners[1].name
            print runners[1].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        for r in runners[1:]:
            stdout = r.get_output()
            try:
                assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                        is not None)
            except AssertionError:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected.'
                                     ' The pattern %r '
                                     'is missing form the Worker output '
                                     'included below.\nSTDOUT\n---\n%s\n'
                                     '---\n' %
                                     (r.name, log_rotated_pattern, stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Example #3
0
def _test_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # stop worker
        runners[-1].stop()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[-1].get_output()[0]
            print '---'
            print runners[-2].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))

    finally:
        for r in runners:
            r.stop()
        clean_up_resilience_path(res_dir)
Example #4
0
def _test_log_rotation_external_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        success, stdout, retcode, cmd = ex_validate(cmd_external_trigger)
        try:
            assert (success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()[0]
            print '---'
            print runners[1].name
            print runners[1].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate all workers underwent log rotation
        r = runners[1]
        stdout, stderr = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing form the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (1, r.name, log_rotated_pattern, stdout))
        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError(
                'Worker %d.%r does not appear to have '
                'performed recovery as expected. Worker '
                'output is '
                'included below.\nSTDOUT\n---\n%s\n---\n'
                'STDERR\n---\n%s' %
                (len(runners) - 1, runners[-1].name, stdout, stderr))
    finally:
        for r in runners:
            r.stop()
Example #5
0
def _test_autoscale_grow(command):
    host = '127.0.0.1'
    sources = 1
    workers = 1
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    patterns_i = [
        re.escape(r'***Worker worker1 attempting to join the '
                  r'cluster. Sent necessary information.***'),
        re.escape(r'Migrating partitions to worker1'),
        re.escape(r'--All new workers have acked migration '
                  r'batch complete'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
    patterns_w = [
        re.escape(r'***Successfully joined cluster!***'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader1 = Reader(sequence_generator(expect - 1000))
        reader2 = Reader(sequence_generator(expect, 1000))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender1 (0,1000]
        sender1 = Sender(host,
                         input_ports[0],
                         reader1,
                         batch_size=10,
                         interval=0.05)
        sender1.start()

        # wait until sender1 completes (~5 seconds)
        sender1.join(30)
        if sender1.error:
            raise sender1.error
        if sender1.is_alive():
            sender1.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # create a new worker and have it join
        add_runner(runners, command, host, inputs, outputs, metrics_port,
                   control_port, external_port, data_port, res_dir, workers)

        # Wait for runner to complete a log rotation
        join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30)
        join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30)
        join_checker_i.start()
        join_checker_w.start()
        join_checker_i.join()
        if join_checker_i.error:
            print('worker output:')
            print(runners[1].get_output()[0])
            raise join_checker_i.error
        join_checker_w.join()
        if join_checker_w.error:
            print('initalizer output:')
            print(runners[0].get_output()[0])
            raise join_checker_w.error

        # Start sender2 (1000, 2000]
        sender2 = Sender(host,
                         input_ports[0],
                         reader2,
                         batch_size=10,
                         interval=0.05)
        sender2.start()

        # wait until sender2 completes (~5 seconds)
        sender2.join(30)
        if sender2.error:
            raise sender2.error
        if sender2.is_alive():
            sender2.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Stop metrics
        metrics.stop()

        # parse metrics data and validate worker has shifted from 1 to 2
        # workers
        mp = MetricsParser()
        mp.load_string_list(metrics.data)
        mp.parse()
        # Now confirm that there are computations in worker1's metrics
        app_key = mp.data.keys()[0]  # 'metrics:Sequence Window Printer'
        worker_metrics = [
            v for v in mp.data[app_key].get('worker1', []) if v[0] == 'metrics'
        ]
        # Verify there is at least one entry for a computation with a nonzero
        # total value
        print('worker_metrics', worker_metrics)
        filtered = filter(
            lambda v:
            (v[1]['metric_category'] == 'computation' and v[1]['total'] > 0),
            worker_metrics)
        print('filtered', filtered)
        assert (len(filtered) > 0)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[-1].get_output()[0]
            print '---'
            print runners[-2].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

    finally:
        for r in runners:
            r.stop()