Exemple #1
0
def _test_log_rotation_external_trigger_recovery(command):
    """Exercise externally triggered log rotation followed by a worker crash.

    Starts a sink, a metrics receiver and ``workers`` application runners,
    streams ``sequence_generator(expect)`` into the first input port, asks
    ``worker1`` to rotate its log with the ``external_sender`` tool, kills
    and respawns the last runner, and finally validates:

    * the sink data, using the ``validator`` command line tool,
    * that the output of ``runners[1]`` matches ``log_rotated_pattern``,
    * that the output of the respawned runner contains the
      "Replayed <n> entries from recovery log file" message.

    Args:
        command: Base application command line. ``--log-rotation`` and
            ``--stop-pause`` options are appended before it is run.

    Raises:
        AssertionError: If the external trigger fails, the validator
            rejects the output, or the rotation/recovery messages are
            missing from the runner output.
        TimeoutError: If the sender is still alive after 30 seconds.
        Exception: Errors collected by the helper threads (ready checker,
            rotation checker, sender, sink awaiter) are re-raised as-is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values, i.e. '[1994,1996,1998,2000]' and
    # '[1993,1995,1997,1999]' for expect == 2000.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        # One port per source, three for the initializer (control, data,
        # external) and a pair of ports for every additional worker.
        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources], ports[sources:sources + 3],
                          zip(ports[-(2 * (workers - 1)):][::2],
                              ports[-(2 * (workers - 1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers,
                      worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        success, stdout, retcode, cmd = ex_validate(cmd_external_trigger)
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if not success:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values to arrive at the sink before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            # Dump every runner's output to help diagnose the failure
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Two spaces after the colon reproduce the output of the original
        # ``print 'sink.data size: ', len(sink.data)`` statement.
        print('sink.data size:  {}'.format(len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        if not success:
            print(runners[0].name)
            print(runners[0].get_output())
            print('---')
            print(runners[1].name)
            print(runners[1].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate that worker 1 (the rotation target) underwent log rotation
        r = runners[1]
        stdout = r.get_output()
        if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (1, r.name, log_rotated_pattern, stdout))

        # Validate the respawned worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed recovery as expected. Worker '
                                 'output is '
                                 'included below.\nSTDOUT\n---\n%s' %
                                 (len(runners) - 1, runners[-1].name, stdout))
    finally:
        # Always stop the runners and remove the resilience directory,
        # whether the test passed or failed.
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Exemple #2
0
def _test_recovery(command):
    """Crash and respawn a worker mid-stream and verify it recovers.

    Starts a sink, a metrics receiver and ``workers`` application runners,
    streams ``sequence_generator(expect)`` into the first input port,
    kills the last runner shortly after sending starts, respawns it, and
    then validates the sink data with the ``validator`` tool and checks
    the respawned runner's stdout for the
    "Replayed <n> entries from recovery log file" message.

    Args:
        command: Application command line passed to ``start_runners``.

    Raises:
        AssertionError: If the validator rejects the output or the
            recovery message is missing from the respawned runner's
            stdout.
        TimeoutError: If the sender is still alive after 5 seconds.
        Exception: Errors collected by the helper threads (ready checker,
            sender, sink awaiter) are re-raised as-is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # simulate worker crash by doing a non-graceful shutdown
        runners[-1].kill()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values to arrive at the sink before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Two spaces after the colon reproduce the output of the original
        # ``print 'sink.data size: ', len(sink.data)`` statement.
        print('sink.data size:  {}'.format(len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if not success:
            # get_output() returns (stdout, stderr) here; show stdout of
            # the two most recent runners.
            print(runners[-1].get_output()[0])
            print('---')
            print(runners[-2].get_output()[0])
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate the respawned worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))

    finally:
        # Always stop the runners, whether the test passed or failed.
        # NOTE(review): res_dir is not cleaned up here - confirm whether
        # that is intentional.
        for r in runners:
            r.stop()
Exemple #3
0
def _run(command, runner_data=None):
    """Run a recovery scenario inside a ``Cluster`` context manager.

    Starts a two-worker cluster, streams ``sequence_generator(expect)``
    through it, kills and restarts the last worker while data is flowing,
    waits for the expected final values at the sink, then validates the
    sink data with the ``validator`` tool and checks the last runner's
    output for the "Replayed <n> entries from recovery log file" message.

    Args:
        command: Application command line handed to ``Cluster``.
        runner_data: Optional list passed through to ``Cluster`` as its
            ``runner_data`` argument. A fresh list is created per call
            when omitted, so state is never shared between invocations.

    Raises:
        AssertionError: If the validator rejects the output or the
            recovery message is missing from the worker output.
    """
    # Avoid the shared-mutable-default pitfall: every call without an
    # explicit list gets its own.
    if runner_data is None:
        runner_data = []

    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 2000
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:
        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=100,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        # restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(cluster.res_dir, 'received.txt')
        cluster.sinks[0].save(out_file, mode='giles')

        # Validate captured output
        logging.debug("Validating output")
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if not res.success:
            raise AssertionError('Output validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        logging.debug("Validating recovery from worker stdout")
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        if re.search(pattern, cluster.runners[-1].get_output()) is None:
            raise AssertionError("Worker does not appear to have performed "
                                 "recovery as expected.")
def _run(command, persistent_data):
    """Kill and restart a worker, then verify it restarted a listener.

    Starts a two-worker cluster, streams ``sequence_generator(expect)``
    one message per batch, kills and restarts the last worker while data
    is flowing, waits for the expected final values at the sink and stops
    the cluster. After the cluster context exits, the stdout stored at
    ``persistent_data['runner_data'][2]`` is searched for the
    "Restarting a listener ..." message.

    Args:
        command: Application command line handed to ``Cluster``.
        persistent_data: Mapping passed through to ``Cluster``. After the
            run it must hold a ``'runner_data'`` sequence whose third
            entry exposes a ``stdout`` attribute.

    Raises:
        AssertionError: If the restart message is missing from the
            examined runner's stdout.
    """
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 200
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 persistent_data=persistent_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=1,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.5)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        # restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

    logging.debug("validating restarted worker stdout")
    # Validate the restarted worker reported restarting a listener.
    # NOTE(review): the dots in the pattern are unescaped regex wildcards,
    # kept as in the original - confirm whether a literal "..." is meant.
    pattern_restarting = "Restarting a listener ..."
    # Bind the output first: the failure message below needs it, and the
    # original referenced an undefined ``stdout`` name at that point.
    stdout = persistent_data['runner_data'][2].stdout
    # Explicit raise instead of ``assert`` so the check also runs when
    # Python is started with -O.
    if re.search(pattern_restarting, stdout) is None:
        raise AssertionError('Worker does not appear to have reconnected '
                             'as expected. Worker output is '
                             'included below.\nSTDOUT\n---\n%s' % stdout)
Exemple #5
0
def _test_log_rotation_file_size_trigger_recovery(command):
    """Exercise size-triggered log rotation followed by a worker restart.

    Starts a sink, a metrics receiver and ``workers`` application runners
    with log rotation enabled, streams ``sequence_generator(expect)`` into
    the first input port, waits until ``runners[1]`` reports a log
    rotation, stops and respawns the last runner, and finally validates:

    * the sink data, using the ``validator`` command line tool,
    * that the stdout of ``runners[1]`` matches ``log_rotated_pattern``,
    * that the stdout of the respawned runner contains the
      "Replayed <n> entries from recovery log file" message.

    Args:
        command: Base application command line. ``--log-rotation`` and
            ``--stop-pause`` options are appended before it is run.

    Raises:
        AssertionError: If the validator rejects the output, or the
            rotation/recovery messages are missing from runner stdout.
        TimeoutError: If the sender is still alive after 30 seconds.
        Exception: Errors collected by the helper threads (ready checker,
            rotation checker, sender, sink awaiter) are re-raised as-is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    event_log_file_size = 50000
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    # Extra command-line block and selector handed to start_runners.
    # NOTE(review): presumably alt_func picks the runners (by index) that
    # receive alt_block, i.e. every runner except the initializer -
    # confirm against start_runners.
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers,
                      alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker
        # NOTE(review): this uses stop() rather than kill(); confirm the
        # recovery files survive this kind of shutdown.
        runners[-1].stop()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the expected final values to arrive at the sink before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            # Dump every runner's stdout to help diagnose the failure
            for r in runners:
                print(r.name)
                print(r.get_output()[0])
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Two spaces after the colon reproduce the output of the original
        # ``print 'sink.data size: ', len(sink.data)`` statement.
        print('sink.data size:  {}'.format(len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if not success:
            print(runners[-1].name)
            print(runners[-1].get_output()[0])
            print('---')
            print(runners[-2].name)
            print(runners[-2].get_output()[0])
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker underwent log rotation, but not initializer
        i, r = 1, runners[1]
        stdout, stderr = r.get_output()
        if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (i, r.name, log_rotated_pattern, stdout))

        # Validate the respawned worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))
    finally:
        # Always stop the runners and clean the resilience directory,
        # whether the test passed or failed.
        for r in runners:
            r.stop()
        clean_up_resilience_path(res_dir)
def _test_restart(command):
    """Kill and respawn a worker, then verify it restarted a listener.

    Starts a sink, a metrics receiver and ``workers`` application runners,
    streams ``sequence_generator(expect)`` one message per batch with a
    reconnecting sender, kills and respawns the last runner, waits for the
    expected final values at the sink, and checks the respawned runner's
    output for the "Restarting a listener ..." message.

    Args:
        command: Application command line passed to ``start_runners``.

    Raises:
        AssertionError: If the restart message is missing from the
            respawned runner's output.
        TimeoutError: If the sender is still alive after 30 seconds.
        Exception: Errors collected by the helper threads (ready checker,
            sender, sink awaiter) are re-raised as-is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 200
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        # One port per source, three for the initializer (control, data,
        # external) and a pair of ports for every additional worker.
        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources],
                          ports[sources:sources + 3],
                          zip(ports[-(2 * (workers - 1)):][::2],
                              ports[-(2 * (workers - 1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in
                           input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=1,
                        interval=0.05, reconnect=True)
        sender.start()
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the last sent value expected at the worker
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            # Dump runner output and sink contents to help diagnose the
            # failure
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            print('sink data')
            print(sink.data)
            print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()

        # Validate the respawned worker reported restarting a listener.
        # NOTE(review): the dots in the pattern are unescaped regex
        # wildcards, kept as in the original - confirm whether a literal
        # "..." is meant.
        pattern_restarting = "Restarting a listener ..."
        stdout = runners[-1].get_output()
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if re.search(pattern_restarting, stdout) is None:
            raise AssertionError('Worker does not appear to have reconnected '
                                 'as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s'
                                 % stdout)
    finally:
        # Always stop the runners and remove the resilience directory,
        # whether the test passed or failed.
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Exemple #7
0
def _test_autoscale_grow(command):
    """Grow a one-worker cluster to two workers and validate the result.

    Starts a sink, a metrics receiver and a single application runner,
    sends the first half of the sequence, adds a second runner with
    ``add_runner`` and waits until both runners log the expected join
    messages, then sends the second half. Afterwards it checks that the
    parsed metrics contain at least one ``worker1`` computation entry
    with a nonzero total, and validates the sink data with the
    ``validator`` tool.

    Args:
        command: Application command line passed to ``start_runners``
            and ``add_runner``.

    Raises:
        AssertionError: If no qualifying ``worker1`` computation metric
            is found, or the validator rejects the output.
        TimeoutError: If either sender is still alive after 30 seconds.
        Exception: Errors collected by the helper threads (ready checker,
            join checkers, senders, sink awaiter) are re-raised as-is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 1
    res_dir = '/tmp/res-data'
    expect = 2000
    # Final values to wait for at the sink: the last four even values and
    # the last four odd values of the sequence.
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    # Each awaited value is prefixed with its length as a 4-byte
    # big-endian unsigned integer.
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Messages expected in the initializer's output during the join
    patterns_i = [
        re.escape(r'***Worker worker1 attempting to join the '
                  r'cluster. Sent necessary information.***'),
        re.escape(r'Migrating partitions to worker1'),
        re.escape(r'--All new workers have acked migration '
                  r'batch complete'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
    # Messages expected in the joining worker's output
    patterns_w = [
        re.escape(r'***Successfully joined cluster!***'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader1 = Reader(sequence_generator(expect - 1000))
        reader2 = Reader(sequence_generator(expect, 1000))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender1 (0,1000]
        sender1 = Sender(host,
                         input_ports[0],
                         reader1,
                         batch_size=10,
                         interval=0.05)
        sender1.start()

        # wait until sender1 completes (~5 seconds)
        sender1.join(30)
        if sender1.error:
            raise sender1.error
        if sender1.is_alive():
            sender1.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # create a new worker and have it join
        add_runner(runners, command, host, inputs, outputs, metrics_port,
                   control_port, external_port, data_port, res_dir, workers)

        # Wait for the initializer and the new worker to report that the
        # join completed
        join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30)
        join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30)
        join_checker_i.start()
        join_checker_w.start()
        join_checker_i.join()
        if join_checker_i.error:
            # Show the other side of the join to help diagnose the failure
            print('worker output:')
            print(runners[1].get_output()[0])
            raise join_checker_i.error
        join_checker_w.join()
        if join_checker_w.error:
            # Show the other side of the join to help diagnose the failure
            print('initializer output:')
            print(runners[0].get_output()[0])
            raise join_checker_w.error

        # Start sender2 (1000, 2000]
        sender2 = Sender(host,
                         input_ports[0],
                         reader2,
                         batch_size=10,
                         interval=0.05)
        sender2.start()

        # wait until sender2 completes (~5 seconds)
        sender2.join(30)
        if sender2.error:
            raise sender2.error
        if sender2.is_alive():
            sender2.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Two spaces after the colon reproduce the output of the original
        # ``print 'sink.data size: ', len(sink.data)`` statement.
        print('sink.data size:  {}'.format(len(sink.data)))

        # Stop metrics
        metrics.stop()

        # parse metrics data and validate worker has shifted from 1 to 2
        # workers
        mp = MetricsParser()
        mp.load_string_list(metrics.data)
        mp.parse()
        # Now confirm that there are computations in worker1's metrics
        app_key = mp.data.keys()[0]  # 'metrics:Sequence Window Printer'
        worker_metrics = [
            v for v in mp.data[app_key].get('worker1', []) if v[0] == 'metrics'
        ]
        # Verify there is at least one entry for a computation with a nonzero
        # total value
        print('worker_metrics', worker_metrics)
        filtered = [
            v for v in worker_metrics
            if v[1]['metric_category'] == 'computation' and v[1]['total'] > 0
        ]
        print('filtered', filtered)
        # Explicit raise instead of ``assert`` so the check also runs when
        # Python is started with -O.
        if not filtered:
            raise AssertionError('No computation metrics with a nonzero '
                                 'total were found for worker1.')

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        if not success:
            print(runners[-1].get_output()[0])
            print('---')
            print(runners[-2].get_output()[0])
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

    finally:
        # Always stop the runners, whether the test passed or failed.
        for r in runners:
            r.stop()
Exemple #8
0
def _test_log_rotation_file_size_trigger_no_recovery(command):
    """Run a 2-worker cluster with size-triggered log rotation and validate it.

    The given base command is extended with ``--log-rotation``,
    ``--event-log-file-size`` and ``--stop-pause``. A sequence of ``expect``
    values is sent through the application, the sink output is checked with
    the external ``validator`` tool, and every runner's output is searched
    for ``log_rotated_pattern``.

    Args:
        command: Base command line used to start each runner; the log
            rotation options are appended to it.

    Raises:
        AssertionError: If the validator reports failure, or if any runner's
            output does not match ``log_rotated_pattern``.
        TimeoutError: If the sender is still alive after 30 seconds.
        Exception: Any error recorded by the runner-ready checker, the
            sender or the sink-await stopper is re-raised as is.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000

    # Payloads awaited at the sink: '[1994,1996,1998,2000]' and
    # '[1993,1995,1997,1999]' for expect == 2000, each prefixed with its
    # length as a 4-byte big-endian unsigned integer.
    last_value_0 = '[{}]'.format(','.join((str(expect-v)
                                         for v in range(6,-2,-2))))
    last_value_1 = '[{}]'.format(','.join((str(expect-1-v)
                                         for v in range(6,-2,-2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --event-log-file-size {} \
        --stop-pause {}
    '''.format(command, event_log_file_size, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics receiver and reader
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        # Only the port is needed by start_runners below
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        # One port per source, followed by a group of three ports per worker
        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        input_ports = ports[:sources]
        remaining_ports = ports[sources:]
        worker_ports = [remaining_ports[i:i+3]
                        for i in xrange(0, len(remaining_ports), 3)]
        inputs = ','.join(['{}:{}'.format(host, p) for p in
                           input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the sink to receive the awaited final values before
        # stopping runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            # Dump every runner's output to help diagnose the failure
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size:  {}'.format(len(sink.data)))

        # Stop metrics (it was started above and is no longer needed)
        metrics.stop()

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file,
                                expect=expect))
        res = run_shell_cmd(cmd_validate)
        # Explicit check rather than `assert`, so the validation still runs
        # when Python is started with -O
        if not res.success:
            print(runners[0].get_output())
            print('---')
            print(runners[1].get_output())
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        for r in runners:
            stdout = r.get_output()
            if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected.'
                                     ' The pattern %r '
                                     'is missing from the Worker output '
                                     'included below.\nSTDOUT\n---\n%s\n'
                                     '---\n'
                                     % (r.name, log_rotated_pattern, stdout))
    finally:
        # Always stop the runners and remove the resilience files, even when
        # the test failed part-way through
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)