import os
import re
import struct
import time

# Harness helpers (Sink, Metrics, Reader, Sender, SinkAwaitValue,
# RunnerReadyChecker, setup_resilience_path, clean_up_resilience_path,
# get_port_values, start_runners, ex_validate, sequence_generator,
# STOP_THE_WORLD_PAUSE, AWAIT_TIMEOUT, log_rotated_pattern, TimeoutError)
# are assumed to be provided by the surrounding integration-test harness.


def _test_log_rotation_external_trigger_no_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
    --log-rotation \
    --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)
        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # Start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.5)

        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))
        success, stdout, retcode, cmd = ex_validate(cmd_external_trigger)
        try:
            assert success
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Wait until sender completes (~1 second of send time; 30s timeout)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            raise stopper.error

        # Stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode:
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert success
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()[0]
            print '---'
            print runners[1].name
            print runners[1].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate all workers underwent log rotation
        for r in runners[1:]:
            stdout, stderr = r.get_output()
            try:
                assert re.search(log_rotated_pattern, stdout,
                                 re.M | re.S) is not None
            except AssertionError:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected. '
                                     'The pattern %r is missing from the '
                                     'Worker output included below.\n'
                                     'STDOUT\n---\n%s\n---\n'
                                     % (r.name, log_rotated_pattern, stdout))
    finally:
        for r in runners:
            r.stop()
        clean_up_resilience_path(res_dir)
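
# All three tests in this module rebuild the same length-prefixed "await"
# frames inline. A minimal sketch of a shared helper that could replace that
# duplication (the name `expected_sink_frames` and its keyword parameters are
# hypothetical, not part of the existing harness; it relies on the
# module-level struct import):
def expected_sink_frames(expect, parties=2, window=4, step=2):
    """Build the length-prefixed frames the sink is awaited on.

    Each party (worker stream) is expected to end on a JSON-style list of
    its last `window` values, spaced `step` apart. For expect=2000 and two
    parties this yields '[1994,1996,1998,2000]' and '[1993,1995,1997,1999]',
    each prefixed with a big-endian 32-bit length, matching the inline
    await_values tuples above.
    """
    frames = []
    for offset in range(parties):
        last = '[{}]'.format(','.join(
            str(expect - offset - v)
            for v in range(step * (window - 1), -step, -step)))
        frames.append(struct.pack('>I', len(last)) + last)
    return tuple(frames)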
def _test_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)
        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # Start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # Stop worker
        runners[-1].stop()

        # Restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # Wait until sender completes (~1 second of send time; 5s timeout)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # Stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode:
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert success
        except AssertionError:
            print runners[-1].get_output()[0]
            print '---'
            print runners[-2].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert re.search(pattern, stdout) is not None
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))
    finally:
        for r in runners:
            r.stop()
        clean_up_resilience_path(res_dir)
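
# The stop()/respawn()/start() sequence above is what exercises recovery:
# a worker is killed mid-stream and a fresh process is pointed at the same
# resilience directory. A minimal sketch of that pattern as a reusable
# helper (the name `crash_and_restart` is hypothetical; the Runner methods
# it calls -- stop, respawn, start -- are the ones these tests already use):
def crash_and_restart(runners, index=-1, downtime=0.0):
    """Stop the runner at `index`, optionally wait, then respawn it.

    The respawned runner is appended to `runners`, so subsequent cleanup
    and log inspection see it as the most recent worker (runners[-1]).
    """
    runners[index].stop()
    if downtime:
        time.sleep(downtime)
    runners.append(runners[index].respawn())
    runners[-1].start()
    return runners[-1]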
def _test_restart(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)
        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # Start sender
        sender = Sender(host, input_ports[0], reader, batch_size=1,
                        interval=0.05, reconnect=True)
        sender.start()
        time.sleep(0.2)

        # Stop worker
        runners[-1].stop()

        # Restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # Wait until sender completes (~10 seconds of send time; 30s timeout)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the last sent values to arrive at the sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            print 'sink data'
            print sink.data
            print '---'
            raise stopper.error

        # Stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()

        # Validate worker actually restarted its listener and reconnected
        pattern_restarting = "Restarting a listener ..."
        stdout, stderr = runners[-1].get_output()
        try:
            assert re.search(pattern_restarting, stdout) is not None
        except AssertionError:
            raise AssertionError('Worker does not appear to have reconnected '
                                 'as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))
    finally:
        for r in runners:
            r.stop()
        clean_up_resilience_path(res_dir)
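
# A minimal driver sketch: the underscore-prefixed helpers above are
# parameterized on the worker launch command, so test-discovery wrappers
# (or a manual run) supply the binary to exercise. The binary path below
# is a placeholder, not an artifact of this repo:
if __name__ == '__main__':
    app_cmd = './path/to/test_app'  # placeholder worker binary
    _test_restart(app_cmd)
    _test_recovery(app_cmd)
    _test_log_rotation_external_trigger_no_recovery(app_cmd)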