def _test_log_rotation_file_size_trigger_recovery(command):
    """Exercise size-triggered log rotation followed by a crash recovery.

    Starts a sink, a metrics receiver and a cluster of runners with
    ``--log-rotation`` enabled, feeds ``Reader(sequence_generator(expect))``
    through a Sender, waits for ``runners[1]`` to match
    ``log_rotated_patterns``, kills and respawns the last runner, and then
    validates the sink output, the rotation pattern and the recovery
    pattern.

    Args:
        command: Base application command; the log-rotation flags are
            appended to it before the runners are started.

    Raises:
        AssertionError: If the validator command fails, or the rotation or
            recovery pattern is missing from the runner output.
        TimeoutError: If the sender is still alive after ``join(30)``.
        Exception: Any ``error`` recorded by the ready checker, rotation
            checker, sender or sink stopper is re-raised unchanged.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000

    # Each awaited value is a 4-byte big-endian length header followed by
    # the bracketed, comma-separated payload whose length it states.
    last_value_0 = '[{}]'.format(','.join(
        str(expect - v) for v in range(6, -2, -2)))
    last_value_1 = '[{}]'.format(','.join(
        str(expect - 1 - v) for v in range(6, -2, -2)))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    # The backslash-newline pairs are consumed by the string literal, so the
    # extra flags stay on the same line as the base command.
    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    # NOTE(review): presumably start_runners applies alt_block only to
    # runners whose index satisfies alt_func (i.e. not the initializer) --
    # confirm against start_runners.
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        # One port per source, then three ports for each worker.
        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        input_ports = ports[:sources]
        remaining = ports[sources:]
        worker_ports = [remaining[i:i + 3]
                        for i in range(0, len(remaining), 3)]
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports,
                      alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use the sink contents to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Same text the two-item print statement produced.
        print('%s %s' % ('sink.data size: ', len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output. Explicit raises are used instead of
        # assert so the checks still run under ``python -O``.
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        if not res.success:
            for r in (runners[-1], runners[-2]):
                print(r.name)
                print(r.get_output())
                print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker underwent log rotation, but not initializer
        i, r = 1, runners[1]
        stdout = r.get_output()
        if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing form the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n'
                                 % (i, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        # NOTE(review): sink is only stopped on the success path and metrics
        # is never stopped in this function -- confirm that is intended.
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
def _test_log_rotation_external_trigger_no_recovery(command):
    """Exercise externally triggered log rotation without a worker restart.

    Starts a sink, a metrics receiver and a cluster of runners with
    ``--log-rotation`` enabled, feeds ``Reader(sequence_generator(expect))``
    through a Sender, runs the ``external_sender ... -t rotate-log``
    command, and then validates the sink output and that every runner
    after the first matches ``log_rotated_pattern``.

    Args:
        command: Base application command; the log-rotation flags are
            appended to it before the runners are started.

    Raises:
        AssertionError: If the external trigger or the validator command
            fails, or a runner's output lacks the rotation pattern.
        TimeoutError: If the sender is still alive after ``join(30)``.
        Exception: Any ``error`` recorded by the ready checker, sender or
            sink stopper is re-raised unchanged.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000

    # Each awaited value is a 4-byte big-endian length header followed by
    # the bracketed, comma-separated payload whose length it states.
    last_value_0 = '[{}]'.format(','.join(
        str(expect - v) for v in range(6, -2, -2)))
    last_value_1 = '[{}]'.format(','.join(
        str(expect - 1 - v) for v in range(6, -2, -2)))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    # The backslash-newline pairs are consumed by the string literal, so the
    # extra flags stay on the same line as the base command.
    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        # One port per source, then three ports for each worker.
        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        input_ports = ports[:sources]
        remaining = ports[sources:]
        worker_ports = [remaining[i:i + 3]
                        for i in range(0, len(remaining), 3)]
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.5)

        # Trigger log rotation with external message
        # NOTE(review): `external_port` is never assigned in this function;
        # unless a module-level name exists this line raises NameError.
        # Presumably it should be taken from worker_ports -- confirm which
        # slot start_runners treats as the external port before changing it.
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))
        res = run_shell_cmd(cmd_external_trigger)
        # Explicit raises are used instead of assert so the checks still run
        # under ``python -O``.
        if not res.success:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use the sink contents to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output())
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Same text the two-item print statement produced.
        print('%s %s' % ('sink.data size: ', len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        if not res.success:
            for r in (runners[0], runners[1]):
                print(r.name)
                print(r.get_output())
                print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate every runner after the first underwent log rotation
        for r in runners[1:]:
            stdout = r.get_output()
            if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected.'
                                     ' The pattern %r '
                                     'is missing form the Worker output '
                                     'included below.\nSTDOUT\n---\n%s\n'
                                     '---\n'
                                     % (r.name, log_rotated_pattern, stdout))
    finally:
        # NOTE(review): sink is only stopped on the success path and metrics
        # is never stopped in this function -- confirm that is intended.
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
def _test_recovery(command):
    """Exercise worker recovery after a stop and respawn mid-stream.

    Starts a sink, a metrics receiver and a cluster of runners, feeds
    ``Reader(sequence_generator(expect))`` through a Sender, stops and
    respawns the last runner 0.2 seconds after sending begins, and then
    validates the sink output and that the respawned runner's stdout
    matches the recovery pattern.

    Args:
        command: Application command passed to ``start_runners``.

    Raises:
        AssertionError: If the validator command fails or the recovery
            pattern is missing from the respawned runner's stdout.
        TimeoutError: If the sender is still alive after ``join(5)``.
        Exception: Any ``error`` recorded by the ready checker, sender or
            sink stopper is re-raised unchanged.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000

    # Each awaited value is a 4-byte big-endian length header followed by
    # the bracketed, comma-separated payload whose length it states.
    last_value_0 = '[{}]'.format(','.join(
        str(expect - v) for v in range(6, -2, -2)))
    last_value_1 = '[{}]'.format(','.join(
        str(expect - 1 - v) for v in range(6, -2, -2)))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # stop worker
        runners[-1].stop()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use the sink contents to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Same text the two-item print statement produced.
        print('%s %s' % ('sink.data size: ', len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output. Explicit raises are used instead of
        # assert so the checks still run under ``python -O``.
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, _retcode, _cmd = ex_validate(cmd_validate)
        if not success:
            for r in (runners[-1], runners[-2]):
                print(r.get_output()[0])
                print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))
    finally:
        for r in runners:
            r.stop()
        # NOTE(review): other tests in this file call clean_resilience_path;
        # confirm clean_up_resilience_path is still defined.
        clean_up_resilience_path(res_dir)
def _test_log_rotation_external_trigger_recovery(command):
    """Exercise externally triggered log rotation followed by recovery.

    Starts a sink, a metrics receiver and a cluster of runners with
    ``--log-rotation`` enabled, feeds ``Reader(sequence_generator(expect))``
    through a Sender, runs the ``external_sender ... -t rotate-log``
    command, waits for ``runners[1]`` to match ``log_rotated_patterns``,
    kills and respawns the last runner, and then validates the sink
    output, the rotation pattern and the recovery pattern.

    Args:
        command: Base application command; the log-rotation flags are
            appended to it before the runners are started.

    Raises:
        AssertionError: If the external trigger or the validator command
            fails, or the rotation or recovery pattern is missing from the
            runner stdout.
        TimeoutError: If the sender is still alive after ``join(30)``.
        Exception: Any ``error`` recorded by the ready checker, rotation
            checker, sender or sink stopper is re-raised unchanged.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000

    # Each awaited value is a 4-byte big-endian length header followed by
    # the bracketed, comma-separated payload whose length it states.
    last_value_0 = '[{}]'.format(','.join(
        str(expect - v) for v in range(6, -2, -2)))
    last_value_1 = '[{}]'.format(','.join(
        str(expect - 1 - v) for v in range(6, -2, -2)))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    # The backslash-newline pairs are consumed by the string literal, so the
    # extra flags stay on the same line as the base command.
    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.5)

        # Trigger log rotation with external message. Explicit raises are
        # used instead of assert so the checks still run under ``python -O``.
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))
        success, stdout, _retcode, _cmd = ex_validate(cmd_external_trigger)
        if not success:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        # restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use the sink contents to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print(r.name)
                print(r.get_output()[0])
                print('---')
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Same text the two-item print statement produced.
        print('%s %s' % ('sink.data size: ', len(sink.data)))

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, _retcode, _cmd = ex_validate(cmd_validate)
        if not success:
            for r in (runners[0], runners[1]):
                print(r.name)
                print(r.get_output()[0])
                print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate that runners[1] underwent log rotation (only this runner
        # is checked here)
        r = runners[1]
        stdout, stderr = r.get_output()
        if re.search(log_rotated_pattern, stdout, re.M | re.S) is None:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing form the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n'
                                 % (1, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        if re.search(pattern, stdout) is None:
            raise AssertionError(
                'Worker %d.%r does not appear to have '
                'performed recovery as expected. Worker '
                'output is '
                'included below.\nSTDOUT\n---\n%s\n---\n'
                'STDERR\n---\n%s'
                % (len(runners) - 1, runners[-1].name, stdout, stderr))
    finally:
        # NOTE(review): the directory prepared by setup_resilience_path is
        # not cleaned up here -- confirm whether that is intended.
        for r in runners:
            r.stop()
def _test_autoscale_grow(command):
    """Exercise growing a running cluster by one worker mid-stream.

    Starts a sink, a metrics receiver and the initial runners, sends the
    first reader's data, adds a runner with ``add_runner``, waits for the
    join patterns on ``runners[0]`` and ``runners[1]``, sends the second
    reader's data, and then validates the metrics reported under
    ``'worker1'`` and the sink output.

    Args:
        command: Application command passed to ``start_runners`` and
            ``add_runner``.

    Raises:
        AssertionError: If no ``'worker1'`` computation metric has a
            nonzero total, or the validator command fails.
        TimeoutError: If either sender is still alive after ``join(30)``.
        Exception: Any ``error`` recorded by the ready checker, join
            checkers, senders or sink stopper is re-raised unchanged.
    """
    host = '127.0.0.1'
    sources = 1
    workers = 1
    res_dir = '/tmp/res-data'
    expect = 2000

    # Each awaited value is a 4-byte big-endian length header followed by
    # the bracketed, comma-separated payload whose length it states.
    last_value_0 = '[{}]'.format(','.join(
        str(expect - v) for v in range(6, -2, -2)))
    last_value_1 = '[{}]'.format(','.join(
        str(expect - 1 - v) for v in range(6, -2, -2)))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Literal log lines, escaped so they can be used as regex patterns.
    patterns_i = [
        re.escape(r'***Worker worker1 attempting to join the '
                  r'cluster. Sent necessary information.***'),
        re.escape(r'Migrating partitions to worker1'),
        re.escape(r'--All new workers have acked migration '
                  r'batch complete'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
    patterns_w = [
        re.escape(r'***Successfully joined cluster!***'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader1 = Reader(sequence_generator(expect - 1000))
        reader2 = Reader(sequence_generator(expect, 1000))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        _, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (
            get_port_values(host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender1 (0,1000]
        sender1 = Sender(host, input_ports[0], reader1, batch_size=10,
                         interval=0.05)
        sender1.start()

        # wait until sender1 completes (~5 seconds)
        sender1.join(30)
        if sender1.error:
            raise sender1.error
        if sender1.is_alive():
            sender1.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # create a new worker and have it join
        add_runner(runners, command, host, inputs, outputs, metrics_port,
                   control_port, external_port, data_port, res_dir, workers)

        # Wait for both runners to log their join patterns
        join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30)
        join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30)
        join_checker_i.start()
        join_checker_w.start()
        join_checker_i.join()
        if join_checker_i.error:
            print('worker output:')
            print(runners[1].get_output()[0])
            raise join_checker_i.error
        join_checker_w.join()
        if join_checker_w.error:
            print('initalizer output:')
            print(runners[0].get_output()[0])
            raise join_checker_w.error

        # Start sender2 (1000, 2000]
        sender2 = Sender(host, input_ports[0], reader2, batch_size=10,
                         interval=0.05)
        sender2.start()

        # wait until sender2 completes (~5 seconds)
        sender2.join(30)
        if sender2.error:
            raise sender2.error
        if sender2.is_alive():
            sender2.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        # Same text the two-item print statement produced.
        print('%s %s' % ('sink.data size: ', len(sink.data)))

        # Stop metrics
        metrics.stop()

        # parse metrics data and validate worker has shifted from 1 to 2
        # workers
        mp = MetricsParser()
        mp.load_string_list(metrics.data)
        mp.parse()

        # Now confirm that there are computations in worker1's metrics
        # list() keeps the indexing valid whether keys() is a list or a view.
        app_key = list(mp.data.keys())[0]  # 'metrics:Sequence Window Printer'
        worker_metrics = [
            v for v in mp.data[app_key].get('worker1', [])
            if v[0] == 'metrics'
        ]

        # Verify there is at least one entry for a computation with a nonzero
        # total value. A list comprehension is used so the emptiness check
        # does not depend on filter() returning a list.
        print('worker_metrics', worker_metrics)
        filtered = [
            v for v in worker_metrics
            if v[1]['metric_category'] == 'computation' and v[1]['total'] > 0
        ]
        print('filtered', filtered)
        # Explicit raises are used instead of assert so the checks still run
        # under ``python -O``.
        if not filtered:
            raise AssertionError('No computation metric with a nonzero '
                                 'total was reported for worker1')

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, _retcode, _cmd = ex_validate(cmd_validate)
        if not success:
            for r in (runners[-1], runners[-2]):
                print(r.get_output()[0])
                print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))
    finally:
        # NOTE(review): the directory prepared by setup_resilience_path is
        # not cleaned up here -- confirm whether that is intended.
        for r in runners:
            r.stop()