def _test_log_rotation_external_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
    --log-rotation \
    --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources],
                          ports[sources:sources + 3],
                          zip(ports[-(2 * (workers - 1)):][::2],
                              ports[-(2 * (workers - 1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.5)

        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'
                                .format(host, external_port))

        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert(res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert(res.success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()
            print '---'
            print runners[1].name
            print runners[1].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        r = runners[1]
        stdout = r.get_output()
        try:
            assert(re.search(log_rotated_pattern, stdout, re.M | re.S)
                   is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected. '
                                 'The pattern %r is missing from the Worker '
                                 'output included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' % (1, r.name, log_rotated_pattern,
                                            stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert(re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed recovery as expected. Worker '
                                 'output is included below.'
                                 '\nSTDOUT\n---\n%s'
                                 % (len(runners) - 1, runners[-1].name,
                                    stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
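
# A minimal sketch of how a concrete test case might invoke the helper above.
# The command string is a placeholder assumption for illustration; the real
# test suite builds it from the application binary and options under test.
def test_log_rotation_external_trigger_recovery_example():
    _test_log_rotation_external_trigger_recovery(
        'machida --application-module sequence_window')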
def _autoscale_sequence(command, ops=[], cycles=1, initial=None):
    host = '127.0.0.1'
    sources = 1

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    # (The assumed semantics of lowest_point are sketched in
    # _lowest_point_sketch below.)
    if ops:
        lowest = lowest_point(ops * cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert(initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:
        # Test is only for setup using initial workers
        assert(initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05
    sender_timeout = 30  # Counted from when Sender is stopped
    runner_join_timeout = 30

    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    setup_resilience_path(res_dir)

    steps = []

    runners = []
    try:
        try:
            # Create sink, metrics, reader, sender
            sink = Sink(host)
            metrics = Metrics(host)
            lowercase2 = [a + b for a in lowercase for b in lowercase]
            char_cycle = cycle(lowercase2)
            expected = Counter()

            def count_sent(s):
                expected[s] += 1

            reader = Reader(iter_generator(
                items=char_cycle,
                to_string=lambda s: pack('>2sI', s, 1),
                on_next=count_sent))

            # Start sink and metrics, and get their connection info
            sink.start()
            sink_host, sink_port = sink.get_connection_info()
            outputs = '{}:{}'.format(sink_host, sink_port)

            metrics.start()
            metrics_host, metrics_port = metrics.get_connection_info()
            time.sleep(0.05)

            num_ports = sources + 3 + 3 * workers
            ports = get_port_values(num=num_ports, host=host)
            (input_ports, worker_ports) = (
                ports[:sources],
                [ports[sources:][i:i + 3]
                 for i in xrange(0, len(ports[sources:]), 3)])
            inputs = ','.join(['{}:{}'.format(host, p)
                               for p in input_ports])

            # Start the initial runners
            start_runners(runners, command, host, inputs, outputs,
                          metrics_port, res_dir, workers, worker_ports)

            # Verify cluster is processing messages
            obs = ObservabilityNotifier(
                cluster_status_query,
                (host, worker_ports[0][2]),
                tests=test_cluster_is_processing)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify that `workers` workers are active
            # Create a partial function
            partial_test_worker_count = partial(test_worker_count, workers)
            obs = ObservabilityNotifier(
                cluster_status_query,
                (host, worker_ports[0][2]),
                tests=partial_test_worker_count)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify initializer starts with partitions
            obs = ObservabilityNotifier(
                state_entity_query,
                (host, worker_ports[0][2]),
                test_worker_has_state_entities)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # start sender
            sender = Sender(host, input_ports[0], reader,
                            batch_size=batch_size, interval=interval)
            sender.start()
            # Give the cluster 1 second to build up some state
            time.sleep(1)

            # Perform autoscale cycles
            for cyc in range(cycles):
                for joiners in ops:
                    # Verify cluster is processing before proceeding
                    obs = ObservabilityNotifier(
                        cluster_status_query,
                        (host, worker_ports[0][2]),
                        tests=test_cluster_is_processing,
                        timeout=30)
                    obs.start()
                    obs.join()
                    if obs.error:
                        raise obs.error

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # get partition data before autoscale operation begins
                    addresses = [(r.name, r.external) for r in runners
                                 if r.is_alive()]
                    responses = multi_states_query(addresses)
                    pre_partitions = joined_partition_query_data(responses)
                    steps.append(joiners)
                    joined = []
                    left = []

                    if joiners > 0:  # autoscale: grow
                        # create new workers and have them join
                        new_ports = get_port_values(num=(joiners * 3),
                                                    host=host,
                                                    base_port=25000)
                        joiner_ports = [new_ports[i:i + 3] for i in
                                        xrange(0, len(new_ports), 3)]
                        for i in range(joiners):
                            add_runner(runners, command, host, inputs,
                                       outputs, metrics_port,
                                       worker_ports[0][0], res_dir, joiners,
                                       *joiner_ports[i])
                            joined.append(runners[-1])

                    elif joiners < 0:  # autoscale: shrink
                        # choose the most recent, still-alive runners to leave
                        leavers = abs(joiners)
                        idx = 1
                        while len(left) < leavers and idx < len(runners):
                            if runners[-idx].is_alive():
                                left.append(runners[-idx])
                            idx += 1
                        if len(left) < leavers:
                            raise AutoscaleTestError(
                                "Not enough workers left to shrink! "
                                "{} requested but only {} live "
                                "non-initializer workers found!"
                                .format(joiners, len(left)))

                        # Send the shrink command
                        resp = send_shrink_cmd(*runners[0].external,
                                               names=[r.name for r in left])
                        print("Sent a shrink command for {}".format(
                            [r.name for r in left]))
                        print("Response was: {}".format(resp))

                    else:
                        # Handle the 0 case as a noop
                        continue

                    # Wait until all live workers report 'ready'
                    wait_for_cluster_to_resume_processing(runners)

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # Test: at least some states moved, and no states from
                    # pre are missing from the post
                    # record which workers joined and which left in this step
                    workers = {'joining': [r.name for r in joined],
                               'leaving': [r.name for r in left]}

                    # use a pre_process function to recreate this data for
                    # retriable tests
                    def pre_process():
                        addresses = [(r.name, r.external) for r in runners
                                     if r.is_alive()]
                        responses = multi_states_query(addresses)
                        post_partitions = joined_partition_query_data(
                            responses)
                        return (pre_partitions, post_partitions, workers)

                    # retry the test until it passes or a timeout elapses
                    try_until_timeout(test_migration, pre_process,
                                      timeout=120)

                    # Wait a second before the next operation, allowing some
                    # more data to go through the system
                    time.sleep(1)

            time.sleep(2)

            # Test for crashed workers
            test_crashed_workers(runners)

            # Test is done, so stop sender
            sender.stop()

            # wait until sender sends out its final batch and exits
            sender.join(sender_timeout)
            if sender.error:
                raise sender.error
            if sender.is_alive():
                sender.stop()
                raise TimeoutError('Sender did not complete in the expected '
                                   'period')

            print('Sender sent {} messages'.format(sum(expected.values())))

            # Use Sink value to determine when to stop runners and sink
            pack677 = '>I2sQ'
            await_values = [pack(pack677, calcsize(pack677) - 4, c, v)
                            for c, v in expected.items()]
            stopper = SinkAwaitValue(sink, await_values, 30)
            stopper.start()
            stopper.join()
            if stopper.error:
                print('sink.data', len(sink.data))
                print('await_values', len(await_values))
                raise stopper.error

            # stop application workers
            for r in runners:
                r.stop()

            # Test for crashed workers
            test_crashed_workers(runners)

            # Stop sink
            sink.stop()

            # Stop metrics
            metrics.stop()

            # validate output
            phase_validate_output(runners, sink, expected)

        #except:
        #    # wait for user interaction to continue
        #    if os.environ.get('pause_for_user'):
        #        pause_for_user()
        #    raise

        finally:
            for r in runners:
                r.stop()
            # Wait on runners to finish waiting on their subprocesses to exit
            for r in runners:
                r.join(runner_join_timeout)
            alive = []
            for r in runners:
                if r.is_alive():
                    alive.append(r)
            for r in runners:
                ec = r.poll()
                if ec != 0:
                    print('Worker {!r} exited with return code {}'
                          .format(r.name, ec))
                    print('Its last 5 log lines were:')
                    print('\n'.join(r.get_output().splitlines()[-5:]))
                    print()
            if alive:
                alive_names = ', '.join((r.name for r in alive))
                outputs = runners_output_format(runners)
                for a in alive:
                    a.kill()
            clean_resilience_path(res_dir)
            if alive:
                raise PipelineTestError("Runners [{}] failed to exit cleanly"
                                        " after {} seconds.\n"
                                        "Runner outputs are attached below:"
                                        "\n===\n{}"
                                        .format(alive_names,
                                                runner_join_timeout,
                                                outputs))
    except Exception as err:
        if not hasattr(err, 'as_steps'):
            err.as_steps = steps
        if not hasattr(err, 'runners'):
            err.runners = runners
        raise
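
# The worker-count sizing in _autoscale_sequence and _autoscale_run relies on
# lowest_point(), which is defined elsewhere in the test support code. As a
# sketch of its assumed behaviour: it returns the lowest running total of the
# grow/shrink ops, so starting with abs(lowest) + 1 workers keeps the cluster
# at or above 1 worker throughout the sequence. The helper below is a local
# illustration only, not the shared implementation.
def _lowest_point_sketch(ops):
    total = 0
    lowest = 0
    for op in ops:
        total += op              # running cluster-size delta after each op
        lowest = min(lowest, total)
    return lowest

# Example: _lowest_point_sketch([2, -3, 1]) == -1, so abs(-1) + 1 == 2 initial
# workers are required for the cluster never to drop below 1 worker.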
def _test_restart(command):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    runner_data = []
    # Start cluster
    with Cluster(command=command, host=host, sources=sources,
                 workers=workers, sinks=sinks, sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=1, interval=0.05, reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.5)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        ## restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

    logging.debug("validating restarted worker stdout")
    # Validate worker actually underwent recovery
    pattern_restarting = "Restarting a listener ..."
    stdout = runner_data[2].stdout
    try:
        assert(re.search(pattern_restarting, stdout) is not None)
    except AssertionError:
        raise AssertionError('Worker does not appear to have reconnected '
                             'as expected. Worker output is '
                             'included below.\nSTDOUT\n---\n%s' % stdout)
def _autoscale_run(command, ops=[], cycles=1, initial=None, runner_data=[],
                   as_steps=[]):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        lowest = lowest_point(ops * cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert(initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:
        # Test is only for setup using initial workers
        assert(initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05

    lowercase2 = [a + b for a in lowercase for b in lowercase]
    char_cycle = cycle(lowercase2)
    expected = Counter()

    def count_sent(s):
        expected[s] += 1

    reader = Reader(iter_generator(
        items=char_cycle,
        to_string=lambda s: pack('>2sI', s, 1),
        on_next=count_sent))

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command, host=host, sources=sources,
                 workers=workers, sinks=sinks, sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0], reader, batch_size=50,
                        interval=0.05, reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(1)

        # Perform autoscale cycles
        logging.debug("Starting autoscale cycles")
        for cyc in range(cycles):
            for joiners in ops:
                # Verify cluster is processing before proceeding
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert(not cluster.get_crashed_workers())

                # get partition data before autoscale operation begins
                logging.debug("Get partition data before autoscale event")
                pre_partitions = cluster.get_partition_data()
                as_steps.append(joiners)
                joined = []
                left = []

                if joiners > 0:  # autoscale: grow
                    # create new workers and have them join
                    logging.debug("grow by {}".format(joiners))
                    joined = cluster.grow(by=joiners)

                elif joiners < 0:  # autoscale: shrink
                    # choose the most recent, still-alive runners to leave
                    leavers = abs(joiners)
                    left = cluster.shrink(leavers)

                else:
                    # Handle the 0 case as a noop
                    continue

                # Wait until all live workers report 'ready'
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert(not cluster.get_crashed_workers())

                # Wait a second before the next operation, allowing some
                # more data to go through the system
                time.sleep(1)
                logging.debug("end of autoscale iteration")
            logging.debug("End of autoscale cycle")
        logging.debug("End of autoscale events. Entering final validation")
        time.sleep(2)

        # Test for crashed workers
        logging.debug("check for crashed")
        assert(not cluster.get_crashed_workers())

        # Test is done, so stop sender
        cluster.stop_senders()

        # wait until sender sends out its final batch and exits
        cluster.wait_for_sender()

        logging.info('Sender sent {} messages'.format(sum(expected.values())))

        # Use Sink value to determine when to stop runners and sink
        # (framing illustrated in _frame_sink_record_example below)
        pack677 = '>I2sQ'
        await_values = [pack(pack677, calcsize(pack677) - 4, c, v)
                        for c, v in expected.items()]
        cluster.sink_await(await_values, timeout=120)

        # validate output
        phase_validate_output(cluster.sinks[0].data, expected)
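
# The await_values built in the autoscale tests above frame each expected sink
# record as a 4-byte big-endian length prefix (excluding its own 4 bytes)
# followed by a 2-byte key and an 8-byte count, per the '>I2sQ' format. A
# self-contained sketch of that framing, using an illustrative key and count:
def _frame_sink_record_example(key, count):
    from struct import calcsize, pack
    fmt = '>I2sQ'
    # length prefix covers the 2s + Q payload: calcsize(fmt) - 4 == 10 bytes
    return pack(fmt, calcsize(fmt) - 4, key, count)

# Example: _frame_sink_record_example('aa', 3) yields a 14-byte record whose
# first four bytes are the length prefix 10.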
def _test_restart(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (
            ports[:sources],
            [ports[sources:][i:i + 3]
             for i in xrange(0, len(ports[sources:]), 3)])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=1,
                        interval=0.05, reconnect=True)
        sender.start()
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the last sent value expected at the worker
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            print 'sink data'
            print sink.data
            print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()

        # Validate worker actually underwent recovery
        pattern_restarting = "Restarting a listener ..."
        stdout = runners[-1].get_output()
        try:
            assert(re.search(pattern_restarting, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have reconnected '
                                 'as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
def _test_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    # expected final windows for the two partitions
    # (construction illustrated in _expected_last_window_example below)
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 + (2 * (workers - 1))
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, (control_port, data_port, external_port),
         worker_ports) = (ports[:sources],
                          ports[sources:sources + 3],
                          zip(ports[-(2 * (workers - 1)):][::2],
                              ports[-(2 * (workers - 1)):][1::2]))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs,
                      metrics_port, control_port, external_port, data_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host, input_ports[0], reader, batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # simulate worker crash by doing a non-graceful shutdown
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use metrics to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'
                        .format(out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert(success)
        except AssertionError:
            print runners[-1].get_output()
            print '---'
            print runners[-2].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = r"RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert(re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
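
# The last_value_* strings used by the recovery/restart tests encode the final
# window expected at the sink for each of the two partitions: range(6, -2, -2)
# yields offsets 6, 4, 2, 0, so with expect = 2000 the expected values are
# '[1994,1996,1998,2000]' and (offset by one) '[1993,1995,1997,1999]', each
# framed with a 4-byte big-endian length prefix. A local illustration of that
# construction (the helper name is specific to this sketch):
def _expected_last_window_example(expect):
    import struct
    window = '[{}]'.format(
        ','.join(str(expect - v) for v in range(6, -2, -2)))
    return struct.pack('>I', len(window)) + window

# Example: _expected_last_window_example(2000) frames '[1994,1996,1998,2000]'.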