def given_data_sent(cluster): reader = Reader(iter_generator(items=[chr(x+65).encode() for x in range(INPUT_ITEMS)], to_bytes=lambda s: pack('>2sI', s, 1))) sender = Sender(cluster.source_addrs[0], reader, batch_size=50, interval=0.05, reconnect=True) cluster.add_sender(sender, start=True) time.sleep(0.5)
def given_data_sent(cluster): values = [chr(x + 65) for x in range(INPUT_ITEMS)] reader = Reader(iter_generator(values)) sender = Sender(cluster.source_addrs[0]["Dummy"], reader, batch_size=50, interval=0.05, reconnect=True) cluster.add_sender(sender, start=True) await_values = [ struct.pack('>I', len(v.encode())) + v.encode() for v in values ] cluster.sink_await(await_values)
def _autoscale_sequence(command, ops=[1], cycles=1, initial=None): host = '127.0.0.1' sources = 1 if isinstance(ops, int): ops = [ops] # If no initial workers value is given, determine the minimum number # required at the start so that the cluster never goes below 1 worker. # If a number is given, then verify it is sufficient. bottom = min(min(compact_sign(ops * cycles)), sum(ops * cycles)) if bottom < 1: min_workers = abs(bottom) + 1 else: min_workers = 1 if isinstance(initial, int): assert (initial >= min_workers) workers = initial else: workers = min_workers batch_size = 10 interval = 0.05 msgs_per_sec = int(batch_size / interval) base_time = 10 # Seconds cycle_time = 10 # seconds expect_time = base_time + cycle_time * cycles # seconds expect = expect_time * msgs_per_sec sender_timeout = expect_time + 10 # seconds join_timeout = 200 runner_join_timeout = 30 res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.') setup_resilience_path(res_dir) steps = [] runners = [] try: try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) lowercase2 = [a + b for a in lowercase for b in lowercase] char_gen = cycle(lowercase2) chars = [next(char_gen) for i in range(expect)] expected = Counter(chars) reader = Reader(iter_generator(chars, lambda s: pack('>2sI', s, 1))) await_values = [ pack('>I2sQ', 10, c, v) for c, v in expected.items() ] # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) num_ports = sources + 3 + (2 * (workers - 1)) ports = get_port_values(num=num_ports, host=host) (input_ports, (control_port, data_port, external_port), worker_ports) = (ports[:sources], ports[sources:sources + 3], zip(ports[-(2 * (workers - 1)):][::2], ports[-(2 * (workers - 1)):][1::2])) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers, worker_ports) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # Get initial partition data partitions = query_partitions(host, external_port) # Verify all workers start with partitions assert (map( len, partitions['state_partitions'] ['letter-state'].values()).count(0) == 0) # start sender sender = Sender(host, input_ports[0], reader, batch_size=batch_size, interval=interval) sender.start() time.sleep(2) # Perform autoscale cycles start_froms = {r: 0 for r in runners} for cyc in range(cycles): for joiners in ops: steps.append(joiners) joined = [] left = [] if joiners > 0: # create a new worker and have it join new_ports = get_port_values(num=(joiners * 2), host=host, base_port=25000) joiner_ports = zip(new_ports[::2], new_ports[1::2]) for i in range(joiners): add_runner(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, joiners, *joiner_ports[i]) joined.append(runners[-1]) start_froms[runners[-1]] = 0 patterns_i = ([ re.escape('***Worker {} attempting to join the ' 'cluster. Sent necessary information.***' .format(r.name)) for r in joined ] + [ re.escape('Migrating partitions to {}'.format( r.name)) for r in joined ] + [ re.escape('--All new workers have acked migration ' 'batch complete'), re.escape('~~~Resuming message processing.~~~') ]) patterns_j = [ re.escape('***Successfully joined cluster!***'), re.escape('~~~Resuming message processing.~~~') ] # Wait for runners to complete joining join_checkers = [] join_checkers.append( RunnerChecker(runners[0], patterns_i, timeout=join_timeout, start_from=start_froms[runners[0]])) for runner in joined: join_checkers.append( RunnerChecker(runner, patterns_j, timeout=join_timeout, start_from=start_froms[runner])) for jc in join_checkers: jc.start() for jc in join_checkers: jc.join() if jc.error: outputs = runners_output_format(runners) raise AutoscaleTimeoutError( "'{}' timed out on JOIN in {} " "seconds. The cluster had the following outputs:\n===\n{}" .format(jc.runner_name, jc.timeout, outputs), as_error=jc.error, as_steps=steps) elif joiners < 0: # joiners < 0, e.g. leavers! # choose the most recent, still-alive runners to leave leavers = abs(joiners) idx = 1 while len(left) < leavers and idx < len(runners): if runners[-idx].is_alive(): left.append(runners[-idx]) idx += 1 if len(left) < leavers: raise AutoscaleTestError( "Not enough workers left to " "shrink! {} requested but " "only {} live non-initializer" "workers found!".format(joiners, len(left))) # Create the checkers initializer = [runners[0]] remaining = [ r for r in runners if r.is_alive() and r not in initializer + left ] patterns_i = ([ r'ExternalChannelConnectNotifier: initializer: ' r'server closed', r'Saving topology!', r'Saving worker names to file: .*?initializer.' r'workers' ] + [ re.escape( r'LocalTopology._save_worker_names: {}'.format( r.name)) for r in initializer + remaining ] + [ re.escape(r'~~~Initiating shrink~~~'), re.escape(r'-- Remaining workers:') ] + [ re.escape(r'-- -- {}'.format(r.name)) for n in initializer + remaining ] + [ re.escape(r'~~~Stopping message processing for ' r'leaving workers.~~~'), re.escape(r'~~~Resuming message processing.~~~') ]) patterns_r = ([ re.escape( r'Control Ch: Received Mute Request from initializer' ), re.escape( r'~~~Stopping message processing for leaving workers.~~~' ), re.escape( r'DataChannelConnectNotifier: server closed'), re.escape( r'ControlSenderConnectNotifier: server closed' ), re.escape(r'BoundaryNotify: closed'), re.escape( r'Control Ch: Received Unmute Request from initializer' ), re.escape(r'~~~Resuming message processing.~~~'), re.escape(r'Shutting down OutgoingBoundary'), re.escape(r'Shutting down ControlConnection') ]) patterns_r_per = [ r'ControlChannelConnectNotifier:{}: server closed' ] patterns_l = ([ re.escape( r'Control Ch: Received Mute Request from {}'. format(r.name)) for r in initializer + remaining ] + [ re.escape( r'Migrating all partitions to {} remaining ' r'workers'.format( len(initializer + remaining))), r'\^\^Migrating \d+ steps to {} workers'.format( len(initializer + remaining)) ] + [ r'\^\^Migrating step \d+ to outgoing ' r'boundary {}/[0-9a-f]{{12}}'.format(r.name) for r in initializer + remaining ] + [ re.escape( r'--Sending leaving worker done migration msg to cluster' ), re.escape( r'Connections: Finished shutdown procedure.'), re.escape(r'Shutting down ControlConnection'), re.escape(r'Shutting down TCPSink'), re.escape(r'Shutting down DataReceiver'), re.escape( r'Shutting down ReconnectingMetricsSink'), re.escape(r'Shutting down OutgoingBoundary'), re.escape(r'Shutting down Startup...'), re.escape(r'Shutting down DataChannel'), re.escape(r'metrics connection closed'), re.escape(r'TCPSink connection closed'), re.escape( r'ControlChannelConnectNotifier: server closed' ) ]) patterns_l_per = [] left_checkers = [] # initializer STDOUT checker left_checkers.append( RunnerChecker( initializer[0], patterns_i, timeout=join_timeout, start_from=start_froms[initializer[0]])) # remaining workers STDOUT checkers for runner in remaining: left_checkers.append( RunnerChecker(runner, patterns_r + [ p.format(runner.name) for p in patterns_r_per ], timeout=join_timeout, start_from=start_froms[runner])) # leaving workers STDOUT checkers for runner in left: left_checkers.append( RunnerChecker(runner, patterns_l + [ p.format(runner.name) for p in patterns_l_per ], timeout=join_timeout, start_from=start_froms[runner])) for lc in left_checkers: lc.start() # Send the shrink command send_shrink_cmd(host, external_port, names=[r.name for r in left]) # Wait for output checkers to join for lc in left_checkers: lc.join() if lc.error: outputs = runners_output_format(runners) raise AutoscaleTimeoutError( "'{}' timed out on SHRINK in {} " "seconds. The cluster had the following outputs:\n===\n{}" .format(lc.runner_name, lc.timeout, outputs), as_error=lc.error, as_steps=steps) else: # Handle the 0 case as a noop continue start_froms = {r: r.tell() for r in runners} # Validate autoscale via partition query try: partitions = query_partitions(host, external_port) phase_validate_partitions( runners, partitions, joined=[r.name for r in joined], left=[r.name for r in left]) except Exception as err: print( 'error validating {} have joined and {} have left'. format([r.name for r in joined], [r.name for r in left])) raise err # wait until sender completes (~10 seconds) sender.join(sender_timeout) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use Sink value to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, 30) stopper.start() stopper.join() if stopper.error: print('sink.data', len(sink.data)) print('await_values', len(await_values)) raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() # Stop metrics metrics.stop() # validate output phase_validate_output(runners, sink, expected) finally: for r in runners: r.stop() # Wait on runners to finish waiting on their subprocesses to exit for r in runners: r.join(runner_join_timeout) alive = [] for r in runners: if r.is_alive(): alive.append(r) for r in runners: ec = r.poll() if ec != 0: print('Worker {!r} exited with return code {}'.format( r.name, ec)) print('Its last 5 log lines were:') print('\n'.join(r.get_output().splitlines()[-5:])) print() if alive: alive_names = ', '.join((r.name for r in alive)) outputs = runners_output_format(runners) for a in alive: a.kill() clean_resilience_path(res_dir) if alive: raise PipelineTestError( "Runners [{}] failed to exit cleanly after" " {} seconds.\n" "Runner outputs are attached below:" "\n===\n{}".format(alive_names, runner_join_timeout, outputs)) except Exception as err: if not hasattr(err, 'as_steps'): err.as_steps = steps raise err
def _run(persistent_data, res_ops, command, ops=[], initial=None, sources=1, partition_multiplier=1, validate_output=True, sender_mps=1000, sender_interval=0.01): host = '127.0.0.1' sinks = 1 sink_mode = 'framed' batch_size = int(sender_mps * sender_interval) logging.debug("batch_size is {}".format(batch_size)) if not isinstance(ops, (list, tuple)): raise TypeError("ops must be a list or tuple of operations") # If no initial workers value is given, determine the minimum number # required at the start so that the cluster never goes below 1 worker. # If a number is given, then verify it is sufficient. if ops: if isinstance(ops[0], Recover): raise ValueError("The first operation cannot be Recover") lowest = lowest_point(ops) if lowest < 1: min_workers = abs(lowest) + 1 else: min_workers = 1 if isinstance(initial, int): logging.debug('initial: {}'.format(initial)) logging.debug('min: {}'.format(min_workers)) assert (initial >= min_workers) workers = initial else: workers = min_workers else: # Test is only for setup using initial workers assert (initial > 0) workers = initial logging.info("Initial cluster size: {}".format(workers)) partition_multiplier = 5 # Used in partition count creation # create the sequence generator and the reader msg = MultiSequenceGenerator(base_parts=workers * partition_multiplier - 1) # Start cluster logging.debug("Creating cluster") with Cluster(command=command, host=host, sources=sources, workers=workers, sinks=sinks, sink_mode=sink_mode, persistent_data=persistent_data) as cluster: # start senders for s in range(sources): sender = Sender(cluster.source_addrs[0], Reader(msg), batch_size=batch_size, interval=sender_interval, reconnect=True) cluster.add_sender(sender, start=True) # let the senders send some data first time.sleep(1) # loop over ops, keeping the result and passing it to the next op res = None assert (not cluster.get_crashed_workers()) for op in ops: res_ops.append(op) logging.info("Executing: {}".format(op)) res = op.apply(cluster, res) assert (not cluster.get_crashed_workers()) # Wait a full second for things to calm down time.sleep(1) # If using external senders, wait for them to stop cleanly if cluster.senders: # Tell the multi-sequence-sender to stop msg.stop() # wait for senders to reach the end of their readers and stop for s in cluster.senders: cluster.wait_for_sender(s) # Validate all sender values caught up stop_value = max(msg.seqs) t0 = time.time() while True: try: assert (len(msg.seqs) == msg.seqs.count(stop_value)) break except: if time.time() - t0 > 2: logging.error("msg.seqs aren't all equal: {}".format( msg.seqs)) raise time.sleep(0.1) # Create await_values for the sink based on the stop values from # the multi sequence generator await_values = [] for part, val in enumerate(msg.seqs): key = '{:07d}'.format(part).encode() data = '[{},{},{},{}]'.format( *[val - x for x in range(3, -1, -1)]).encode() await_values.append((key, data)) cluster.sink_await(values=await_values, func=parse_sink_value) logging.info("Completion condition achieved. Shutting down cluster.") # Use validator to validate the data in at-least-once mode # save sink data to a file if validate_output: # TODO: move to validations.py out_file = os.path.join(cluster.res_dir, 'received.txt') cluster.sinks[0].save(out_file) # Validate captured output logging.info("Validating output") # if senders == 0, using internal source if cluster.senders: cmd_validate = ( 'validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=stop_value)) else: cmd_validate = ('validator -i {out_file} -a'.format( out_file=out_file)) res = run_shell_cmd(cmd_validate) try: assert (res.success) logging.info("Validation successful") except: raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(res.output)) # Validate worker actually underwent recovery if cluster.restarted_workers: # TODO: move to validations.py logging.info("Validating recovery") pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\." for r in cluster.restarted_workers: stdout = r.get_output() try: assert (re.search(pattern, stdout) is not None) logging.info("{} recovered successfully".format(r.name)) except AssertionError: raise AssertionError( 'Worker {} does not appear to have performed ' 'recovery as expected.'.format(r.name))
def _test_log_rotation_file_size_trigger_recovery(command): host = '127.0.0.1' sources = 1 workers = 2 res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.') expect = 2000 event_log_file_size = 50000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) setup_resilience_path(res_dir) command = '''{} \ --log-rotation \ --stop-pause {} '''.format(command, STOP_THE_WORLD_PAUSE) alt_block = '--event-log-file-size {}'.format(event_log_file_size) alt_func = lambda x: x > 0 runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader = Reader(sequence_generator(expect)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) num_ports = sources + 3 * workers ports = get_port_values(num=num_ports, host=host) (input_ports, worker_ports) = (ports[:sources], [ ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3) ]) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, res_dir, workers, worker_ports, alt_block, alt_func) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=100, interval=0.05) sender.start() # Wait for runner to complete a log rotation log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns, timeout=AWAIT_TIMEOUT) log_rotated_checker.start() log_rotated_checker.join() if log_rotated_checker.error: raise log_rotated_checker.error # stop the worker in a non-graceful fashion so it doesn't remove # recovery files runners[-1].kill() ## restart worker runners.append(runners[-1].respawn()) runners[-1].start() # wait until sender completes (~1 second) sender.join(30) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use metrics to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT) stopper.start() stopper.join() if stopper.error: for r in runners: print r.name print r.get_output() print '---' raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() print 'sink.data size: ', len(sink.data) # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(res_dir, 'received.txt') sink.save(out_file, mode='giles') # Validate captured output cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) res = run_shell_cmd(cmd_validate) try: assert (res.success) except AssertionError: print runners[-1].name print runners[-1].get_output() print '---' print runners[-2].name print runners[-2].get_output() print '---' raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(res.output)) # Validate worker underwent log rotation, but not initializer i, r = 1, runners[1] stdout = r.get_output() try: assert (re.search(log_rotated_pattern, stdout, re.M | re.S) is not None) except AssertionError: raise AssertionError('Worker %d.%r does not appear to have ' 'performed log rotation as expected.' ' The pattern %r ' 'is missing form the Worker output ' 'included below.\nSTDOUT\n---\n%s\n' '---\n' % (i, r.name, log_rotated_pattern, stdout)) # Validate worker actually underwent recovery pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\." stdout = runners[-1].get_output() try: assert (re.search(pattern, stdout) is not None) except AssertionError: raise AssertionError('Worker does not appear to have performed ' 'recovery as expected. Worker output is ' 'included below.\nSTDOUT\n---\n%s' % stdout) finally: for r in runners: r.stop() clean_resilience_path(res_dir)
def _test_log_rotation_external_trigger_no_recovery(command): host = '127.0.0.1' sources = 1 workers = 2 res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.') expect = 2000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) setup_resilience_path(res_dir) command = '''{} \ --log-rotation \ --stop-pause {} '''.format(command, STOP_THE_WORLD_PAUSE) runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader = Reader(sequence_generator(expect)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) num_ports = sources + 3 * workers ports = get_port_values(num=num_ports, host=host) (input_ports, worker_ports) = (ports[:sources], [ ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3) ]) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, res_dir, workers, worker_ports) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=100, interval=0.05) sender.start() time.sleep(0.5) # Trigger log rotation with external message cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m ' 'worker1'.format(host, external_port)) res = run_shell_cmd(cmd_external_trigger) try: assert (res.success) except AssertionError: raise AssertionError('External rotation trigger failed with ' 'the error:\n{}'.format(res.output)) # wait until sender completes (~1 second) sender.join(30) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use metrics to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT) stopper.start() stopper.join() if stopper.error: for r in runners: print r.name print r.get_output() print '---' raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() print 'sink.data size: ', len(sink.data) # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(res_dir, 'received.txt') sink.save(out_file, mode='giles') # Validate captured output cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) res = run_shell_cmd(cmd_validate) try: assert (res.success) except AssertionError: print runners[0].name print runners[0].get_output() print '---' print runners[1].name print runners[1].get_output() print '---' raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(res.output)) # Validate all workers underwent log rotation for r in runners[1:]: stdout = r.get_output() try: assert (re.search(log_rotated_pattern, stdout, re.M | re.S) is not None) except AssertionError: raise AssertionError('Worker %r does not appear to have ' 'performed log rotation as expected.' ' The pattern %r ' 'is missing form the Worker output ' 'included below.\nSTDOUT\n---\n%s\n' '---\n' % (r.name, log_rotated_pattern, stdout)) finally: for r in runners: r.stop() clean_resilience_path(res_dir)
def _test_restart(command): host = '127.0.0.1' sources = 1 workers = 2 res_dir = '/tmp/res-data' expect = 200 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) setup_resilience_path(res_dir) runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader = Reader(sequence_generator(expect)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) input_ports, control_port, external_port, data_port = (get_port_values( host, sources)) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=1, interval=0.05, reconnect=True) sender.start() time.sleep(0.2) # stop worker in a non-graceful fashion so that recovery files # aren't removed runners[-1].kill() ## restart worker runners.append(runners[-1].respawn()) runners[-1].start() # wait until sender completes (~1 second) sender.join(30) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Wait for the last sent value expected at the worker stopper = SinkAwaitValue(sink, await_values, 30) stopper.start() stopper.join() if stopper.error: for r in runners: print r.name print r.get_output()[0] print '---' print 'sink data' print sink.data print '---' raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() # Validate worker actually underwent recovery pattern_restarting = "Restarting a listener ..." stdout, stderr = runners[-1].get_output() try: assert (re.search(pattern_restarting, stdout) is not None) except AssertionError: raise AssertionError('Worker does not appear to have reconnected ' 'as expected. Worker output is ' 'included below.\nSTDOUT\n---\n%s\n---\n' 'STDERR\n---\n%s' % (stdout, stderr)) finally: for r in runners: r.stop()
def _test_log_rotation_external_trigger_recovery(command): host = '127.0.0.1' sources = 1 workers = 2 res_dir = '/tmp/res-data' expect = 2000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) setup_resilience_path(res_dir) command = '''{} \ --log-rotation \ --stop-pause {} '''.format(command, STOP_THE_WORLD_PAUSE) runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader = Reader(sequence_generator(expect)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) input_ports, control_port, external_port, data_port = (get_port_values( host, sources)) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=100, interval=0.05) sender.start() time.sleep(0.5) # Trigger log rotation with external message cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m ' 'worker1'.format(host, external_port)) success, stdout, retcode, cmd = ex_validate(cmd_external_trigger) try: assert (success) except AssertionError: raise AssertionError('External rotation trigger failed with ' 'the error:\n{}'.format(stdout)) # Check for log rotation log_rotated_checker = RunnerChecker(runners[1], log_rotated_patterns, timeout=AWAIT_TIMEOUT) log_rotated_checker.start() log_rotated_checker.join() if log_rotated_checker.error: raise log_rotated_checker.error # stop worker in a non-graceful fashion so that recovery files # aren't removed runners[-1].kill() ## restart worker runners.append(runners[-1].respawn()) runners[-1].start() # wait until sender completes (~1 second) sender.join(30) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use metrics to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT) stopper.start() stopper.join() if stopper.error: for r in runners: print r.name print r.get_output()[0] print '---' raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() print 'sink.data size: ', len(sink.data) # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(res_dir, 'received.txt') sink.save(out_file, mode='giles') # Validate captured output cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) success, stdout, retcode, cmd = ex_validate(cmd_validate) try: assert (success) except AssertionError: print runners[0].name print runners[0].get_output()[0] print '---' print runners[1].name print runners[1].get_output()[0] print '---' raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(stdout)) # Validate all workers underwent log rotation r = runners[1] stdout, stderr = r.get_output() try: assert (re.search(log_rotated_pattern, stdout, re.M | re.S) is not None) except AssertionError: raise AssertionError('Worker %d.%r does not appear to have ' 'performed log rotation as expected.' ' The pattern %r ' 'is missing form the Worker output ' 'included below.\nSTDOUT\n---\n%s\n' '---\n' % (1, r.name, log_rotated_pattern, stdout)) # Validate worker actually underwent recovery pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\." stdout, stderr = runners[-1].get_output() try: assert (re.search(pattern, stdout) is not None) except AssertionError: raise AssertionError( 'Worker %d.%r does not appear to have ' 'performed recovery as expected. Worker ' 'output is ' 'included below.\nSTDOUT\n---\n%s\n---\n' 'STDERR\n---\n%s' % (len(runners) - 1, runners[-1].name, stdout, stderr)) finally: for r in runners: r.stop()
def _test_recovery(command): host = '127.0.0.1' sources = 1 workers = 2 res_dir = '/tmp/res-data' expect = 2000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) setup_resilience_path(res_dir) runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader = Reader(sequence_generator(expect)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) input_ports, control_port, external_port, data_port = (get_port_values( host, sources)) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=100, interval=0.05) sender.start() time.sleep(0.2) # stop worker runners[-1].stop() ## restart worker runners.append(runners[-1].respawn()) runners[-1].start() # wait until sender completes (~1 second) sender.join(5) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use metrics to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, 30) stopper.start() stopper.join() if stopper.error: raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() print 'sink.data size: ', len(sink.data) # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(res_dir, 'received.txt') sink.save(out_file, mode='giles') # Validate captured output cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) success, stdout, retcode, cmd = ex_validate(cmd_validate) try: assert (success) except AssertionError: print runners[-1].get_output()[0] print '---' print runners[-2].get_output()[0] print '---' raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(stdout)) # Validate worker actually underwent recovery pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\." stdout, stderr = runners[-1].get_output() try: assert (re.search(pattern, stdout) is not None) except AssertionError: raise AssertionError('Worker does not appear to have performed ' 'recovery as expected. Worker output is ' 'included below.\nSTDOUT\n---\n%s\n---\n' 'STDERR\n---\n%s' % (stdout, stderr)) finally: for r in runners: r.stop() clean_up_resilience_path(res_dir)
def _run(command, persistent_data): host = '127.0.0.1' sources = 1 sinks = 1 sink_mode = 'framed' workers = 2 expect = 200 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))).encode() last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))).encode() await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) # Start cluster with Cluster(command=command, host=host, sources=sources, workers=workers, sinks=sinks, sink_mode=sink_mode, persistent_data=persistent_data) as cluster: # Create sender logging.debug("Creating sender") sender = Sender(cluster.source_addrs[0], Reader(sequence_generator(expect)), batch_size=1, interval=0.05, reconnect=True) cluster.add_sender(sender, start=True) # wait for some data to go through the system time.sleep(0.5) # stop worker in a non-graceful fashion so that recovery files # aren't removed logging.debug("Killing worker") killed = cluster.kill_worker(worker=-1) ## restart worker logging.debug("Restarting worker") cluster.restart_worker(killed) # wait until sender completes (~1 second) logging.debug("Waiting for sender to complete") cluster.wait_for_sender() # Wait for the last sent value expected at the worker logging.debug("Waiting for sink to complete") cluster.sink_await(await_values) # stop the cluster logging.debug("Stopping cluster") cluster.stop_cluster() logging.debug("validating restarted worker stdout") # Validate worker actually underwent recovery pattern_restarting = "Restarting a listener ..." try: assert (re.search(pattern_restarting, persistent_data['runner_data'][2].stdout) is not None) except AssertionError: raise AssertionError('Worker does not appear to have reconnected ' 'as expected. Worker output is ' 'included below.\nSTDOUT\n---\n%s' % stdout)
def _run(command, runner_data=[]): host = '127.0.0.1' sources = 1 sinks = 1 sink_mode = 'framed' workers = 2 expect = 2000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) # Start cluster with Cluster(command=command, host=host, sources=sources, workers=workers, sinks=sinks, sink_mode=sink_mode, runner_data=runner_data) as cluster: # Create sender logging.debug("Creating sender") sender = Sender(cluster.source_addrs[0], Reader(sequence_generator(expect)), batch_size=100, interval=0.05, reconnect=True) cluster.add_sender(sender, start=True) # wait for some data to go through the system time.sleep(0.2) # stop worker in a non-graceful fashion so that recovery files # aren't removed logging.debug("Killing worker") killed = cluster.kill_worker(worker=-1) ## restart worker logging.debug("Restarting worker") cluster.restart_worker(killed) # wait until sender completes (~1 second) logging.debug("Waiting for sender to complete") cluster.wait_for_sender() # Wait for the last sent value expected at the worker logging.debug("Waiting for sink to complete") cluster.sink_await(await_values) # stop the cluster logging.debug("Stopping cluster") cluster.stop_cluster() # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(cluster.res_dir, 'received.txt') cluster.sinks[0].save(out_file, mode='giles') # Validate captured output logging.debug("Validating output") cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) res = run_shell_cmd(cmd_validate) try: assert (res.success) except AssertionError: raise AssertionError('Output validation failed with the following ' 'error:\n{}'.format(res.output)) # Validate worker actually underwent recovery logging.debug("Validating recovery from worker stdout") pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\." try: assert (re.search(pattern, cluster.runners[-1].get_output()) is not None) except AssertionError: raise AssertionError("Worker does not appear to have performed " "recovery as expected.")
def _test_autoscale_grow(command): host = '127.0.0.1' sources = 1 workers = 1 res_dir = '/tmp/res-data' expect = 2000 last_value_0 = '[{}]'.format(','.join( (str(expect - v) for v in range(6, -2, -2)))) last_value_1 = '[{}]'.format(','.join( (str(expect - 1 - v) for v in range(6, -2, -2)))) await_values = (struct.pack('>I', len(last_value_0)) + last_value_0, struct.pack('>I', len(last_value_1)) + last_value_1) patterns_i = [ re.escape(r'***Worker worker1 attempting to join the ' r'cluster. Sent necessary information.***'), re.escape(r'Migrating partitions to worker1'), re.escape(r'--All new workers have acked migration ' r'batch complete'), re.escape(r'~~~Resuming message processing.~~~') ] patterns_w = [ re.escape(r'***Successfully joined cluster!***'), re.escape(r'~~~Resuming message processing.~~~') ] setup_resilience_path(res_dir) runners = [] try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) reader1 = Reader(sequence_generator(expect - 1000)) reader2 = Reader(sequence_generator(expect, 1000)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) input_ports, control_port, external_port, data_port = (get_port_values( host, sources)) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers) # Wait for first runner (initializer) to report application ready runner_ready_checker = RunnerReadyChecker(runners, timeout=30) runner_ready_checker.start() runner_ready_checker.join() if runner_ready_checker.error: raise runner_ready_checker.error # start sender1 (0,1000] sender1 = Sender(host, input_ports[0], reader1, batch_size=10, interval=0.05) sender1.start() # wait until sender1 completes (~5 seconds) sender1.join(30) if sender1.error: raise sender1.error if sender1.is_alive(): sender1.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # create a new worker and have it join add_runner(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers) # Wait for runner to complete a log rotation join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30) join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30) join_checker_i.start() join_checker_w.start() join_checker_i.join() if join_checker_i.error: print('worker output:') print(runners[1].get_output()[0]) raise join_checker_i.error join_checker_w.join() if join_checker_w.error: print('initalizer output:') print(runners[0].get_output()[0]) raise join_checker_w.error # Start sender2 (1000, 2000] sender2 = Sender(host, input_ports[0], reader2, batch_size=10, interval=0.05) sender2.start() # wait until sender2 completes (~5 seconds) sender2.join(30) if sender2.error: raise sender2.error if sender2.is_alive(): sender2.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') # Use Sink value to determine when to stop runners and sink stopper = SinkAwaitValue(sink, await_values, 30) stopper.start() stopper.join() if stopper.error: raise stopper.error # stop application workers for r in runners: r.stop() # Stop sink sink.stop() print 'sink.data size: ', len(sink.data) # Stop metrics metrics.stop() # parse metrics data and validate worker has shifted from 1 to 2 # workers mp = MetricsParser() mp.load_string_list(metrics.data) mp.parse() # Now confirm that there are computations in worker1's metrics app_key = mp.data.keys()[0] # 'metrics:Sequence Window Printer' worker_metrics = [ v for v in mp.data[app_key].get('worker1', []) if v[0] == 'metrics' ] # Verify there is at least one entry for a computation with a nonzero # total value print('worker_metrics', worker_metrics) filtered = filter( lambda v: (v[1]['metric_category'] == 'computation' and v[1]['total'] > 0), worker_metrics) print('filtered', filtered) assert (len(filtered) > 0) # Use validator to validate the data in at-least-once mode # save sink data to a file out_file = os.path.join(res_dir, 'received.txt') sink.save(out_file, mode='giles') # Validate captured output cmd_validate = ('validator -i {out_file} -e {expect} -a'.format( out_file=out_file, expect=expect)) success, stdout, retcode, cmd = ex_validate(cmd_validate) try: assert (success) except AssertionError: print runners[-1].get_output()[0] print '---' print runners[-2].get_output()[0] print '---' raise AssertionError('Validation failed with the following ' 'error:\n{}'.format(stdout)) finally: for r in runners: r.stop()
def _autoscale_sequence(command, ops=[], cycles=1, initial=None): host = '127.0.0.1' sources = 1 if isinstance(ops, int): ops = [ops] # If no initial workers value is given, determine the minimum number # required at the start so that the cluster never goes below 1 worker. # If a number is given, then verify it is sufficient. if ops: lowest = lowest_point(ops * cycles) if lowest < 1: min_workers = abs(lowest) + 1 else: min_workers = 1 if isinstance(initial, int): assert (initial >= min_workers) workers = initial else: workers = min_workers else: # Test is only for setup using initial workers assert (initial > 0) workers = initial batch_size = 10 interval = 0.05 sender_timeout = 30 # Counted from when Sender is stopped runner_join_timeout = 30 res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.') setup_resilience_path(res_dir) steps = [] runners = [] try: try: # Create sink, metrics, reader, sender sink = Sink(host) metrics = Metrics(host) lowercase2 = [a + b for a in lowercase for b in lowercase] char_cycle = cycle(lowercase2) expected = Counter() def count_sent(s): expected[s] += 1 reader = Reader( iter_generator(items=char_cycle, to_string=lambda s: pack('>2sI', s, 1), on_next=count_sent)) # Start sink and metrics, and get their connection info sink.start() sink_host, sink_port = sink.get_connection_info() outputs = '{}:{}'.format(sink_host, sink_port) metrics.start() metrics_host, metrics_port = metrics.get_connection_info() time.sleep(0.05) num_ports = sources + 3 + (2 * (workers - 1)) ports = get_port_values(num=num_ports, host=host) (input_ports, (control_port, data_port, external_port), worker_ports) = (ports[:sources], ports[sources:sources + 3], zip(ports[-(2 * (workers - 1)):][::2], ports[-(2 * (workers - 1)):][1::2])) inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports]) # Prepare query functions with host and port pre-defined query_func_partitions = partial(partitions_query, host, external_port) query_func_partition_counts = partial(partition_counts_query, host, external_port) query_func_cluster_status = partial(cluster_status_query, host, external_port) # Start the initial runners start_runners(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, workers, worker_ports) # Verify cluster is processing messages obs = ObservabilityNotifier(query_func_cluster_status, test_cluster_is_processing) obs.start() obs.join() if obs.error: raise obs.error # Verify that `workers` workers are active # Create a partial function partial_test_worker_count = partial(test_worker_count, workers) obs = ObservabilityNotifier(query_func_cluster_status, partial_test_worker_count) obs.start() obs.join() if obs.error: raise obs.error # Verify all workers start with partitions obs = ObservabilityNotifier(query_func_partitions, test_all_workers_have_partitions) obs.start() obs.join() if obs.error: raise obs.error # start sender sender = Sender(host, input_ports[0], reader, batch_size=batch_size, interval=interval) sender.start() # Give the cluster 1 second to build up some state time.sleep(1) # Perform autoscale cycles for cyc in range(cycles): for joiners in ops: # Verify cluster is processing before proceeding obs = ObservabilityNotifier(query_func_cluster_status, test_cluster_is_processing, timeout=30) obs.start() obs.join() if obs.error: raise obs.error # Test for crashed workers test_crashed_workers(runners) # get partition data before autoscale operation begins pre_partitions = query_func_partitions() steps.append(joiners) joined = [] left = [] if joiners > 0: # autoscale: grow # create a new worker and have it join new_ports = get_port_values(num=(joiners * 2), host=host, base_port=25000) joiner_ports = zip(new_ports[::2], new_ports[1::2]) for i in range(joiners): add_runner(runners, command, host, inputs, outputs, metrics_port, control_port, external_port, data_port, res_dir, joiners, *joiner_ports[i]) joined.append(runners[-1]) # Verify cluster has resumed processing obs = ObservabilityNotifier(query_func_cluster_status, test_cluster_is_processing, timeout=120) obs.start() obs.join() if obs.error: raise obs.error # Test: all workers have partitions, partitions ids # for new workers have been migrated from pre-join # workers # create list of joining workers diff_names = {'joining': [r.name for r in joined]} # Create partial function of the test with the # data baked in tmp = partial(test_migrated_partitions, pre_partitions, diff_names) # Start the test notifier obs = ObservabilityNotifier( query_func_partitions, [test_all_workers_have_partitions, tmp]) obs.start() obs.join() if obs.error: raise obs.error elif joiners < 0: # autoscale: shrink # choose the most recent, still-alive runners to leave leavers = abs(joiners) idx = 1 while len(left) < leavers and idx < len(runners): if runners[-idx].is_alive(): left.append(runners[-idx]) idx += 1 if len(left) < leavers: raise AutoscaleTestError( "Not enough workers left to " "shrink! {} requested but " "only {} live non-initializer" "workers found!".format(joiners, len(left))) # Send the shrink command resp = send_shrink_cmd(host, external_port, names=[r.name for r in left]) print("Sent a shrink command for {}".format( [r.name for r in left])) print("Response was: {}".format(resp)) # Verify cluster has resumed processing obs = ObservabilityNotifier(query_func_cluster_status, test_cluster_is_processing, timeout=120) obs.start() obs.join() if obs.error: raise obs.error # Test: all workers have partitions, partitions ids # from departing workers have been migrated to remaining # workers # create list of leaving workers diff_names = {'leaving': [r.name for r in left]} # Create partial function of the test with the # data baked in tmp = partial(test_migrated_partitions, pre_partitions, diff_names) # Start the test notifier obs = ObservabilityNotifier( query_func_partitions, [test_all_workers_have_partitions, tmp]) obs.start() obs.join() if obs.error: raise obs.error else: # Handle the 0 case as a noop continue # Test for crashed workers test_crashed_workers(runners) # Validate autoscale via partition query try: partitions = partitions_query(host, external_port) phase_validate_partitions( runners, partitions, joined=[r.name for r in joined], left=[r.name for r in left]) except Exception as err: print( 'error validating {} have joined and {} have left'. format([r.name for r in joined], [r.name for r in left])) raise # Wait a second before the next operation, allowing some # more data to go through the system time.sleep(1) # Test for crashed workers test_crashed_workers(runners) # Test is done, so stop sender sender.stop() # wait until sender sends out its final batch and exits sender.join(sender_timeout) if sender.error: raise sender.error if sender.is_alive(): sender.stop() raise TimeoutError('Sender did not complete in the expected ' 'period') print('Sender sent {} messages'.format(sum(expected.values()))) # Use Sink value to determine when to stop runners and sink pack677 = '>I2sQ' pack27 = '>IsQ' await_values = [ pack(pack677, calcsize(pack677) - 4, c, v) for c, v in expected.items() ] #await_values = [pack(pack27, calcsize(pack27)-4, c, v) for c, v in # expected.items()] stopper = SinkAwaitValue(sink, await_values, 30) stopper.start() stopper.join() if stopper.error: print('sink.data', len(sink.data)) print('await_values', len(await_values)) raise stopper.error # stop application workers for r in runners: r.stop() # Test for crashed workers test_crashed_workers(runners) # Stop sink sink.stop() # Stop metrics metrics.stop() # validate output phase_validate_output(runners, sink, expected) finally: for r in runners: r.stop() # Wait on runners to finish waiting on their subprocesses to exit for r in runners: r.join(runner_join_timeout) alive = [] for r in runners: if r.is_alive(): alive.append(r) for r in runners: ec = r.poll() if ec != 0: print('Worker {!r} exited with return code {}'.format( r.name, ec)) print('Its last 5 log lines were:') print('\n'.join(r.get_output().splitlines()[-5:])) print() if alive: alive_names = ', '.join((r.name for r in alive)) outputs = runners_output_format(runners) for a in alive: a.kill() clean_resilience_path(res_dir) if alive: raise PipelineTestError( "Runners [{}] failed to exit cleanly after" " {} seconds.\n" "Runner outputs are attached below:" "\n===\n{}".format(alive_names, runner_join_timeout, outputs)) except Exception as err: if not hasattr(err, 'as_steps'): err.as_steps = steps if not hasattr(err, 'runners'): err.runners = runners raise
def _autoscale_run(command, ops=[], cycles=1, initial=None, runner_data=[], as_steps=[]): host = '127.0.0.1' sources = 1 sinks = 1 sink_mode = 'framed' if isinstance(ops, int): ops = [ops] # If no initial workers value is given, determine the minimum number # required at the start so that the cluster never goes below 1 worker. # If a number is given, then verify it is sufficient. if ops: lowest = lowest_point(ops * cycles) if lowest < 1: min_workers = abs(lowest) + 1 else: min_workers = 1 if isinstance(initial, int): assert (initial >= min_workers) workers = initial else: workers = min_workers else: # Test is only for setup using initial workers assert (initial > 0) workers = initial batch_size = 10 interval = 0.05 lowercase2 = [a + b for a in lowercase for b in lowercase] char_cycle = cycle(lowercase2) expected = Counter() def count_sent(s): expected[s] += 1 reader = Reader( iter_generator(items=char_cycle, to_string=lambda s: pack('>2sI', s, 1), on_next=count_sent)) # Start cluster logging.debug("Creating cluster") with Cluster(command=command, host=host, sources=sources, workers=workers, sinks=sinks, sink_mode=sink_mode, runner_data=runner_data) as cluster: # Create sender logging.debug("Creating sender") sender = Sender(cluster.source_addrs[0], reader, batch_size=50, interval=0.05, reconnect=True) cluster.add_sender(sender, start=True) # wait for some data to go through the system time.sleep(1) # Perform autoscale cycles logging.debug("Starting autoscale cycles") for cyc in range(cycles): for joiners in ops: # Verify cluster is processing before proceeding cluster.wait_to_resume_processing(timeout=120) # Test for crashed workers assert (not cluster.get_crashed_workers()) # get partition data before autoscale operation begins logging.debug("Get partition data before autoscale event") pre_partitions = cluster.get_partition_data() as_steps.append(joiners) joined = [] left = [] if joiners > 0: # autoscale: grow # create new workers and have them join logging.debug("grow by {}".format(joiners)) joined = cluster.grow(by=joiners) elif joiners < 0: # autoscale: shrink # choose the most recent, still-alive runners to leave leavers = abs(joiners) left = cluster.shrink(leavers) else: # Handle the 0 case as a noop continue # Wait until all live workers report 'ready' cluster.wait_to_resume_processing(timeout=120) # Test for crashed workers assert (not cluster.get_crashed_workers()) # Wait a second before the next operation, allowing some # more data to go through the system time.sleep(1) logging.debug("end of autoscale iteration") logging.debug("End of autoscale cycle") logging.debug("End of autoscale events. Entering final validation") time.sleep(2) # Test for crashed workers logging.debug("check for crashed") assert (not cluster.get_crashed_workers()) # Test is done, so stop sender cluster.stop_senders() # wait until sender sends out its final batch and exits cluster.wait_for_sender() logging.info('Sender sent {} messages'.format(sum(expected.values()))) # Use Sink value to determine when to stop runners and sink pack677 = '>I2sQ' await_values = [ pack(pack677, calcsize(pack677) - 4, c, v) for c, v in expected.items() ] cluster.sink_await(await_values, timeout=120) # validate output phase_validate_output(cluster.sinks[0].data, expected)
from end_points import (Reader, sequence_generator) from integration import (Sender) import sys import time wallaroo_hostsvc = sys.argv[1] num_start = int(sys.argv[2]) num_end = int(sys.argv[3]) batch_size = int(sys.argv[4]) interval = float(sys.argv[5]) num_part_keys = int(sys.argv[6]) senders = [] for p in range(0, num_part_keys): sender = Sender(wallaroo_hostsvc, Reader( sequence_generator(start=num_start, stop=num_end, partition='key_%d' % p)), batch_size=batch_size, interval=interval, reconnect=True) senders.append(sender) for s in senders: s.start() for s in senders: s.join() sys.exit(0)