def __exit__(self, _type, _value, _traceback):
    logging.debug("{}.__exit__({}, {}, {})".format(
        self, _type, _value, _traceback))
    self.cluster.__exit__(None, None, None)  #_type, _value, _traceback)
    if _type or _value or _traceback:
        save_logs_to_file(self.base_dir, self.log_stream,
                          self.persistent_data)

def __exit__(self, _type, _value, _traceback):
    logging.debug("{}.__exit__({}, {}, {})".format(
        self, _type, _value, _traceback))
    try:
        self.cluster.__exit__(None, None, None)
    except Exception as err:
        #logging.exception(err)
        pass
    if _type or _value or _traceback:
        crashed_workers = list(
            filter(lambda r: r.returncode not in (0, -9, -15),
                   self.persistent_data.get('runner_data', [])))
        if crashed_workers:
            logging.error(
                "Some workers exited badly. The last {} lines of "
                "each were:\n\n{}".format(
                    FROM_TAIL,
                    runner_data_format(self.persistent_data.get(
                        'runner_data', []), from_tail=FROM_TAIL)))
        save_logs_to_file(self.base_dir, self.log_stream,
                          self.persistent_data)
    if _value is not None:
        raise _value

def _test_restart(command):
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    try:
        try:
            _run(command, persistent_data)
        except:
            logging.error("Restart_without_resilience test encountered an "
                          "error.")
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAwaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                if persistent_data.get('runner_data'):
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(
                                persistent_data.get('runner_data'),
                                from_tail=FROM_TAIL)))
                raise
    except Exception as err:
        # save log stream to file
        try:
            base_dir = ('/tmp/wallaroo_test_errors/testing/correctness/'
                        'tests/restart_without_resilience/{time}'.format(
                            time=t0.strftime('%Y%m%d_%H%M%S')))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving log files to {}".format(
                    base_dir))
        logging.exception(err)
        raise

def _test_resilience(command, ops=[], initial=None, sources=1,
                     partition_multiplier=5, cycles=1, validate_output=True,
                     sender_mps=1000, sender_interval=0.01, retry_count=5,
                     api=None):
    """
    Execute a resilience test for the given command.

    `command` - the command string to execute
    `ops` - the list of operations to perform.
    `initial` - (optional) the initial cluster size
    `sources` - the number of sources to use
    `partition_multiplier` - multiply the number of workers by this to
      determine how many partitions to use
    `cycles` - how many times to repeat the list of operations
    `validate_output` - whether or not to validate the output
    `sender_mps` - messages per second to send from the sender (default 1000)
    `sender_interval` - seconds between sender batches (default 0.01)
    `retry_count` - number of times to retry a test after
      RunnerHasntStartedError (default 5)
    `api` - the string name of the API being tested. Optional, used for
      naming error logs.
    """
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    res_ops = []
    try:
        try:
            _run(persistent_data=persistent_data,
                 res_ops=res_ops,
                 command=command,
                 ops=ops * cycles,
                 initial=initial,
                 sources=sources,
                 partition_multiplier=partition_multiplier,
                 validate_output=validate_output,
                 sender_mps=sender_mps,
                 sender_interval=sender_interval)
        except:
            logging.error(
                "Resilience test encountered an error after the steps"
                " {}".format([o.name() for o in res_ops]))
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except RunnerHasntStartedError as err:
                logging.warning("Runner failed to start properly.")
                if retry_count > 0:
                    logging.info("Restarting the test!")
                    _test_resilience(command=command,
                                     ops=ops,
                                     initial=initial,
                                     sources=sources,
                                     partition_multiplier=partition_multiplier,
                                     cycles=cycles,
                                     validate_output=validate_output,
                                     sender_mps=sender_mps,
                                     sender_interval=sender_interval,
                                     retry_count=retry_count - 1)
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAwaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                if persistent_data.get('runner_data'):
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(
                                persistent_data.get('runner_data'),
                                from_tail=FROM_TAIL)))
                raise
    except Exception as err:
        # save log stream to file
        try:
            cwd = os.getcwd()
            trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
            base_dir = (
                '/tmp/wallaroo_test_errors/{head}/{api}/{ops}/{time}'.format(
                    head=cwd[trunc_head:],
                    api=api,
                    time=t0.strftime('%Y%m%d_%H%M%S'),
                    ops='_'.join(
                        (o.name().replace(':', '') for o in ops * cycles))))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving log files to {}".format(
                    base_dir))
        logging.exception(err)
        raise

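# A minimal usage sketch for the variant above (illustrative only: the
# command string and the operation object are hypothetical stand-ins for
# whatever the calling test module actually defines):
#
#   _test_resilience("machida --application-module my_app",  # hypothetical
#                    ops=[SomeClusterOp()],                   # hypothetical op
#                    initial=2,
#                    cycles=1,
#                    validate_output=True,
#                    api="python")
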
def run_test(api, cmd, validation_cmd, topology, workers=1):
    max_retries = 3
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    cwd = os.getcwd()
    trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
    base_dir = ('/tmp/wallaroo_test_errors/{path}/{api}/{topo}/{workers}'
                '/{timestamp}'.format(
                    path=cwd[trunc_head:],
                    api=api,
                    topo='_'.join(topology),
                    workers='{}_workers'.format(workers),
                    timestamp=t0.strftime('%Y%m%d_%H%M%S')))
    persistent_data = {}
    steps_val = ' '.join('--{}'.format(s) for s in topology)
    output = 'received.txt'
    cmd_val = ("{cmd} {steps}".format(cmd=cmd, steps=steps_val))
    validation_cmd_val = ("{validation_cmd} {steps} "
                          "--output {output}".format(
                              validation_cmd=validation_cmd,
                              steps=steps_val,
                              output=output))

    expect_mod = get_expect_modifier(topology)
    logging.info("Expect mod is {} for topology {!r}".format(
        expect_mod, topology))
    send, expect = find_send_and_expect_values(expect_mod)
    logging.info("Sending {} messages per key".format(send))
    logging.info("Expecting {} final messages per key".format(expect))

    # Run the test!
    attempt = 0
    try:
        while True:
            attempt += 1
            try:
                # clean up data collection before each attempt
                persistent_data.clear()
                log_stream.seek(0)
                log_stream.truncate()

                # start test attempt
                logging.info("Integration test attempt {}".format(attempt))
                logging.debug("Running integration test with the following"
                              " options:")

                gens = [(sequence_generator(send, 0, '>I', 'key_0'), 0),
                        (sequence_generator(send, 0, '>I', 'key_1'), 0)]

                pipeline_test(generator=gens,
                              expected=None,
                              command=cmd_val,
                              workers=workers,
                              sources=1,
                              sinks=1,
                              mode='framed',
                              batch_size=1,
                              sink_expect=expect * len(gens),
                              sink_stop_timeout=5,
                              validate_file=output,
                              persistent_data=persistent_data,
                              log_error=False)
                # Test run was successful, break out of loop and proceed to
                # validation
                logging.info("Run phase complete. Proceeding to validation.")
                break
            except RunnerHasntStartedError:
                logging.warning("Runner failed to start properly.")
                if attempt < max_retries:
                    logging.info("Restarting the test!")
                    time.sleep(0.5)
                    continue
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except ClusterError as err:
                outputs = runner_data_format(
                    persistent_data.get('runner_data', []),
                    from_tail=20,
                    filter_fn=lambda r: True)
                logging.error("Worker outputs:\n\n{}\n".format(outputs))
                raise
            except:
                outputs = runner_data_format(
                    persistent_data.get('runner_data', []),
                    from_tail=20)
                if outputs:
                    logging.error("Worker outputs:\n\n{}\n".format(outputs))
                raise
    except Exception as err:
        logging.exception("Encountered an error while running the test for"
                          " %r\n===\n" % cmd)
        # Save logs to file in case of error
        try:
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.warning("Failed to save logs to file")
            logging.exception(err_inner)
        raise

    res = run_shell_cmd(validation_cmd_val)
    if res.success:
        if res.output:
            logging.info("Validation command '%s' completed successfully "
                         "with the output:\n--\n%s",
                         ' '.join(res.command), res.output)
        else:
            logging.info("Validation command '%s' completed successfully",
                         ' '.join(res.command))
    else:
        outputs = runner_data_format(persistent_data.get('runner_data', []))
        if outputs:
            logging.error("Application outputs:\n{}".format(outputs))
        logging.error("Validation command\n '{}'\nfailed with the output:\n"
                      "--\n{}".format(' '.join(res.command), res.output))
        # Save logs to file in case of error
        save_logs_to_file(base_dir, log_stream, persistent_data)

        if logging.root.level > logging.ERROR:
            # If failed, and logging level means we didn't log error, include
            # it in exit message
            print(res.output)
            exit(res.return_code)
        raise ValidationError()

    # Reached the end and nothing broke. Success!
    logging.info(
        "Topology test completed successfully for topology {!r}".format(
            topology))

    del persistent_data
    log_stream.close()
    logging.root.handlers.clear()
    del log_stream
    time.sleep(0.1)

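# A minimal usage sketch (illustrative only: the api name, command strings,
# and topology steps are hypothetical stand-ins for what a calling test
# module would pass in):
#
#   run_test(api='python',
#            cmd='machida --application-module topology_app',  # hypothetical
#            validation_cmd='python topology_app.py',          # hypothetical
#            topology=['filter', 'key_by'],                    # hypothetical
#            workers=2)
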
def _test_resilience(command, ops=[], initial=None, source_type='tcp',
                     source_name='Detector', source_number=1, partitions=40,
                     cycles=1, validation_cmd=False, sender_mps=1000,
                     sender_interval=0.01, retry_count=5, api=None):
    """
    Execute a resilience test for the given command.

    `command` - the command string to execute
    `ops` - the list of operations to perform.
    `initial` - (optional) the initial cluster size
    `source_type` - the type of the source ('tcp', 'gensource', 'alo')
    `source_name` - the name of the source (e.g. 'Detector')
    `source_number` - the number of workers to start sources on (default: 1)
    `partitions` - number of partitions to use (default: 40)
    `cycles` - how many times to repeat the list of operations
    `validation_cmd` - the command to use for validation (default: False)
    `sender_mps` - messages per second to send from the sender (default 1000)
    `sender_interval` - seconds between sender batches (default 0.01)
    `retry_count` - number of times to retry a test after
      RunnerHasntStartedError (default 5)
    `api` - the string name of the API being tested. Optional, used for
      naming error logs.
    """
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    res_ops = []
    try:
        try:
            _run(persistent_data=persistent_data,
                 res_ops=res_ops,
                 command=command,
                 ops=ops * cycles,
                 initial=initial,
                 source_type=source_type,
                 source_name=source_name,
                 source_number=source_number,
                 partitions=partitions,
                 validation_cmd=validation_cmd,
                 sender_mps=sender_mps,
                 sender_interval=sender_interval)
        except:
            logging.error(
                "Resilience test encountered an error after the steps"
                " {}".format([o.name() for o in res_ops]))
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except RunnerHasntStartedError as err:
                logging.warning("Runner failed to start properly.")
                if retry_count > 0:
                    logging.info("Restarting the test!")
                    _test_resilience(command=command,
                                     ops=ops,
                                     initial=initial,
                                     source_type=source_type,
                                     source_name=source_name,
                                     source_number=source_number,
                                     partitions=partitions,
                                     cycles=cycles,
                                     validation_cmd=validation_cmd,
                                     sender_mps=sender_mps,
                                     sender_interval=sender_interval,
                                     retry_count=retry_count - 1)
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAwaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                crashed_workers = list(
                    filter(lambda r: r.returncode not in (0, -9, -15),
                           persistent_data.get('runner_data', [])))
                if crashed_workers:
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(persistent_data.get(
                                'runner_data', []), from_tail=FROM_TAIL)))
                raise
        else:
            # no exception
            if SAVE_LOGS:
                # raise an error and save logs
                raise SaveLogs()
    except Exception as err:
        # save log stream to file
        try:
            cwd = os.getcwd()
            trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
            test_root = '/tmp/wallaroo_test_errors'
            base_dir = (
                '{test_root}/{head}/{api}/{src_type}/{src_num}/{ops}/{time}'
                .format(test_root=test_root,
                        head=cwd[trunc_head:],
                        api=api,
                        src_type=source_type,
                        src_num=source_number,
                        time=t0.strftime('%Y%m%d_%H%M%S'),
                        ops='_'.join(
                            (o.name().replace(':', '')
                             for o in ops * cycles))))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving log files to {}".format(
                    base_dir))
        logging.exception(err)
        raise

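# A minimal usage sketch for this variant (illustrative only: the command,
# operation object, and validation command are hypothetical stand-ins):
#
#   _test_resilience("machida --application-module my_app",  # hypothetical
#                    ops=[SomeClusterOp()],                   # hypothetical op
#                    initial=2,
#                    source_type='gensource',
#                    source_name='Detector',
#                    partitions=40,
#                    validation_cmd='my_validator --check',   # hypothetical
#                    api="python")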