Example #1
    def __exit__(self, _type, _value, _traceback):
        logging.debug("{}.__exit__({}, {}, {})".format(self, _type, _value,
                                                       _traceback))
        # Deliberately pass None rather than (_type, _value, _traceback) so
        # the cluster tears down as if no error had occurred.
        self.cluster.__exit__(None, None, None)
        if _type or _value or _traceback:
            save_logs_to_file(self.base_dir, self.log_stream,
                              self.persistent_data)
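The `__exit__` above always tears the cluster down and only persists logs when
the `with` block raised. A minimal usage sketch, assuming a hypothetical
`LoggedCluster` wrapper class that defines this `__exit__` and exposes
`base_dir`, `log_stream`, and `persistent_data`:

# Hypothetical driver; LoggedCluster and run_workload are illustrative names,
# not part of the original source.
with LoggedCluster(command='machida --application-module app') as env:
    run_workload(env)  # any exception here causes __exit__ to save the logs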
Example #2
    def __exit__(self, _type, _value, _traceback):
        logging.debug("{}.__exit__({}, {}, {})".format(self, _type, _value,
                                                       _traceback))
        try:
            self.cluster.__exit__(None, None, None)
        except Exception:
            # Errors during cluster teardown are deliberately swallowed so
            # that the crash reporting and log saving below still run.
            pass
        if _type or _value or _traceback:
            crashed_workers = list(
                filter(lambda r: r.returncode not in (0, -9, -15),
                       self.persistent_data.get('runner_data', [])))
            if crashed_workers:
                logging.error(
                    "Some workers exited badly. The last {} lines of "
                    "each were:\n\n{}".format(
                        FROM_TAIL,
                        runner_data_format(self.persistent_data.get(
                            'runner_data', []),
                                           from_tail=FROM_TAIL)))

            save_logs_to_file(self.base_dir, self.log_stream,
                              self.persistent_data)
        if _value is not None:
            raise _value
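Example #2 treats returncodes -9 and -15 as clean exits: `subprocess` reports a
negative returncode when a process was terminated by a signal, so -9 and -15
mean the worker was killed by SIGKILL or SIGTERM rather than crashing on its
own. A standalone sketch of the same crash filter, assuming `runner_data`
entries carry a `returncode` attribute as in the original:

# 0 = clean exit; -9 / -15 = terminated by SIGKILL / SIGTERM.
crashed_workers = [r for r in persistent_data.get('runner_data', [])
                   if r.returncode not in (0, -9, -15)]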
Example #3
def _test_restart(command):
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    try:
        try:
            _run(command, persistent_data)
        except:
            logging.error("Restart_without_resilience test encountered an "
                          "error.")
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAWaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                if persistent_data.get('runner_data'):
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(
                                persistent_data.get('runner_data'),
                                from_tail=FROM_TAIL)))
                raise
    except Exception as err:
        # save log stream to file
        try:
            base_dir = ('/tmp/wallaroo_test_errors/testing/correctness/'
                        'tests/restart_without_resilience/{time}'.format(
                            time=t0.strftime('%Y%m%d_%H%M%S')))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving logs files to {}".format(
                    base_dir))
        logging.exception(err)
        raise
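The bare `raise` inside the outer handler re-raises the exception currently
being handled, so the nested `except` clauses can dispatch on its type after
the generic message has been logged. A minimal self-contained sketch of that
pattern; `do_work` and the exception types are stand-ins, not names from the
test suite:

import logging

def do_work():
    raise ValueError("boom")

try:
    do_work()
except:
    logging.error("Work encountered an error.")
    # Re-raise the active exception so it can be handled by type below.
    try:
        raise
    except ValueError:
        logging.error("ValueError encountered.")
        raise
    except Exception:
        logging.error("Unexpected error encountered.")
        raise  # as in the originals, the exception propagates to the caller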
Example #4
def _test_resilience(command,
                     ops=[],
                     initial=None,
                     sources=1,
                     partition_multiplier=5,
                     cycles=1,
                     validate_output=True,
                     sender_mps=1000,
                     sender_interval=0.01,
                     retry_count=5,
                     api=None):
    """
    Execute a resilience test for the given command.

    `command` - the command string to execute
    `ops` - the list of operations to perform.
    `initial` - (optional) the initial cluster size
    `sources` - the number of sources to use
    `partition_multiplier` - multiply the number of workers by this to
      determine how many partitions to use
    `cycles` - how many times to repeat the list of operations
    `validate_output` - whether or not to validate the output
    `sender_mps` - messages per second to send from the sender (default 1000)
    `sender_interval` - seconds between sender batches (default 0.01)
    `retry_count` - number of times to retry a test after RunnerHasntStartedError
                    (default 5)
    `api` - the string name of the API being tested. Optional, used for naming
            error logs.
    """
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    res_ops = []
    try:
        try:
            _run(persistent_data=persistent_data,
                 res_ops=res_ops,
                 command=command,
                 ops=ops * cycles,
                 initial=initial,
                 sources=sources,
                 partition_multiplier=partition_multiplier,
                 validate_output=validate_output,
                 sender_mps=sender_mps,
                 sender_interval=sender_interval)
        except:
            logging.error(
                "Resilience test encountered an error after the steps"
                " {}".format([o.name() for o in res_ops]))
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except RunnerHasntStartedError as err:
                logging.warn("Runner failed to start properly.")
                if retry_count > 0:
                    logging.info("Restarting the test!")
                    _test_resilience(command=command,
                                     ops=ops,
                                     initial=initial,
                                     sources=sources,
                                     partition_multiplier=partition_multiplier,
                                     cycles=cycles,
                                     validate_output=validate_output,
                                     sender_mps=sender_mps,
                                     sender_interval=sender_interval,
                                     retry_count=retry_count - 1)
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAWaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                if persistent_data.get('runner_data'):
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(
                                persistent_data.get('runner_data'),
                                from_tail=FROM_TAIL)))
                raise
    except Exception as err:
        # save log stream to file
        try:
            cwd = os.getcwd()
            trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
            base_dir = (
                '/tmp/wallaroo_test_errors/{head}/{api}/{ops}/{time}'.format(
                    head=cwd[trunc_head:],
                    api=api,
                    time=t0.strftime('%Y%m%d_%H%M%S'),
                    ops='_'.join(
                        (o.name().replace(':', '') for o in ops * cycles))))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving log files to {}".format(
                    base_dir))
        logging.exception(err)
        raise
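A hypothetical invocation of the `_test_resilience` defined above. The command
string and the operation objects are stand-ins for whatever the surrounding
test suite provides; only the keyword arguments mirror the signature:

# Hypothetical call; CMD, Crash and Recover are illustrative stand-ins.
_test_resilience(CMD,
                 ops=[Crash(1), Recover(1)],
                 initial=3,
                 cycles=2,
                 api='python')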
Example #5
def run_test(api, cmd, validation_cmd, topology, workers=1):
    max_retries = 3
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    cwd = os.getcwd()
    trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
    base_dir = ('/tmp/wallaroo_test_errors/{path}/{api}/{topo}/{workers}'
                '/{timestamp}'.format(path=cwd[trunc_head:],
                                      api=api,
                                      topo='_'.join(topology),
                                      workers='{}_workers'.format(workers),
                                      timestamp=t0.strftime('%Y%m%d_%H%M%S')))
    persistent_data = {}

    steps_val = ' '.join('--{}'.format(s) for s in topology)
    output = 'received.txt'
    cmd_val = ("{cmd} {steps}".format(cmd=cmd, steps=steps_val))
    validation_cmd_val = ("{validation_cmd} {steps} "
                          "--output {output}".format(
                              validation_cmd=validation_cmd,
                              steps=steps_val,
                              output=output))

    expect_mod = get_expect_modifier(topology)
    logging.info("Expect mod is {} for topology {!r}".format(
        expect_mod, topology))
    send, expect = find_send_and_expect_values(expect_mod)
    logging.info("Sending {} messages per key".format(send))
    logging.info("Expecting {} final messages per key".format(expect))

    # Run the test!
    attempt = 0
    try:
        while True:
            attempt += 1
            try:
                # clean up data collection before each attempt
                persistent_data.clear()
                log_stream.seek(0)
                log_stream.truncate()

                # start test attempt
                logging.info("Integration test attempt {}".format(attempt))
                logging.debug("Running integration test with the following"
                              " options:")

                gens = [(sequence_generator(send, 0, '>I', 'key_0'), 0),
                        (sequence_generator(send, 0, '>I', 'key_1'), 0)]

                pipeline_test(generator=gens,
                              expected=None,
                              command=cmd_val,
                              workers=workers,
                              sources=1,
                              sinks=1,
                              mode='framed',
                              batch_size=1,
                              sink_expect=expect * len(gens),
                              sink_stop_timeout=5,
                              validate_file=output,
                              persistent_data=persistent_data,
                              log_error=False)
                # Test run was successful, break out of loop and proceed to
                # validation
                logging.info("Run phase complete. Proceeding to validation.")
                break
            except RunnerHasntStartedError:
                logging.warning("Runner failed to start properly.")
                if attempt < max_retries:
                    logging.info("Restarting the test!")
                    time.sleep(0.5)
                    continue
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except ClusterError as err:
                outputs = runner_data_format(persistent_data.get(
                    'runner_data', []),
                                             from_tail=20,
                                             filter_fn=lambda r: True)
                logging.error("Worker outputs:\n\n{}\n".format(outputs))
                raise
            except:
                outputs = runner_data_format(persistent_data.get(
                    'runner_data', []),
                                             from_tail=20)
                if outputs:
                    logging.error("Worker outputs:\n\n{}\n".format(outputs))
                raise
    except Exception as err:
        logging.exception("Encountered an error while running the test for"
                          " %r\n===\n" % cmd)
        # Save logs to file in case of error
        try:
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.warning("Failed to save logs to file")
            logging.exception(err_inner)
        raise

    res = run_shell_cmd(validation_cmd_val)
    if res.success:
        if res.output:
            logging.info(
                "Validation command '%s' completed successfully "
                "with the output:\n--\n%s", ' '.join(res.command), res.output)
        else:
            logging.info("Validation command '%s' completed successfully",
                         ' '.join(res.command))
    else:
        outputs = runner_data_format(persistent_data.get('runner_data', []))
        if outputs:
            logging.error("Application outputs:\n{}".format(outputs))
        logging.error("Validation command\n    '{}'\nfailed with the output:\n"
                      "--\n{}".format(' '.join(res.command), res.output))
        # Save logs to file in case of error
        save_logs_to_file(base_dir, log_stream, persistent_data)

        if logging.root.level > logging.ERROR:
            # If the logging level suppressed the error messages above, print
            # the output directly so the failure isn't silently lost.
            print(res.output)
            exit(res.return_code)
        raise ValidationError()

    # Reached the end and nothing broke. Success!
    logging.info(
        "Topology test completed successfully for topology {!r}".format(
            topology))
    del persistent_data
    log_stream.close()
    logging.root.handlers.clear()
    del log_stream
    time.sleep(0.1)
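A hypothetical call to `run_test`; the commands and topology step names are
stand-ins chosen only to match the signature and the `--{step}` flags the
function builds:

# Hypothetical invocation; the commands and step names are illustrative.
run_test(api='python',
         cmd='machida --application-module app',
         validation_cmd='python validate.py',
         topology=['filter', 'stateless_computation'],
         workers=2)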
Example #6
def _test_resilience(command,
                     ops=[],
                     initial=None,
                     source_type='tcp',
                     source_name='Detector',
                     source_number=1,
                     partitions=40,
                     cycles=1,
                     validation_cmd=False,
                     sender_mps=1000,
                     sender_interval=0.01,
                     retry_count=5,
                     api=None):
    """
    Execute a resilience test for the given command.

    `command` - the command string to execute
    `ops` - the list of operations to perform.
    `initial` - (optional) the initial cluster size
    `source_type` - the type of the source ('tcp', 'gensource', 'alo')
    `source_name` - the name of the source (e.g. 'Detector')
    `source_number` - the number of workers to start sources on (default: 1)
    `partitions` - number of partitions to use (default: 40)
    `cycles` - how many times to repeat the list of operations
    `validation_cmd` - the command to use for validation (default: False)
    `sender_mps` - messages per second to send from the sender (default 1000)
    `sender_interval` - seconds between sender batches (default 0.01)
    `retry_count` - number of times to retry a test after RunnerHasntStartedError
                    (default 5)
    `api` - the string name of the API being tested. Optional, used for naming
            error logs.
    """
    t0 = datetime.datetime.now()
    log_stream = add_in_memory_log_stream(level=logging.DEBUG)
    persistent_data = {}
    res_ops = []
    try:
        try:
            _run(persistent_data=persistent_data,
                 res_ops=res_ops,
                 command=command,
                 ops=ops * cycles,
                 initial=initial,
                 source_type=source_type,
                 source_name=source_name,
                 source_number=source_number,
                 partitions=partitions,
                 validation_cmd=validation_cmd,
                 sender_mps=sender_mps,
                 sender_interval=sender_interval)
        except:
            logging.error(
                "Resilience test encountered an error after the steps"
                " {}".format([o.name() for o in res_ops]))
            # Do this ugly thing to use proper exception handling here
            try:
                raise
            except RunnerHasntStartedError as err:
                logging.warning("Runner failed to start properly.")
                if retry_count > 0:
                    logging.info("Restarting the test!")
                    _test_resilience(command=command,
                                     ops=ops,
                                     initial=initial,
                                     source_type=source_type,
                                     source_name=source_name,
                                     source_number=source_number,
                                     partitions=partitions,
                                     cycles=cycles,
                                     validation_cmd=validation_cmd,
                                     sender_mps=sender_mps,
                                     sender_interval=sender_interval,
                                     retry_count=retry_count - 1)
                else:
                    logging.error("Max retry attempts reached.")
                    raise
            except SinkAwaitTimeoutError:
                logging.error("SinkAWaitTimeoutError encountered.")
                raise
            except TimeoutError:
                logging.error("TimeoutError encountered.")
                raise
            except:
                crashed_workers = list(
                    filter(lambda r: r.returncode not in (0, -9, -15),
                           persistent_data.get('runner_data', [])))
                if crashed_workers:
                    logging.error(
                        "Some workers exited badly. The last {} lines of "
                        "each were:\n\n{}".format(
                            FROM_TAIL,
                            runner_data_format(persistent_data.get(
                                'runner_data', []),
                                               from_tail=FROM_TAIL)))
                raise
        else:  # no exception
            if SAVE_LOGS:  # raise an error and save logs
                raise SaveLogs()
    except Exception as err:
        # save log stream to file
        try:
            cwd = os.getcwd()
            trunc_head = cwd.find('/wallaroo/') + len('/wallaroo/')
            test_root = '/tmp/wallaroo_test_errors'
            base_dir = (
                '{test_root}/{head}/{api}/{src_type}/{src_num}/{ops}/{time}'.
                format(test_root=test_root,
                       head=cwd[trunc_head:],
                       api=api,
                       src_type=source_type,
                       src_num=source_number,
                       time=t0.strftime('%Y%m%d_%H%M%S'),
                       ops='_'.join(
                           (o.name().replace(':', '') for o in ops * cycles))))
            save_logs_to_file(base_dir, log_stream, persistent_data)
        except Exception as err_inner:
            logging.exception(err_inner)
            logging.warning(
                "Encountered an error when saving logs files to {}".format(
                    base_dir))
        logging.exception(err)
        raise
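As with Example #4, a hypothetical invocation of this variant, this time
exercising the source-related parameters. The command, validation command, and
operation objects are illustrative stand-ins:

# Hypothetical call; CMD, VALIDATION_CMD, Crash and Recover are stand-ins.
_test_resilience(CMD,
                 ops=[Crash(1), Recover(1)],
                 initial=2,
                 source_type='gensource',
                 source_name='Detector',
                 source_number=1,
                 partitions=40,
                 validation_cmd=VALIDATION_CMD,
                 api='python')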