Example #1
def given_data_sent(cluster):
    reader = Reader(iter_generator(items=[chr(x+65).encode()
                                          for x in range(INPUT_ITEMS)],
                                   to_bytes=lambda s: pack('>2sI', s, 1)))
    sender = Sender(cluster.source_addrs[0],
                    reader,
                    batch_size=50, interval=0.05, reconnect=True)
    cluster.add_sender(sender, start=True)
    time.sleep(0.5)
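# A minimal sketch of the helpers assumed above: Reader and iter_generator
# come from the test harness, so this iter_generator is only the shape
# implied by its usage (an iterable of items plus an optional to_bytes
# encoder), not the harness's actual implementation.
from struct import pack

def iter_generator(items, to_bytes=lambda b: b):
    for item in items:
        yield to_bytes(item)

# With the '>2sI' format used above, a one-byte item is null-padded to two
# bytes and followed by a big-endian 32-bit 1:
assert pack('>2sI', b'A', 1) == b'A\x00\x00\x00\x00\x01'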
Example #2
def given_data_sent(cluster):
    values = [chr(x + 65) for x in range(INPUT_ITEMS)]
    reader = Reader(iter_generator(values))
    sender = Sender(cluster.source_addrs[0]["Dummy"],
                    reader,
                    batch_size=50,
                    interval=0.05,
                    reconnect=True)
    cluster.add_sender(sender, start=True)
    await_values = [
        struct.pack('>I', len(v.encode())) + v.encode() for v in values
    ]
    cluster.sink_await(await_values)
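# Hedged sketch of the framing the sink comparison above relies on: each
# expected value is its encoded payload prefixed with a big-endian 32-bit
# length. frame/unframe are illustrative helpers, not part of the harness.
import struct

def frame(payload):
    return struct.pack('>I', len(payload)) + payload

def unframe(data):
    (length,) = struct.unpack_from('>I', data)
    return data[4:4 + length]

assert frame(b'A') == b'\x00\x00\x00\x01A'
assert unframe(frame(b'A')) == b'A'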
Example #3
def _autoscale_sequence(command, ops=[1], cycles=1, initial=None):
    host = '127.0.0.1'
    sources = 1

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    bottom = min(min(compact_sign(ops * cycles)), sum(ops * cycles))
    if bottom < 1:
        min_workers = abs(bottom) + 1
    else:
        min_workers = 1
    if isinstance(initial, int):
        assert (initial >= min_workers)
        workers = initial
    else:
        workers = min_workers

    batch_size = 10
    interval = 0.05
    msgs_per_sec = int(batch_size / interval)
    base_time = 10  # Seconds
    cycle_time = 10  # seconds
    expect_time = base_time + cycle_time * cycles  # seconds
    expect = expect_time * msgs_per_sec
    sender_timeout = expect_time + 10  # seconds
    join_timeout = 200
    runner_join_timeout = 30

    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    setup_resilience_path(res_dir)

    steps = []

    runners = []
    try:
        try:
            # Create sink, metrics, reader, sender
            sink = Sink(host)
            metrics = Metrics(host)
            lowercase2 = [a + b for a in lowercase for b in lowercase]
            char_gen = cycle(lowercase2)
            chars = [next(char_gen) for i in range(expect)]
            expected = Counter(chars)

            reader = Reader(iter_generator(chars,
                                           lambda s: pack('>2sI', s, 1)))

            await_values = [
                pack('>I2sQ', 10, c, v) for c, v in expected.items()
            ]

            # Start sink and metrics, and get their connection info
            sink.start()
            sink_host, sink_port = sink.get_connection_info()
            outputs = '{}:{}'.format(sink_host, sink_port)

            metrics.start()
            metrics_host, metrics_port = metrics.get_connection_info()
            time.sleep(0.05)

            num_ports = sources + 3 + (2 * (workers - 1))
            ports = get_port_values(num=num_ports, host=host)
            (input_ports, (control_port, data_port, external_port),
             worker_ports) = (ports[:sources], ports[sources:sources + 3],
                              zip(ports[-(2 * (workers - 1)):][::2],
                                  ports[-(2 * (workers - 1)):][1::2]))
            inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

            start_runners(runners, command, host, inputs, outputs,
                          metrics_port, control_port, external_port, data_port,
                          res_dir, workers, worker_ports)

            # Wait for first runner (initializer) to report application ready
            runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
            runner_ready_checker.start()
            runner_ready_checker.join()
            if runner_ready_checker.error:
                raise runner_ready_checker.error

            # Get initial partition data
            partitions = query_partitions(host, external_port)
            # Verify all workers start with partitions
            assert (map(
                len, partitions['state_partitions']
                ['letter-state'].values()).count(0) == 0)

            # start sender
            sender = Sender(host,
                            input_ports[0],
                            reader,
                            batch_size=batch_size,
                            interval=interval)
            sender.start()

            time.sleep(2)

            # Perform autoscale cycles
            start_froms = {r: 0 for r in runners}
            for cyc in range(cycles):
                for joiners in ops:
                    steps.append(joiners)
                    joined = []
                    left = []
                    if joiners > 0:
                        # create a new worker and have it join
                        new_ports = get_port_values(num=(joiners * 2),
                                                    host=host,
                                                    base_port=25000)
                        joiner_ports = zip(new_ports[::2], new_ports[1::2])
                        for i in range(joiners):
                            add_runner(runners, command, host, inputs, outputs,
                                       metrics_port, control_port,
                                       external_port, data_port, res_dir,
                                       joiners, *joiner_ports[i])
                            joined.append(runners[-1])
                            start_froms[runners[-1]] = 0

                        patterns_i = ([
                            re.escape('***Worker {} attempting to join the '
                                      'cluster. Sent necessary information.***'
                                      .format(r.name)) for r in joined
                        ] + [
                            re.escape('Migrating partitions to {}'.format(
                                r.name)) for r in joined
                        ] + [
                            re.escape('--All new workers have acked migration '
                                      'batch complete'),
                            re.escape('~~~Resuming message processing.~~~')
                        ])
                        patterns_j = [
                            re.escape('***Successfully joined cluster!***'),
                            re.escape('~~~Resuming message processing.~~~')
                        ]

                        # Wait for runners to complete joining
                        join_checkers = []
                        join_checkers.append(
                            RunnerChecker(runners[0],
                                          patterns_i,
                                          timeout=join_timeout,
                                          start_from=start_froms[runners[0]]))
                        for runner in joined:
                            join_checkers.append(
                                RunnerChecker(runner,
                                              patterns_j,
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))
                        for jc in join_checkers:
                            jc.start()
                        for jc in join_checkers:
                            jc.join()
                            if jc.error:
                                outputs = runners_output_format(runners)
                                raise AutoscaleTimeoutError(
                                    "'{}' timed out on JOIN in {} "
                                    "seconds. The cluster had the following outputs:\n===\n{}"
                                    .format(jc.runner_name, jc.timeout,
                                            outputs),
                                    as_error=jc.error,
                                    as_steps=steps)

                    elif joiners < 0:  # negative ops mean leaving workers
                        # choose the most recent, still-alive runners to leave
                        leavers = abs(joiners)
                        idx = 1
                        while len(left) < leavers and idx < len(runners):
                            if runners[-idx].is_alive():
                                left.append(runners[-idx])
                            idx += 1
                        if len(left) < leavers:
                            raise AutoscaleTestError(
                                "Not enough workers left to "
                                "shrink! {} requested but "
                                "only {} live non-initializer"
                                "workers found!".format(joiners, len(left)))

                        # Create the checkers

                        initializer = [runners[0]]
                        remaining = [
                            r for r in runners
                            if r.is_alive() and r not in initializer + left
                        ]

                        patterns_i = ([
                            r'ExternalChannelConnectNotifier: initializer: '
                            r'server closed', r'Saving topology!',
                            r'Saving worker names to file: .*?initializer.'
                            r'workers'
                        ] + [
                            re.escape(
                                r'LocalTopology._save_worker_names: {}'.format(
                                    r.name)) for r in initializer + remaining
                        ] + [
                            re.escape(r'~~~Initiating shrink~~~'),
                            re.escape(r'-- Remaining workers:')
                        ] + [
                            re.escape(r'-- -- {}'.format(r.name))
                            for r in initializer + remaining
                        ] + [
                            re.escape(r'~~~Stopping message processing for '
                                      r'leaving workers.~~~'),
                            re.escape(r'~~~Resuming message processing.~~~')
                        ])

                        patterns_r = ([
                            re.escape(
                                r'Control Ch: Received Mute Request from initializer'
                            ),
                            re.escape(
                                r'~~~Stopping message processing for leaving workers.~~~'
                            ),
                            re.escape(
                                r'DataChannelConnectNotifier: server closed'),
                            re.escape(
                                r'ControlSenderConnectNotifier: server closed'
                            ),
                            re.escape(r'BoundaryNotify: closed'),
                            re.escape(
                                r'Control Ch: Received Unmute Request from initializer'
                            ),
                            re.escape(r'~~~Resuming message processing.~~~'),
                            re.escape(r'Shutting down OutgoingBoundary'),
                            re.escape(r'Shutting down ControlConnection')
                        ])
                        patterns_r_per = [
                            r'ControlChannelConnectNotifier:{}: server closed'
                        ]

                        patterns_l = ([
                            re.escape(
                                r'Control Ch: Received Mute Request from {}'.
                                format(r.name))
                            for r in initializer + remaining
                        ] + [
                            re.escape(
                                r'Migrating all partitions to {} remaining '
                                r'workers'.format(
                                    len(initializer + remaining))),
                            r'\^\^Migrating \d+ steps to {} workers'.format(
                                len(initializer + remaining))
                        ] + [
                            r'\^\^Migrating step \d+ to outgoing '
                            r'boundary {}/[0-9a-f]{{12}}'.format(r.name)
                            for r in initializer + remaining
                        ] + [
                            re.escape(
                                r'--Sending leaving worker done migration msg to cluster'
                            ),
                            re.escape(
                                r'Connections: Finished shutdown procedure.'),
                            re.escape(r'Shutting down ControlConnection'),
                            re.escape(r'Shutting down TCPSink'),
                            re.escape(r'Shutting down DataReceiver'),
                            re.escape(
                                r'Shutting down ReconnectingMetricsSink'),
                            re.escape(r'Shutting down OutgoingBoundary'),
                            re.escape(r'Shutting down Startup...'),
                            re.escape(r'Shutting down DataChannel'),
                            re.escape(r'metrics connection closed'),
                            re.escape(r'TCPSink connection closed'),
                            re.escape(
                                r'ControlChannelConnectNotifier: server closed'
                            )
                        ])
                        patterns_l_per = []

                        left_checkers = []

                        # initializer STDOUT checker
                        left_checkers.append(
                            RunnerChecker(
                                initializer[0],
                                patterns_i,
                                timeout=join_timeout,
                                start_from=start_froms[initializer[0]]))

                        # remaining workers STDOUT checkers
                        for runner in remaining:
                            left_checkers.append(
                                RunnerChecker(runner,
                                              patterns_r + [
                                                  p.format(runner.name)
                                                  for p in patterns_r_per
                                              ],
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))

                        # leaving workers STDOUT checkers
                        for runner in left:
                            left_checkers.append(
                                RunnerChecker(runner,
                                              patterns_l + [
                                                  p.format(runner.name)
                                                  for p in patterns_l_per
                                              ],
                                              timeout=join_timeout,
                                              start_from=start_froms[runner]))
                        for lc in left_checkers:
                            lc.start()

                        # Send the shrink command
                        send_shrink_cmd(host,
                                        external_port,
                                        names=[r.name for r in left])

                        # Wait for output checkers to join
                        for lc in left_checkers:
                            lc.join()
                            if lc.error:
                                outputs = runners_output_format(runners)
                                raise AutoscaleTimeoutError(
                                    "'{}' timed out on SHRINK in {} "
                                    "seconds. The cluster had the following outputs:\n===\n{}"
                                    .format(lc.runner_name, lc.timeout,
                                            outputs),
                                    as_error=lc.error,
                                    as_steps=steps)

                    else:  # Handle the 0 case as a noop
                        continue

                    start_froms = {r: r.tell() for r in runners}

                    # Validate autoscale via partition query
                    try:
                        partitions = query_partitions(host, external_port)
                        phase_validate_partitions(
                            runners,
                            partitions,
                            joined=[r.name for r in joined],
                            left=[r.name for r in left])
                    except Exception as err:
                        print(
                            'error validating {} have joined and {} have left'.
                            format([r.name for r in joined],
                                   [r.name for r in left]))
                        raise err

            # wait until the sender completes (up to sender_timeout seconds)
            sender.join(sender_timeout)
            if sender.error:
                raise sender.error
            if sender.is_alive():
                sender.stop()
                raise TimeoutError('Sender did not complete in the expected '
                                   'period')

            # Use Sink value to determine when to stop runners and sink
            stopper = SinkAwaitValue(sink, await_values, 30)
            stopper.start()
            stopper.join()
            if stopper.error:
                print('sink.data', len(sink.data))
                print('await_values', len(await_values))
                raise stopper.error

            # stop application workers
            for r in runners:
                r.stop()

            # Stop sink
            sink.stop()

            # Stop metrics
            metrics.stop()

            # validate output
            phase_validate_output(runners, sink, expected)

        finally:
            for r in runners:
                r.stop()
            # Wait on runners to finish waiting on their subprocesses to exit
            for r in runners:
                r.join(runner_join_timeout)
            alive = []
            for r in runners:
                if r.is_alive():
                    alive.append(r)
            for r in runners:
                ec = r.poll()
                if ec != 0:
                    print('Worker {!r} exited with return code {}'.format(
                        r.name, ec))
                    print('Its last 5 log lines were:')
                    print('\n'.join(r.get_output().splitlines()[-5:]))
                    print()
            if alive:
                alive_names = ', '.join((r.name for r in alive))
                outputs = runners_output_format(runners)
                for a in alive:
                    a.kill()
            clean_resilience_path(res_dir)
            if alive:
                raise PipelineTestError(
                    "Runners [{}] failed to exit cleanly after"
                    " {} seconds.\n"
                    "Runner outputs are attached below:"
                    "\n===\n{}".format(alive_names, runner_join_timeout,
                                       outputs))

    except Exception as err:
        if not hasattr(err, 'as_steps'):
            err.as_steps = steps
        raise err
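# Hedged sketch of the min_workers reasoning above. compact_sign is a
# harness helper; lowest_cumulative_point below only illustrates the idea:
# find the lowest running total of the grow/shrink deltas, then start with
# enough workers that the cluster never drops below one.
def lowest_cumulative_point(deltas):
    total = 0
    lowest = 0
    for delta in deltas:
        total += delta
        lowest = min(lowest, total)
    return lowest

# e.g. ops=[2, -3] dips to -1 relative to the start, so at least
# abs(-1) + 1 == 2 initial workers are required.
assert lowest_cumulative_point([2, -3]) == -1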
Example #4
def _run(persistent_data,
         res_ops,
         command,
         ops=[],
         initial=None,
         sources=1,
         partition_multiplier=1,
         validate_output=True,
         sender_mps=1000,
         sender_interval=0.01):
    host = '127.0.0.1'
    sinks = 1
    sink_mode = 'framed'
    batch_size = int(sender_mps * sender_interval)
    logging.debug("batch_size is {}".format(batch_size))

    if not isinstance(ops, (list, tuple)):
        raise TypeError("ops must be a list or tuple of operations")

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        if isinstance(ops[0], Recover):
            raise ValueError("The first operation cannot be Recover")
        lowest = lowest_point(ops)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            logging.debug('initial: {}'.format(initial))
            logging.debug('min: {}'.format(min_workers))
            assert (initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert (initial > 0)
        workers = initial

    logging.info("Initial cluster size: {}".format(workers))

    partition_multiplier = 5  # Fixed value used in partition count creation (overrides the argument)
    # create the sequence generator and the reader
    msg = MultiSequenceGenerator(base_parts=workers * partition_multiplier - 1)

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 persistent_data=persistent_data) as cluster:

        # start senders
        for s in range(sources):
            sender = Sender(cluster.source_addrs[0],
                            Reader(msg),
                            batch_size=batch_size,
                            interval=sender_interval,
                            reconnect=True)
            cluster.add_sender(sender, start=True)

        # let the senders send some data first
        time.sleep(1)

        # loop over ops, keeping the result and passing it to the next op
        res = None
        assert (not cluster.get_crashed_workers())
        for op in ops:
            res_ops.append(op)
            logging.info("Executing: {}".format(op))
            res = op.apply(cluster, res)
            assert (not cluster.get_crashed_workers())

        # Wait a full second for things to calm down
        time.sleep(1)

        # If using external senders, wait for them to stop cleanly
        if cluster.senders:
            # Tell the multi-sequence-sender to stop
            msg.stop()

            # wait for senders to reach the end of their readers and stop
            for s in cluster.senders:
                cluster.wait_for_sender(s)

            # Validate all sender values caught up
            stop_value = max(msg.seqs)
            t0 = time.time()
            while True:
                try:
                    assert (len(msg.seqs) == msg.seqs.count(stop_value))
                    break
                except AssertionError:
                    if time.time() - t0 > 2:
                        logging.error("msg.seqs aren't all equal: {}".format(
                            msg.seqs))
                        raise
                time.sleep(0.1)

            # Create await_values for the sink based on the stop values from
            # the multi sequence generator
            await_values = []
            for part, val in enumerate(msg.seqs):
                key = '{:07d}'.format(part).encode()
                data = '[{},{},{},{}]'.format(
                    *[val - x for x in range(3, -1, -1)]).encode()
                await_values.append((key, data))
            cluster.sink_await(values=await_values, func=parse_sink_value)

        logging.info("Completion condition achieved. Shutting down cluster.")

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        if validate_output:
            # TODO: move to validations.py
            out_file = os.path.join(cluster.res_dir, 'received.txt')
            cluster.sinks[0].save(out_file)

            # Validate captured output
            logging.info("Validating output")
            # if senders == 0, using internal source
            if cluster.senders:
                cmd_validate = (
                    'validator -i {out_file} -e {expect} -a'.format(
                        out_file=out_file, expect=stop_value))
            else:
                cmd_validate = ('validator -i {out_file} -a'.format(
                    out_file=out_file))
            res = run_shell_cmd(cmd_validate)
            try:
                assert (res.success)
                logging.info("Validation successful")
            except AssertionError:
                raise AssertionError('Validation failed with the following '
                                     'error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        if cluster.restarted_workers:
            # TODO: move to validations.py
            logging.info("Validating recovery")
            pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
            for r in cluster.restarted_workers:
                stdout = r.get_output()
                try:
                    assert (re.search(pattern, stdout) is not None)
                    logging.info("{} recovered successfully".format(r.name))
                except AssertionError:
                    raise AssertionError(
                        'Worker {} does not appear to have performed '
                        'recovery as expected.'.format(r.name))
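# Illustrative only: the shape of the (key, data) pairs built for
# cluster.sink_await above, assuming a partition stopped at value 12. The
# key is the zero-padded partition id; the data is the final window of four
# consecutive values ending at the stop value.
def expected_sink_pair(part, val):
    key = '{:07d}'.format(part).encode()
    data = '[{},{},{},{}]'.format(
        *[val - x for x in range(3, -1, -1)]).encode()
    return (key, data)

assert expected_sink_pair(0, 12) == (b'0000000', b'[9,10,11,12]')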
Example #5
def _test_log_rotation_file_size_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    event_log_file_size = 50000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)
    alt_block = '--event-log-file-size {}'.format(event_log_file_size)
    alt_func = lambda x: x > 0

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3)
        ])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports, alt_block, alt_func)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        # Wait for runner to complete a log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop the worker in a non-graceful fashion so it doesn't remove
        # recovery files
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            print runners[-1].name
            print runners[-1].get_output()
            print '---'
            print runners[-2].name
            print runners[-2].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker underwent log rotation, but not initializer
        i, r = 1, runners[1]
        stdout = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (i, r.name, log_rotated_pattern, stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s' % stdout)
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
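# Worked example of the await_values construction shared by the resilience
# tests above and below (byte strings in the Python 2 originals). With
# expect=2000, the even stream ends on [1994,1996,1998,2000] and the odd
# stream on [1993,1995,1997,1999]; each is framed with a big-endian 32-bit
# length prefix before being awaited at the sink.
expect = 2000
last_value_0 = '[{}]'.format(','.join(
    str(expect - v) for v in range(6, -2, -2)))
assert last_value_0 == '[1994,1996,1998,2000]'
last_value_1 = '[{}]'.format(','.join(
    str(expect - 1 - v) for v in range(6, -2, -2)))
assert last_value_1 == '[1993,1995,1997,1999]'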
Example #6
def _test_log_rotation_external_trigger_no_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        num_ports = sources + 3 * workers
        ports = get_port_values(num=num_ports, host=host)
        (input_ports, worker_ports) = (ports[:sources], [
            ports[sources:][i:i + 3] for i in range(0, len(ports[sources:]), 3)
        ])
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      res_dir, workers, worker_ports)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        res = run_shell_cmd(cmd_external_trigger)
        try:
            assert (res.success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(res.output))

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()
            print '---'
            print runners[1].name
            print runners[1].get_output()
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate all workers underwent log rotation
        for r in runners[1:]:
            stdout = r.get_output()
            try:
                assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                        is not None)
            except AssertionError:
                raise AssertionError('Worker %r does not appear to have '
                                     'performed log rotation as expected.'
                                     ' The pattern %r '
                                     'is missing from the Worker output '
                                     'included below.\nSTDOUT\n---\n%s\n'
                                     '---\n' %
                                     (r.name, log_rotated_pattern, stdout))
    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
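# Worked example of the port bookkeeping used in the example above: with
# sources=1 and workers=2, num_ports = 1 + 3 * 2 = 7. The first port feeds
# the source and the rest are chunked into one triple of three ports per
# worker (their exact roles are assigned inside start_runners). The port
# numbers here are made up for illustration.
sources = 1
ports = [7000, 7001, 7002, 7003, 7004, 7005, 7006]
input_ports = ports[:sources]
worker_ports = [ports[sources:][i:i + 3]
                for i in range(0, len(ports[sources:]), 3)]
assert input_ports == [7000]
assert worker_ports == [[7001, 7002, 7003], [7004, 7005, 7006]]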
Example #7
def _test_restart(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=1,
                        interval=0.05,
                        reconnect=True)
        sender.start()
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~10 seconds)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Wait for the last sent value expected at the worker
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            print 'sink data'
            print sink.data
            print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()

        # Validate worker actually underwent recovery
        pattern_restarting = "Restarting a listener ..."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern_restarting, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have reconnected '
                                 'as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))
    finally:
        for r in runners:
            r.stop()
Example #8
def _test_log_rotation_external_trigger_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    command = '''{} \
        --log-rotation \
        --stop-pause {}
    '''.format(command, STOP_THE_WORLD_PAUSE)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()

        time.sleep(0.5)
        # Trigger log rotation with external message
        cmd_external_trigger = ('external_sender -e {}:{} -t rotate-log -m '
                                'worker1'.format(host, external_port))

        success, stdout, retcode, cmd = ex_validate(cmd_external_trigger)
        try:
            assert (success)
        except AssertionError:
            raise AssertionError('External rotation trigger failed with '
                                 'the error:\n{}'.format(stdout))

        # Check for log rotation
        log_rotated_checker = RunnerChecker(runners[1],
                                            log_rotated_patterns,
                                            timeout=AWAIT_TIMEOUT)
        log_rotated_checker.start()
        log_rotated_checker.join()
        if log_rotated_checker.error:
            raise log_rotated_checker.error

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        runners[-1].kill()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(30)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, AWAIT_TIMEOUT)
        stopper.start()
        stopper.join()
        if stopper.error:
            for r in runners:
                print r.name
                print r.get_output()[0]
                print '---'
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[0].name
            print runners[0].get_output()[0]
            print '---'
            print runners[1].name
            print runners[1].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate all workers underwent log rotation
        r = runners[1]
        stdout, stderr = r.get_output()
        try:
            assert (re.search(log_rotated_pattern, stdout, re.M | re.S)
                    is not None)
        except AssertionError:
            raise AssertionError('Worker %d.%r does not appear to have '
                                 'performed log rotation as expected.'
                                 ' The pattern %r '
                                 'is missing from the Worker output '
                                 'included below.\nSTDOUT\n---\n%s\n'
                                 '---\n' %
                                 (1, r.name, log_rotated_pattern, stdout))
        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError(
                'Worker %d.%r does not appear to have '
                'performed recovery as expected. Worker '
                'output is '
                'included below.\nSTDOUT\n---\n%s\n---\n'
                'STDERR\n---\n%s' %
                (len(runners) - 1, runners[-1].name, stdout, stderr))
    finally:
        for r in runners:
            r.stop()
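# Minimal, illustrative sketch of the pattern-await idea that RunnerChecker
# implements in these tests: poll a runner's accumulated output until every
# regex pattern has appeared or a timeout expires. This is not the harness's
# real class; it only assumes a get_output() method returning the runner's
# stdout as a single string.
import re
import time

def await_patterns(runner, patterns, timeout=30, poll_interval=0.1):
    deadline = time.time() + timeout
    remaining = list(patterns)
    while remaining and time.time() < deadline:
        output = runner.get_output()
        remaining = [p for p in remaining if re.search(p, output) is None]
        if remaining:
            time.sleep(poll_interval)
    return remaining  # an empty list means every pattern was seen in time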
Example #9
def _test_recovery(command):
    host = '127.0.0.1'
    sources = 1
    workers = 2
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader = Reader(sequence_generator(expect))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners[0], timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender
        sender = Sender(host,
                        input_ports[0],
                        reader,
                        batch_size=100,
                        interval=0.05)
        sender.start()
        time.sleep(0.2)

        # stop worker
        runners[-1].stop()

        ## restart worker
        runners.append(runners[-1].respawn())
        runners[-1].start()

        # wait until sender completes (~1 second)
        sender.join(5)
        if sender.error:
            raise sender.error
        if sender.is_alive():
            sender.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print 'sink.data size: ', len(sink.data)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print runners[-1].get_output()[0]
            print '---'
            print runners[-2].get_output()[0]
            print '---'
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

        # Validate worker actually underwent recovery
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        stdout, stderr = runners[-1].get_output()
        try:
            assert (re.search(pattern, stdout) is not None)
        except AssertionError:
            raise AssertionError('Worker does not appear to have performed '
                                 'recovery as expected. Worker output is '
                                 'included below.\nSTDOUT\n---\n%s\n---\n'
                                 'STDERR\n---\n%s' % (stdout, stderr))

    finally:
        for r in runners:
            r.stop()
        clean_resilience_path(res_dir)
Example #10
def _run(command, persistent_data):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 200
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2)))).encode()
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2)))).encode()
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 persistent_data=persistent_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=1,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.5)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        ## restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~10 seconds)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

    logging.debug("validating restarted worker stdout")
    # Validate worker actually underwent recovery
    pattern_restarting = "Restarting a listener ..."
    stdout = persistent_data['runner_data'][2].stdout
    try:
        assert (re.search(pattern_restarting, stdout) is not None)
    except AssertionError:
        raise AssertionError('Worker does not appear to have reconnected '
                             'as expected. Worker output is '
                             'included below.\nSTDOUT\n---\n%s' % stdout)
Example #11
def _run(command, runner_data=[]):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'
    workers = 2
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2)))).encode()
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2)))).encode()
    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0,
                    struct.pack('>I', len(last_value_1)) + last_value_1)

    # Start cluster
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:
        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        Reader(sequence_generator(expect)),
                        batch_size=100,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)

        # wait for some data to go through the system
        time.sleep(0.2)

        # stop worker in a non-graceful fashion so that recovery files
        # aren't removed
        logging.debug("Killing worker")
        killed = cluster.kill_worker(worker=-1)

        ## restart worker
        logging.debug("Restarting worker")
        cluster.restart_worker(killed)

        # wait until sender completes (~1 second)
        logging.debug("Waiting for sender to complete")
        cluster.wait_for_sender()

        # Wait for the last sent value expected at the worker
        logging.debug("Waiting for sink to complete")
        cluster.sink_await(await_values)

        # stop the cluster
        logging.debug("Stopping cluster")
        cluster.stop_cluster()

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(cluster.res_dir, 'received.txt')
        cluster.sinks[0].save(out_file, mode='giles')

        # Validate captured output
        logging.debug("Validating output")
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        res = run_shell_cmd(cmd_validate)
        try:
            assert (res.success)
        except AssertionError:
            raise AssertionError('Output validation failed with the following '
                                 'error:\n{}'.format(res.output))

        # Validate worker actually underwent recovery
        logging.debug("Validating recovery from worker stdout")
        pattern = "RESILIENCE\: Replayed \d+ entries from recovery log file\."
        try:
            assert (re.search(pattern, cluster.runners[-1].get_output())
                    is not None)
        except AssertionError:
            raise AssertionError("Worker does not appear to have performed "
                                 "recovery as expected.")
Example #12
def _test_autoscale_grow(command):
    host = '127.0.0.1'
    sources = 1
    workers = 1
    res_dir = '/tmp/res-data'
    expect = 2000
    last_value_0 = '[{}]'.format(','.join(
        (str(expect - v) for v in range(6, -2, -2))))
    last_value_1 = '[{}]'.format(','.join(
        (str(expect - 1 - v) for v in range(6, -2, -2))))

    await_values = (struct.pack('>I', len(last_value_0)) + last_value_0.encode(),
                    struct.pack('>I', len(last_value_1)) + last_value_1.encode())

    patterns_i = [
        re.escape(r'***Worker worker1 attempting to join the '
                  r'cluster. Sent necessary information.***'),
        re.escape(r'Migrating partitions to worker1'),
        re.escape(r'--All new workers have acked migration '
                  r'batch complete'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
    patterns_w = [
        re.escape(r'***Successfully joined cluster!***'),
        re.escape(r'~~~Resuming message processing.~~~')
    ]
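    # Note: re.escape is required above because the literal log lines contain
    # regex metacharacters such as '*' and '.'; unescaped, the leading '***'
    # would not even compile as a pattern.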

    setup_resilience_path(res_dir)

    runners = []
    try:
        # Create sink, metrics, reader, sender
        sink = Sink(host)
        metrics = Metrics(host)
        reader1 = Reader(sequence_generator(expect - 1000))
        reader2 = Reader(sequence_generator(expect, 1000))

        # Start sink and metrics, and get their connection info
        sink.start()
        sink_host, sink_port = sink.get_connection_info()
        outputs = '{}:{}'.format(sink_host, sink_port)

        metrics.start()
        metrics_host, metrics_port = metrics.get_connection_info()
        time.sleep(0.05)

        input_ports, control_port, external_port, data_port = (get_port_values(
            host, sources))
        inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

        start_runners(runners, command, host, inputs, outputs, metrics_port,
                      control_port, external_port, data_port, res_dir, workers)

        # Wait for first runner (initializer) to report application ready
        runner_ready_checker = RunnerReadyChecker(runners, timeout=30)
        runner_ready_checker.start()
        runner_ready_checker.join()
        if runner_ready_checker.error:
            raise runner_ready_checker.error

        # start sender1 (0,1000]
        sender1 = Sender(host,
                         input_ports[0],
                         reader1,
                         batch_size=10,
                         interval=0.05)
        sender1.start()

        # wait until sender1 completes (~5 seconds)
        sender1.join(30)
        if sender1.error:
            raise sender1.error
        if sender1.is_alive():
            sender1.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # create a new worker and have it join
        add_runner(runners, command, host, inputs, outputs, metrics_port,
                   control_port, external_port, data_port, res_dir, workers)

        # Wait for runner to complete a log rotation
        join_checker_i = RunnerChecker(runners[0], patterns_i, timeout=30)
        join_checker_w = RunnerChecker(runners[1], patterns_w, timeout=30)
        join_checker_i.start()
        join_checker_w.start()
        join_checker_i.join()
        if join_checker_i.error:
            print('worker output:')
            print(runners[1].get_output()[0])
            raise join_checker_i.error
        join_checker_w.join()
        if join_checker_w.error:
            print('initializer output:')
            print(runners[0].get_output()[0])
            raise join_checker_w.error

        # Start sender2 (1000, 2000]
        sender2 = Sender(host,
                         input_ports[0],
                         reader2,
                         batch_size=10,
                         interval=0.05)
        sender2.start()

        # wait until sender2 completes (~5 seconds)
        sender2.join(30)
        if sender2.error:
            raise sender2.error
        if sender2.is_alive():
            sender2.stop()
            raise TimeoutError('Sender did not complete in the expected '
                               'period')

        # Use Sink value to determine when to stop runners and sink
        stopper = SinkAwaitValue(sink, await_values, 30)
        stopper.start()
        stopper.join()
        if stopper.error:
            raise stopper.error

        # stop application workers
        for r in runners:
            r.stop()

        # Stop sink
        sink.stop()
        print('sink.data size: ', len(sink.data))

        # Stop metrics
        metrics.stop()

        # parse metrics data and validate worker has shifted from 1 to 2
        # workers
        mp = MetricsParser()
        mp.load_string_list(metrics.data)
        mp.parse()
        # Now confirm that there are computations in worker1's metrics
        app_key = list(mp.data.keys())[0]  # 'metrics:Sequence Window Printer'
        worker_metrics = [
            v for v in mp.data[app_key].get('worker1', []) if v[0] == 'metrics'
        ]
        # Verify there is at least one entry for a computation with a nonzero
        # total value
        print('worker_metrics', worker_metrics)
        filtered = [
            v for v in worker_metrics
            if v[1]['metric_category'] == 'computation' and v[1]['total'] > 0
        ]
        print('filtered', filtered)
        assert (len(filtered) > 0)

        # Use validator to validate the data in at-least-once mode
        # save sink data to a file
        out_file = os.path.join(res_dir, 'received.txt')
        sink.save(out_file, mode='giles')

        # Validate captured output
        cmd_validate = ('validator -i {out_file} -e {expect} -a'.format(
            out_file=out_file, expect=expect))
        success, stdout, retcode, cmd = ex_validate(cmd_validate)
        try:
            assert (success)
        except AssertionError:
            print(runners[-1].get_output()[0])
            print('---')
            print(runners[-2].get_output()[0])
            print('---')
            raise AssertionError('Validation failed with the following '
                                 'error:\n{}'.format(stdout))

    finally:
        for r in runners:
            r.stop()
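
The await_values in Examples #11 and #12 follow the sink's length-prefixed framing: each expected record is its payload preceded by the payload's length packed as a big-endian unsigned 32-bit integer ('>I'). The helpers below are a minimal, self-contained sketch of that convention; frame/unframe are illustrative names, not part of the test harness, and the sample payload is what last_value_0 evaluates to for expect=2000.

import struct

def frame(payload):
    # Prefix the payload with its byte length as a big-endian uint32,
    # mirroring struct.pack('>I', len(...)) + payload above.
    return struct.pack('>I', len(payload)) + payload

def unframe(message):
    # Strip the 4-byte length header and return exactly that many payload bytes.
    (length,) = struct.unpack('>I', message[:4])
    return message[4:4 + length]

payload = b'[1994,1996,1998,2000]'  # last_value_0 for expect=2000, encoded
assert frame(payload)[:4] == struct.pack('>I', len(payload))
assert unframe(frame(payload)) == payload
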
Example #13
0
def _autoscale_sequence(command, ops=[], cycles=1, initial=None):
    host = '127.0.0.1'
    sources = 1

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        lowest = lowest_point(ops * cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert (initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert (initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05
    sender_timeout = 30  # Counted from when Sender is stopped
    runner_join_timeout = 30

    res_dir = tempfile.mkdtemp(dir='/tmp/', prefix='res-data.')
    setup_resilience_path(res_dir)

    steps = []

    runners = []
    try:
        try:
            # Create sink, metrics, reader, sender
            sink = Sink(host)
            metrics = Metrics(host)
            lowercase2 = [a + b for a in lowercase for b in lowercase]
            char_cycle = cycle(lowercase2)
            expected = Counter()

            def count_sent(s):
                expected[s] += 1

            reader = Reader(
                iter_generator(items=char_cycle,
                               to_string=lambda s: pack('>2sI', s, 1),
                               on_next=count_sent))

            # Start sink and metrics, and get their connection info
            sink.start()
            sink_host, sink_port = sink.get_connection_info()
            outputs = '{}:{}'.format(sink_host, sink_port)

            metrics.start()
            metrics_host, metrics_port = metrics.get_connection_info()
            time.sleep(0.05)

            num_ports = sources + 3 + (2 * (workers - 1))
            ports = get_port_values(num=num_ports, host=host)
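            # Split the allocated ports: one per source, then the
            # control/data/external trio, then two ports for each
            # non-initializer worker (paired up below).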
            (input_ports, (control_port, data_port, external_port),
             worker_ports) = (ports[:sources], ports[sources:sources + 3],
                              zip(ports[-(2 * (workers - 1)):][::2],
                                  ports[-(2 * (workers - 1)):][1::2]))
            inputs = ','.join(['{}:{}'.format(host, p) for p in input_ports])

            # Prepare query functions with host and port pre-defined
            query_func_partitions = partial(partitions_query, host,
                                            external_port)
            query_func_partition_counts = partial(partition_counts_query, host,
                                                  external_port)
            query_func_cluster_status = partial(cluster_status_query, host,
                                                external_port)

            # Start the initial runners
            start_runners(runners, command, host, inputs, outputs,
                          metrics_port, control_port, external_port, data_port,
                          res_dir, workers, worker_ports)

            # Verify cluster is processing messages
            obs = ObservabilityNotifier(query_func_cluster_status,
                                        test_cluster_is_processing)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify that `workers` workers are active
            # Create a partial function
            partial_test_worker_count = partial(test_worker_count, workers)
            obs = ObservabilityNotifier(query_func_cluster_status,
                                        partial_test_worker_count)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # Verify all workers start with partitions
            obs = ObservabilityNotifier(query_func_partitions,
                                        test_all_workers_have_partitions)
            obs.start()
            obs.join()
            if obs.error:
                raise obs.error

            # start sender
            sender = Sender(host,
                            input_ports[0],
                            reader,
                            batch_size=batch_size,
                            interval=interval)
            sender.start()
            # Give the cluster 1 second to build up some state
            time.sleep(1)

            # Perform autoscale cycles
            for cyc in range(cycles):
                for joiners in ops:
                    # Verify cluster is processing before proceeding
                    obs = ObservabilityNotifier(query_func_cluster_status,
                                                test_cluster_is_processing,
                                                timeout=30)
                    obs.start()
                    obs.join()
                    if obs.error:
                        raise obs.error

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # get partition data before autoscale operation begins
                    pre_partitions = query_func_partitions()
                    steps.append(joiners)
                    joined = []
                    left = []
                    if joiners > 0:  # autoscale: grow
                        # create a new worker and have it join
                        new_ports = get_port_values(num=(joiners * 2),
                                                    host=host,
                                                    base_port=25000)
                        joiner_ports = list(zip(new_ports[::2], new_ports[1::2]))
                        for i in range(joiners):
                            add_runner(runners, command, host, inputs, outputs,
                                       metrics_port, control_port,
                                       external_port, data_port, res_dir,
                                       joiners, *joiner_ports[i])
                            joined.append(runners[-1])

                        # Verify cluster has resumed processing
                        obs = ObservabilityNotifier(query_func_cluster_status,
                                                    test_cluster_is_processing,
                                                    timeout=120)
                        obs.start()
                        obs.join()
                        if obs.error:
                            raise obs.error

                        # Test: all workers have partitions, partitions ids
                        # for new workers have been migrated from pre-join
                        # workers
                        # create list of joining workers
                        diff_names = {'joining': [r.name for r in joined]}
                        # Create partial function of the test with the
                        # data baked in
                        tmp = partial(test_migrated_partitions, pre_partitions,
                                      diff_names)
                        # Start the test notifier
                        obs = ObservabilityNotifier(
                            query_func_partitions,
                            [test_all_workers_have_partitions, tmp])
                        obs.start()
                        obs.join()
                        if obs.error:
                            raise obs.error

                    elif joiners < 0:  # autoscale: shrink
                        # choose the most recent, still-alive runners to leave
                        leavers = abs(joiners)
                        idx = 1
                        while len(left) < leavers and idx < len(runners):
                            if runners[-idx].is_alive():
                                left.append(runners[-idx])
                            idx += 1
                        if len(left) < leavers:
                            raise AutoscaleTestError(
                                "Not enough workers left to "
                                "shrink! {} requested but "
                                "only {} live non-initializer "
                                "workers found!".format(leavers, len(left)))

                        # Send the shrink command
                        resp = send_shrink_cmd(host,
                                               external_port,
                                               names=[r.name for r in left])
                        print("Sent a shrink command for {}".format(
                            [r.name for r in left]))
                        print("Response was: {}".format(resp))

                        # Verify cluster has resumed processing
                        obs = ObservabilityNotifier(query_func_cluster_status,
                                                    test_cluster_is_processing,
                                                    timeout=120)
                        obs.start()
                        obs.join()
                        if obs.error:
                            raise obs.error

                        # Test: all workers have partitions, partitions ids
                        # from departing workers have been migrated to remaining
                        # workers
                        # create list of leaving workers
                        diff_names = {'leaving': [r.name for r in left]}
                        # Create partial function of the test with the
                        # data baked in
                        tmp = partial(test_migrated_partitions, pre_partitions,
                                      diff_names)
                        # Start the test notifier
                        obs = ObservabilityNotifier(
                            query_func_partitions,
                            [test_all_workers_have_partitions, tmp])
                        obs.start()
                        obs.join()
                        if obs.error:
                            raise obs.error

                    else:  # Handle the 0 case as a noop
                        continue

                    # Test for crashed workers
                    test_crashed_workers(runners)

                    # Validate autoscale via partition query
                    try:
                        partitions = partitions_query(host, external_port)
                        phase_validate_partitions(
                            runners,
                            partitions,
                            joined=[r.name for r in joined],
                            left=[r.name for r in left])
                    except Exception as err:
                        print(
                            'error validating {} have joined and {} have left'.
                            format([r.name for r in joined],
                                   [r.name for r in left]))
                        raise

                    # Wait a second before the next operation, allowing some
                    # more data to go through the system
                    time.sleep(1)

            # Test for crashed workers
            test_crashed_workers(runners)

            # Test is done, so stop sender
            sender.stop()

            # wait until sender sends out its final batch and exits
            sender.join(sender_timeout)
            if sender.error:
                raise sender.error
            if sender.is_alive():
                sender.stop()
                raise TimeoutError('Sender did not complete in the expected '
                                   'period')

            print('Sender sent {} messages'.format(sum(expected.values())))

            # Use Sink value to determine when to stop runners and sink
            pack677 = '>I2sQ'
            pack27 = '>IsQ'
            await_values = [
                pack(pack677,
                     calcsize(pack677) - 4, c, v) for c, v in expected.items()
            ]
            #await_values = [pack(pack27, calcsize(pack27)-4, c, v) for c, v in
            #                expected.items()]
            stopper = SinkAwaitValue(sink, await_values, 30)
            stopper.start()
            stopper.join()
            if stopper.error:
                print('sink.data', len(sink.data))
                print('await_values', len(await_values))
                raise stopper.error

            # stop application workers
            for r in runners:
                r.stop()

            # Test for crashed workers
            test_crashed_workers(runners)

            # Stop sink
            sink.stop()

            # Stop metrics
            metrics.stop()

            # validate output
            phase_validate_output(runners, sink, expected)

        finally:
            for r in runners:
                r.stop()
            # Wait on runners to finish waiting on their subprocesses to exit
            for r in runners:
                r.join(runner_join_timeout)
            alive = []
            for r in runners:
                if r.is_alive():
                    alive.append(r)
            for r in runners:
                ec = r.poll()
                if ec != 0:
                    print('Worker {!r} exited with return code {}'.format(
                        r.name, ec))
                    print('Its last 5 log lines were:')
                    print('\n'.join(r.get_output().splitlines()[-5:]))
                    print()
            if alive:
                alive_names = ', '.join((r.name for r in alive))
                outputs = runners_output_format(runners)
                for a in alive:
                    a.kill()
            clean_resilience_path(res_dir)
            if alive:
                raise PipelineTestError(
                    "Runners [{}] failed to exit cleanly after"
                    " {} seconds.\n"
                    "Runner outputs are attached below:"
                    "\n===\n{}".format(alive_names, runner_join_timeout,
                                       outputs))
    except Exception as err:
        if not hasattr(err, 'as_steps'):
            err.as_steps = steps
        if not hasattr(err, 'runners'):
            err.runners = runners
        raise
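
For reference, each record awaited at the sink in Example #13 uses the '>I2sQ' layout: a 4-byte length header whose value excludes the header itself (calcsize('>I2sQ') - 4 == 10), a 2-byte key, and an 8-byte unsigned count. A minimal sketch with an illustrative key and count:

from struct import calcsize, pack, unpack

fmt = '>I2sQ'
assert calcsize(fmt) == 14  # 4 (length) + 2 (key) + 8 (count)

# The length field carries only the payload size, i.e. key + count = 10 bytes.
record = pack(fmt, calcsize(fmt) - 4, b'ab', 42)
assert unpack(fmt, record) == (10, b'ab', 42)
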
Example #14
0
def _autoscale_run(command,
                   ops=[],
                   cycles=1,
                   initial=None,
                   runner_data=[],
                   as_steps=[]):
    host = '127.0.0.1'
    sources = 1
    sinks = 1
    sink_mode = 'framed'

    if isinstance(ops, int):
        ops = [ops]

    # If no initial workers value is given, determine the minimum number
    # required at the start so that the cluster never goes below 1 worker.
    # If a number is given, then verify it is sufficient.
    if ops:
        lowest = lowest_point(ops * cycles)
        if lowest < 1:
            min_workers = abs(lowest) + 1
        else:
            min_workers = 1
        if isinstance(initial, int):
            assert (initial >= min_workers)
            workers = initial
        else:
            workers = min_workers
    else:  # Test is only for setup using initial workers
        assert (initial > 0)
        workers = initial

    batch_size = 10
    interval = 0.05

    lowercase2 = [a + b for a in lowercase for b in lowercase]
    char_cycle = cycle(lowercase2)
    expected = Counter()

    def count_sent(s):
        expected[s] += 1

    reader = Reader(
        iter_generator(items=char_cycle,
                       to_string=lambda s: pack('>2sI', s, 1),
                       on_next=count_sent))

    # Start cluster
    logging.debug("Creating cluster")
    with Cluster(command=command,
                 host=host,
                 sources=sources,
                 workers=workers,
                 sinks=sinks,
                 sink_mode=sink_mode,
                 runner_data=runner_data) as cluster:

        # Create sender
        logging.debug("Creating sender")
        sender = Sender(cluster.source_addrs[0],
                        reader,
                        batch_size=50,
                        interval=0.05,
                        reconnect=True)
        cluster.add_sender(sender, start=True)
        # wait for some data to go through the system
        time.sleep(1)

        # Perform autoscale cycles
        logging.debug("Starting autoscale cycles")
        for cyc in range(cycles):
            for joiners in ops:
                # Verify cluster is processing before proceeding
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert (not cluster.get_crashed_workers())

                # get partition data before autoscale operation begins
                logging.debug("Get partition data before autoscale event")
                pre_partitions = cluster.get_partition_data()
                as_steps.append(joiners)
                joined = []
                left = []

                if joiners > 0:  # autoscale: grow
                    # create new workers and have them join
                    logging.debug("grow by {}".format(joiners))
                    joined = cluster.grow(by=joiners)

                elif joiners < 0:  # autoscale: shrink
                    # choose the most recent, still-alive runners to leave
                    leavers = abs(joiners)
                    left = cluster.shrink(leavers)

                else:  # Handle the 0 case as a noop
                    continue

                # Wait until all live workers report 'ready'
                cluster.wait_to_resume_processing(timeout=120)

                # Test for crashed workers
                assert (not cluster.get_crashed_workers())

                # Wait a second before the next operation, allowing some
                # more data to go through the system
                time.sleep(1)
                logging.debug("end of autoscale iteration")
            logging.debug("End of autoscale cycle")
        logging.debug("End of autoscale events. Entering final validation")
        time.sleep(2)

        # Test for crashed workers
        logging.debug("check for crashed")
        assert (not cluster.get_crashed_workers())

        # Test is done, so stop sender
        cluster.stop_senders()

        # wait until sender sends out its final batch and exits
        cluster.wait_for_sender()

        logging.info('Sender sent {} messages'.format(sum(expected.values())))

        # Use Sink value to determine when to stop runners and sink
        pack677 = '>I2sQ'
        await_values = [
            pack(pack677,
                 calcsize(pack677) - 4, c, v) for c, v in expected.items()
        ]
        cluster.sink_await(await_values, timeout=120)

        # validate output
        phase_validate_output(cluster.sinks[0].data, expected)
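
In Example #14 the expected Counter is filled by the on_next callback as the reader hands each key to the sender, so the test knows exactly how many of each key were sent and can build await_values from it. Below is a minimal, self-contained sketch of that counting-callback pattern; counting_iter is a hypothetical stand-in for the harness's iter_generator.

from collections import Counter
from itertools import cycle, islice

expected = Counter()

def counting_iter(items, on_next):
    # Yield items unchanged, invoking the callback for each one so the
    # caller can tally everything that was emitted.
    for item in items:
        on_next(item)
        yield item

source = counting_iter(cycle(['aa', 'ab', 'ac']),
                       on_next=lambda s: expected.update([s]))
sent = list(islice(source, 7))
assert sum(expected.values()) == len(sent) == 7
assert expected['aa'] == 3 and expected['ab'] == expected['ac'] == 2
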
Example #15
0
from end_points import (Reader, sequence_generator)
from integration import (Sender)

import sys
import time

wallaroo_hostsvc = sys.argv[1]
num_start = int(sys.argv[2])
num_end = int(sys.argv[3])
batch_size = int(sys.argv[4])
interval = float(sys.argv[5])
num_part_keys = int(sys.argv[6])

senders = []
for p in range(0, num_part_keys):
    sender = Sender(wallaroo_hostsvc,
                    Reader(
                        sequence_generator(start=num_start,
                                           stop=num_end,
                                           partition='key_%d' % p)),
                    batch_size=batch_size,
                    interval=interval,
                    reconnect=True)
    senders.append(sender)

for s in senders:
    s.start()
for s in senders:
    s.join()

sys.exit(0)
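
Example #15 is a standalone script driven entirely by positional arguments. The sketch below shows how a hypothetical invocation maps onto the variables above; the script name and host:port are illustrative assumptions, not values from the test suite.

import shlex

# e.g.  python send_partitioned.py 127.0.0.1:7000 0 1000 100 0.05 4
argv = shlex.split('send_partitioned.py 127.0.0.1:7000 0 1000 100 0.05 4')
wallaroo_hostsvc = argv[1]                        # host:port of the Wallaroo source
num_start, num_end = int(argv[2]), int(argv[3])   # bounds handed to sequence_generator
batch_size, interval = int(argv[4]), float(argv[5])
num_part_keys = int(argv[6])                      # one Sender per key: key_0 .. key_{n-1}
assert (wallaroo_hostsvc, num_start, num_end) == ('127.0.0.1:7000', 0, 1000)
assert (batch_size, interval, num_part_keys) == (100, 0.05, 4)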