예제 #1
0
def main():
    """Start consumers against the existing cluster and run until interrupted.

    Reads the mandatory ``--consumers``, ``--topic`` and ``--print-mod``
    arguments, prints the cluster status, starts the message monitor thread
    and the consumers, then idles until a KeyboardInterrupt arrives. On
    interrupt the consumers and the monitor are stopped; any exception raised
    during that shutdown is reported rather than propagated.
    """
    actor = "TEST RUNNER"
    cli_args = get_args(sys.argv)

    num_consumers = int(get_mandatory_arg(cli_args, "--consumers"))
    topic_name = get_mandatory_arg(cli_args, "--topic")
    report_every = int(get_mandatory_arg(cli_args, "--print-mod"))

    console_out("Starting...", actor)
    console_out("Cluster status:", actor)
    subprocess.call(["bash", "../cluster/cluster-status.sh"])

    brokers = BrokerManager()
    brokers.load_initial_nodes()
    nodes = brokers.get_initial_nodes()
    console_out(f"Initial nodes: {nodes}", actor)

    monitor = MessageMonitor(report_every)
    consumers = ConsumerManager(brokers, monitor, actor, topic_name)
    consumers.add_consumers(num_consumers, 1)

    monitor_worker = threading.Thread(target=monitor.process_messages)
    monitor_worker.start()

    consumers.start_consumers()

    # Keep the main thread alive until the operator interrupts the run.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass

    # Best-effort shutdown: report a failure instead of crashing on exit.
    try:
        consumers.stop_all_consumers()
        monitor.stop_consuming()
        monitor_worker.join()
    except Exception as err:
        console_out("Failed to clean up test correctly: " + str(err), actor)
예제 #2
0
def main():
    """Run repeated chaos tests against a single queue and report the outcome.

    Mandatory arguments: ``--tests``, ``--actions``, ``--grace-period-sec``,
    ``--queue``, ``--sac`` and ``--queue-type``. Optional arguments (default):
    ``--in-flight-max`` (10), ``--cluster`` ("3"), ``--chaos-mode`` ("mixed"),
    ``--chaos-min-interval`` ("30") and ``--chaos-max-interval`` ("120").

    Each test run sets up the cluster via ``setup-test-run.sh``, creates the
    queue, starts one consumer and one publisher, executes ``--actions`` chaos
    actions at random intervals, repairs the cluster, stops the publisher and
    then waits up to the grace period for the consumer to catch up. A run is
    reported as failed when confirmed messages were never received or when
    the monitor reports out-of-order messages.
    """
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac = get_mandatory_arg(args, "--sac")
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    queue_type = get_mandatory_arg(args, "--queue-type")

    # Anything other than "false" (case-insensitive) enables SAC.
    sac_enabled = sac.upper() != "FALSE"

    for test_number in range(tests):

        print("")
        console_out(f"TEST RUN: {test_number} --------------------------",
                    "TEST RUNNER")
        subprocess.call(
            ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()

        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        pub_node = broker_manager.get_random_init_node()
        con_node = broker_manager.get_random_init_node()
        console_out(f"publish to: {pub_node}", "TEST RUNNER")
        console_out(f"consume from: {con_node}", "TEST RUNNER")

        print_mod = in_flight_max * 5
        queue_name = queue + "_" + str(test_number)

        # Retry queue creation every 5 seconds until the broker accepts it.
        mgmt_node = broker_manager.get_random_init_node()
        queue_created = False
        while queue_created == False:
            if sac_enabled:
                queue_created = broker_manager.create_sac_queue(
                    mgmt_node, queue_name, cluster_size, queue_type)
            else:
                queue_created = broker_manager.create_queue(
                    mgmt_node, queue_name, cluster_size, queue_type)
            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        publisher = RabbitPublisher(f"PUBLISHER(Test:{test_number} Id:P1)",
                                    initial_nodes, pub_node, in_flight_max,
                                    120, print_mod)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER")
        consumer_manager.add_consumers(1, test_number, queue_name)

        # NOTE(review): `stats` is never read below; the construction is kept
        # in case QueueStats has side effects - confirm and remove if not.
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(initial_nodes)

        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.publish_direct,
                                      args=(queue_name, count, 1, 0,
                                            "sequence"))
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(actions):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action",
                        "TEST RUNNER")
            time.sleep(wait_sec)

            console_out(
                f"execute chaos action {action_num} of test {test_number}",
                "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop(True)

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")

        # Poll once per second; stop early as soon as every confirmed message
        # has been seen by the monitor.
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count(
            ) and len(publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")

        if len(lost_msgs) > 0:
            console_out(f"Lost messages count: {len(lost_msgs)}",
                        "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")
        success = True

        if msg_monitor.get_out_of_order() == True:
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if len(lost_msgs) > 0:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success == True:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        # The original joined an undefined `con_thread`; the resulting
        # NameError was swallowed below, so the publisher thread was never
        # joined and the monitor thread was never stopped. Shut down every
        # thread this run started before the next run begins.
        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {test_number} COMPLETE", "TEST RUNNER")
예제 #3
0
def main():
    """Run SAC-queue tests with concurrent chaos and random consumer actions.

    Mandatory arguments: ``--tests``, ``--run-minutes``, ``--consumers``,
    ``--grace-period-sec``, ``--queue`` and ``--queue-type``. Optional
    arguments (default): ``--in-flight-max`` (10) and ``--cluster`` ("3").

    Every test run sets up the cluster, creates a SAC queue, starts the
    consumers and a publisher, lets chaos and random consumer actions run in
    background threads for ``--run-minutes`` minutes, then resumes all
    consumers, stops the publisher and waits up to the grace period for the
    consumers to catch up. A run fails when confirmed messages were not
    consumed or when the monitor reports out-of-order messages.
    """
    runner = "TEST RUNNER"
    cli = get_args(sys.argv)

    unlimited = -1  # no limit
    total_tests = int(get_mandatory_arg(cli, "--tests"))
    run_minutes = int(get_mandatory_arg(cli, "--run-minutes"))
    num_consumers = int(get_mandatory_arg(cli, "--consumers"))
    max_in_flight = int(get_optional_arg(cli, "--in-flight-max", 10))
    grace_secs = int(get_mandatory_arg(cli, "--grace-period-sec"))
    cluster_size = get_optional_arg(cli, "--cluster", "3")
    queue_prefix = get_mandatory_arg(cli, "--queue")
    queue_type = get_mandatory_arg(cli, "--queue-type")

    for run_no in range(total_tests):
        # --- cluster setup -------------------------------------------------
        print("")
        console_out(f"TEST RUN: {run_no} --------------------------", runner)
        subprocess.call(
            ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", runner)
        time.sleep(30)
        console_out("Cluster status:", runner)
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        brokers = BrokerManager()
        brokers.load_initial_nodes()
        nodes = brokers.get_initial_nodes()
        console_out(f"Initial nodes: {nodes}", runner)

        report_every = 5000
        queue_name = f"{queue_prefix}_{run_no}"
        mgmt_node = brokers.get_random_init_node()

        # Keep trying to create the queue, pausing 5 seconds after a failure.
        while True:
            created = brokers.create_sac_queue(mgmt_node, queue_name,
                                               cluster_size, queue_type)
            if created != False:
                break
            time.sleep(5)

        time.sleep(10)

        # --- actors --------------------------------------------------------
        monitor = MessageMonitor(report_every)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(nodes)
        consumers = ConsumerManager(brokers, monitor, runner)

        pub_node = brokers.get_random_init_node()
        publisher = RabbitPublisher(str(run_no), nodes, pub_node,
                                    max_in_flight, 120, report_every)
        consumers.add_consumers(num_consumers, run_no, queue_name)

        monitor_worker = threading.Thread(target=monitor.process_messages)
        monitor_worker.start()

        consumers.start_consumers()

        publish_worker = threading.Thread(
            target=publisher.publish_direct,
            args=(queue_name, unlimited, 1, 0, "sequence"))
        publish_worker.start()
        console_out("publisher started", runner)

        warmup_secs = 20
        console_out(
            f"Will start chaos and consumer actions in {warmup_secs} seconds",
            runner)
        time.sleep(warmup_secs)

        chaos_worker = threading.Thread(
            target=chaos.start_random_single_action_and_repair, args=(90, ))
        chaos_worker.start()
        console_out("Chaos executor started", runner)

        actions_worker = threading.Thread(
            target=consumers.start_random_consumer_actions, args=(5, 30))
        actions_worker.start()
        console_out("Consumer actions started", runner)

        # --- main run: report progress once a minute -------------------------
        for elapsed in range(1, run_minutes + 1):
            time.sleep(60)
            console_out(
                f"Test at {elapsed} minute mark, {run_minutes - elapsed} minutes left",
                runner)

        try:
            chaos.stop_random_single_action_and_repair()
            consumers.stop_random_consumer_actions()
            chaos_worker.join()
            actions_worker.join()
        except Exception as err:
            console_out("Failed to stop chaos cleanly: " + str(err), runner)

        console_out("Resuming consumers", runner)
        consumers.resume_all_consumers()

        publisher.stop(True)
        console_out("starting grace period for consumer to catch up", runner)

        # --- grace period: poll once per second, leave early when caught up --
        for _ in range(grace_secs):
            caught_up = (monitor.get_unique_count() >=
                         publisher.get_pos_ack_count())
            if caught_up and len(publisher.get_msg_set().difference(
                    monitor.get_msg_set())) == 0:
                break
            time.sleep(1)

        confirmed = publisher.get_msg_set()
        not_consumed = confirmed.difference(monitor.get_msg_set())

        # --- results ---------------------------------------------------------
        console_out("RESULTS------------------------------------", runner)
        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {monitor.get_receive_count()} Unique received: {monitor.get_unique_count()}",
            runner)

        passed = True
        if len(not_consumed) > 0:
            console_out(
                f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed)}",
                runner)
            passed = False

        if monitor.get_out_of_order() == True:
            passed = False
            console_out("FAILED TEST: Received out-of-order messages", runner)

        if passed:
            console_out("TEST OK", runner)

        console_out("RESULTS END------------------------------------", runner)

        # --- teardown: best effort, report instead of raising ----------------
        try:
            consumers.stop_all_consumers()
            publish_worker.join()
            monitor.stop_consuming()
            monitor_worker.join()
        except Exception as err:
            console_out("Failed to clean up test correctly: " + str(err),
                        runner)

        console_out(f"TEST {run_no} COMPLETE", runner)
def main():
    """Run repeated chaos tests against a quorum queue and report the outcome.

    Mandatory arguments: ``--tests``, ``--actions``, ``--grace-period-sec``,
    ``--queue`` and ``--sac``. Optional arguments (default):
    ``--in-flight-max`` (10), ``--cluster`` ("3"), ``--chaos-mode`` ("mixed"),
    ``--chaos-min-interval`` ("30"), ``--chaos-max-interval`` ("120"),
    ``--pre-fetch`` ("10") and ``--rmq-version`` ("3.8-beta").

    Each test run deploys a cluster, creates the quorum queue (redeploying if
    creation keeps failing), starts one consumer and one publisher, executes
    ``--actions`` chaos actions at random intervals, repairs the cluster and
    stops the publisher. It then waits for the consumer to catch up. A run is
    reported as failed when confirmed messages were never received or when
    the monitor reports out-of-order messages.
    """
    print("quorum-queue-test.py")
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(
        get_optional_arg(args, "--chaos-max-interval", "120"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])

    for test_number in range(1, tests + 1):

        print("")
        console_out(
            f"TEST RUN: {test_number} of {tests}--------------------------",
            "TEST RUNNER")
        setup_complete = False

        # Deploy and try to create the queue up to 20 times; if every attempt
        # fails, the outer loop redeploys the cluster and starts over.
        while not setup_complete:
            broker_manager = BrokerManager()
            broker_manager.deploy(cluster_size, True, rmq_version, False)
            initial_nodes = broker_manager.get_initial_nodes()

            console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

            print_mod = in_flight_max * 5
            queue_name = queue + "_" + str(test_number)

            mgmt_node = broker_manager.get_random_init_node()
            queue_created = False
            qc_ctr = 0
            while queue_created == False and qc_ctr < 20:
                qc_ctr += 1
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, 0)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, 0)

                if queue_created:
                    setup_complete = True
                else:
                    time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor("qqt", test_number, print_mod, True,
                                     False)
        publisher = RabbitPublisher(1, test_number, broker_manager,
                                    in_flight_max, 120, print_mod)
        publisher.configure_sequence_direct(queue_name, count, 0, 1)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", False)
        consumer_manager.add_consumers(1, test_number, queue_name, prefetch)

        chaos = ChaosExecutor(initial_nodes)

        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.start_publishing)
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(1, actions + 1):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action",
                        "TEST RUNNER")
            time.sleep(wait_sec)

            console_out(
                f"execute chaos action {action_num}/{actions} of test {test_number}",
                "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop_publishing()

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")
        ctr = 0

        # Poll once per second. Leave as soon as every confirmed message has
        # been received; otherwise give up only after the grace period has
        # elapsed AND no message has arrived for more than 15 seconds.
        while True:
            since_last_msg = (datetime.datetime.now() -
                              msg_monitor.get_last_msg_time())
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count(
            ) and len(publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set())) == 0:
                break
            elif (ctr > grace_period_sec
                  and since_last_msg.total_seconds() > 15):
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")

        if len(lost_msgs) > 0:
            console_out(f"Lost messages count: {len(lost_msgs)}",
                        "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")
        success = True

        if msg_monitor.get_out_of_order() == True:
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if len(lost_msgs) > 0:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success == True:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        # The monitor thread was previously left running after every test,
        # leaking one thread per run. Stop it and wait (bounded) for it to
        # finish before the next run starts.
        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join(10)
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {test_number} COMPLETE", "TEST RUNNER")
예제 #5
0
def main():
    """Interactive publish/consume runner with optional result analysis.

    Parses cluster, queue, publisher and consumer options from the command
    line, deploys (or reuses) a cluster, creates the queue and exchanges, and
    starts the requested consumers and publishers. It then reads commands
    from stdin ("+" adds a consumer, "-" removes the oldest consumer, any
    other input is passed on as a specific consumer to remove) until a
    KeyboardInterrupt. After the interrupt the publishers are stopped, the
    consumers get up to 300 seconds to catch up, and the confirmed/received
    counts are printed. With ``--analyze`` the run is also checked for lost
    and out-of-order messages.
    """
    print("publish-consume.py")
    args = get_args(sys.argv)

    # cluster
    new_cluster = is_true(
        get_optional_arg_validated(args, "--new-cluster", "false",
                                   ["true", "false"]))
    if new_cluster:
        cluster_size = int(get_mandatory_arg(args, "--cluster-size"))
    else:
        cluster_size = int(get_optional_arg(args, "--cluster-size", "3"))

    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])

    # queues and exchanges
    exchanges = as_list(get_optional_arg(args, "--exchanges", ""))
    queue_name = get_mandatory_arg(args, "--queue")
    queue_type = get_optional_arg_validated(args, "--queue-type", "mirrored",
                                            ["mirrored", "quorum"])
    qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))
    rep_factor = int(get_optional_arg(args, "--rep-factor", str(cluster_size)))
    sac_enabled = is_true(
        get_optional_arg_validated(args, "--sac", "false", ["true", "false"]))

    if rmq_version == "3.7":
        if sac_enabled:
            console_out("Cannot use SAC mode with RabbitMQ 3.7", "TEST RUNNER")
            sys.exit(1)

        if queue_type == "quorum":
            console_out("Cannot use quorum queues with RabbitMQ 3.7",
                        "TEST RUNNER")
            sys.exit(1)

    # publisher
    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    pub_mode = get_optional_arg_validated(args, "--pub-mode", "direct",
                                          ["direct", "exchange"])
    msg_mode = get_optional_arg_validated(
        args, "--msg-mode", "sequence",
        ["sequence", "partitioned-sequence", "large-msgs", "hello"])
    count = int(get_mandatory_arg(args, "--msgs"))
    dup_rate = float(get_optional_arg(args, "--dup-rate", "0"))
    sequence_count = int(get_optional_arg(args, "--sequences", 1))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))

    # consumers
    consumer_count = int(get_optional_arg(args, "--consumers", "1"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    analyze = is_true(
        get_optional_arg_validated(args, "--analyze", "true",
                                   ["true", "false"]))

    # A value given on the command line arrives as a string; convert it so
    # print_mod is always an int, like the computed default.
    print_mod = int(
        get_optional_arg(args, "--print-mod", in_flight_max * 5))

    # Reject invalid publisher option combinations up front, before the
    # cluster is deployed and any consumer/monitor thread is started.
    msg_size = None
    if publisher_count > 0:
        if pub_mode == "direct" and msg_mode == "partitioned-sequence":
            print("Cannot use partitioned sequence mode with direct mode")
            sys.exit(1)

        if pub_mode == "exchange" and len(exchanges) == 0:
            console_out("No exchanges provided", "TEST RUNNER")
            sys.exit(1)

        if msg_mode == "large-msgs":
            msg_size = int(get_mandatory_arg(args, "--msg-size"))

    broker_manager = BrokerManager()
    broker_manager.deploy(cluster_size, new_cluster, rmq_version, False)

    # Retry queue creation every 5 seconds until the broker accepts it.
    mgmt_node = broker_manager.get_random_init_node()
    queue_created = False
    while queue_created == False:
        if queue_type == "mirrored":
            if sac_enabled:
                queue_created = broker_manager.create_standard_sac_queue(
                    mgmt_node, queue_name, rep_factor)
            else:
                queue_created = broker_manager.create_standard_queue(
                    mgmt_node, queue_name, rep_factor)
        elif queue_type == "quorum":
            if sac_enabled:
                queue_created = broker_manager.create_quorum_sac_queue(
                    mgmt_node, queue_name, rep_factor, qq_max_length)
            else:
                queue_created = broker_manager.create_quorum_queue(
                    mgmt_node, queue_name, rep_factor, qq_max_length)

        if queue_created == False:
            time.sleep(5)

    broker_manager.declare_exchanges(queue_name, exchanges)

    time.sleep(10)

    if consumer_count > 0:
        msg_monitor = MessageMonitor("pub-con", 1, print_mod, analyze, False)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", False)
        consumer_manager.add_consumers(consumer_count, 1, queue_name, prefetch)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

    if publisher_count > 0:
        pub_manager = PublisherManager(broker_manager, 1, "TEST RUNNER",
                                       publisher_count, in_flight_max,
                                       print_mod)

        if pub_mode == "direct":
            # The inner branches must dispatch on msg_mode. They previously
            # tested pub_mode, which is always "direct" here, so large-msgs
            # silently fell through to the "hello" publishers.
            # "partitioned-sequence" was rejected during validation above.
            if msg_mode == "sequence":
                pub_manager.add_sequence_direct_publishers(
                    queue_name, count, dup_rate, sequence_count)
            elif msg_mode == "large-msgs":
                pub_manager.add_large_msgs_direct_publishers(
                    queue_name, count, dup_rate, msg_size)
            else:
                pub_manager.add_hello_msgs_direct_publishers(
                    queue_name, count, dup_rate)
        elif pub_mode == "exchange":
            if msg_mode == "sequence":
                pub_manager.add_sequence_to_exchanges_publishers(
                    exchanges, "", count, dup_rate, sequence_count)
            elif msg_mode == "partitioned-sequence":
                pub_manager.add_partitioned_sequence_to_exchanges_publishers(
                    exchanges, count, dup_rate, sequence_count)
            elif msg_mode == "large-msgs":
                pub_manager.add_large_msgs_to_exchanges_publishers(
                    exchanges, "", count, dup_rate, msg_size)
            else:
                pub_manager.add_hello_msgs_to_exchanges_publishers(
                    exchanges, "", count, dup_rate)

        pub_manager.start_publishers()

    # NOTE(review): consumer_manager only exists when --consumers > 0; with
    # --consumers 0 any command entered here raises NameError. Decide what
    # the interactive commands should do in that mode.
    while True:
        try:
            console_out(
                "Press + to add a consumer, - to remove a consumer, ! to remove the active consumer (SAC only)",
                "TEST_RUNNER")
            input_str = input()
            if input_str == "+":
                consumer_manager.add_consumer_and_start_consumer(
                    1, queue_name, prefetch)
            elif input_str == "-":
                consumer_manager.stop_and_remove_oldest_consumer()
            else:
                consumer_manager.stop_and_remove_specfic_consumer(input_str)
        except KeyboardInterrupt:
            if publisher_count > 0:
                console_out(
                    "Stopping publishers. Starting grace period for consumers to catch up.",
                    "TEST_RUNNER")
                pub_manager.stop_all_publishers()
            break

    if publisher_count > 0 and consumer_count > 0:
        # Poll once per second for up to 300 seconds; a second Ctrl+C ends
        # the grace period early.
        try:
            ctr = 0
            while ctr < 300:
                if msg_monitor.get_unique_count(
                ) >= pub_manager.get_total_pos_ack_count() and len(
                        pub_manager.get_total_msg_set().difference(
                            msg_monitor.get_msg_set())) == 0:
                    break
                time.sleep(1)
                ctr += 1
        except KeyboardInterrupt:
            console_out("Grace period ended", "TEST RUNNER")

        confirmed_set = pub_manager.get_total_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {pub_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        if analyze:
            success = True
            if len(lost_msgs) > 0:
                console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}",
                            "TEST RUNNER")
                success = False

            if msg_monitor.get_out_of_order() == True:
                success = False
                console_out("FAILED TEST: Received out-of-order messages",
                            "TEST RUNNER")

            if success:
                console_out("TEST OK", "TEST RUNNER")

    elif publisher_count > 0:
        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {pub_manager.get_total_pos_ack_count()}",
            "TEST RUNNER")
    elif consumer_count > 0:
        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

    console_out("RESULTS END------------------------------------",
                "TEST RUNNER")

    # Best-effort shutdown: report a failure instead of crashing on exit.
    try:
        if consumer_count > 0:
            consumer_manager.stop_all_consumers()
            msg_monitor.stop_consuming()
            monitor_thread.join(10)
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e),
                    "TEST RUNNER")

    console_out("TEST 1 COMPLETE", "TEST RUNNER")
예제 #6
0
def main():
    """Run randomized chaos tests against a Kafka cluster and report the results.

    Arguments are read from ``sys.argv``.

    Mandatory: ``--tests``, ``--run-minutes``, ``--consumers``,
    ``--grace-period-sec``, ``--topic``, ``--partitions``.

    Optional (default): ``--cluster`` ("3"), ``--in-flight-max`` (100),
    ``--min-insync-replicas`` ("1"), ``--unclean-failover`` ("false"),
    ``--sequences`` ("1"), ``--rep-factor`` ("3"), ``--acks-mode`` ("all"),
    ``--print-mod`` ("0", which means three times ``--in-flight-max``).

    For each test run this invokes ``../automated/setup-test-run.sh``, creates
    a topic named ``<topic>_<test_number>``, starts one producer plus the
    consumers, runs chaos and consumer actions for ``--run-minutes`` minutes,
    then waits up to ``--grace-period-sec`` seconds for the consumers to catch
    up before printing lost-message and ordering results.
    """
    args = get_args(sys.argv)

    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    topic = get_mandatory_arg(args, "--topic")
    partitions = get_mandatory_arg(args, "--partitions")

    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    # A print-mod of 0 means "derive it from the in-flight limit".
    if print_mod == 0:
        print_mod = in_flight_max * 3

    for test_number in range(tests):

        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        subprocess.call(
            ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out(f"Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out(f"Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
        broker_manager.correct_advertised_listeners()

        # One topic per test run so runs never see each other's messages.
        topic_name = topic + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic_name} using node {mgmt_node}",
                    "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic_name, rep_factor,
                                    partitions, min_insync_reps,
                                    unclean_failover)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        chaos = ChaosExecutor(broker_manager)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER", topic_name)

        producer = KafkaProducer(test_number, 1, broker_manager, acks_mode,
                                 in_flight_max, print_mod)
        producer.create_producer()
        producer.configure_as_sequence(sequence_count)
        consumer_manager.add_consumers(consumer_count, test_number)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=producer.start_producing,
                                      args=(topic_name, 10000000))
        pub_thread.start()
        console_out("producer started", "TEST RUNNER")

        init_wait_sec = 20
        console_out(
            f"Will start chaos and consumer actions in {init_wait_sec} seconds",
            "TEST RUNNER")
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(
            target=chaos.start_random_single_action_and_repair, args=(120, ))
        chaos_thread.start()
        console_out("Chaos executor started", "TEST RUNNER")

        consumer_action_thread = threading.Thread(
            target=consumer_manager.start_random_consumer_actions,
            args=(60, 61))
        consumer_action_thread.start()
        console_out("Consumer actions started", "TEST RUNNER")

        # Count elapsed minutes from 1 so the log line written after each
        # sleep reports the minute that has actually just completed.
        for minute in range(1, run_minutes + 1):
            time.sleep(60)
            console_out(
                f"Test at {minute} minute mark, {run_minutes-minute} minutes left",
                "TEST RUNNER")

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()
            chaos_thread.join()
            consumer_action_thread.join()
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        console_out("Resuming consumers", "TEST RUNNER")
        consumer_manager.resume_all_consumers()

        # The producer object in this runner is `producer`; stop it with the
        # same call the other Kafka runner in this file uses.
        producer.stop_producing()
        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")
        ctr = 0

        # Poll once per second until every confirmed message has been
        # received or the grace period runs out.
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= producer.get_pos_ack_count(
            ) and len(producer.get_msg_set().difference(
                    msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = producer.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}",
                        "TEST RUNNER")
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out(f"FAILED TEST: Received out-of-order messages",
                        "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------",
                    "TEST RUNNER")

        # Best-effort teardown: a failure here is reported but must not abort
        # the remaining test runs.
        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e),
                        "TEST RUNNER")

        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")
예제 #7
0
def main():
    """Run randomized chaos tests against a RabbitMQ cluster and print a summary.

    Arguments are read from ``sys.argv``.

    Mandatory: ``--test-name``, ``--tests``, ``--run-minutes``,
    ``--consumers``, ``--grace-period-sec``, ``--queue``, ``--queue-type``
    ("mirrored" or "quorum") and ``--sac``.

    Optional (default): ``--pre-fetch`` ("10"), ``--analyze`` ("true"),
    ``--qq-max-length`` ("0", quorum queues only), ``--log-msgs`` ("false"),
    ``--publishers`` ("1"), ``--in-flight-max`` ("10"), ``--print-mod``
    (five times ``--in-flight-max``, or "1000" without publishers),
    ``--sequences`` ("1"), ``--new-cluster`` ("true"), ``--cluster`` ("3"),
    ``--rmq-version`` ("3.8-beta"), ``--stop-mode`` ("crash"),
    ``--chaos-actions`` ("true"), ``--chaos-mode`` ("mixed"),
    ``--chaos-min-interval`` ("60"), ``--chaos-max-interval`` ("120"),
    ``--consumer-actions`` ("true"), ``--consumer-min-interval`` ("20"),
    ``--consumer-max-interval`` ("60").

    For each test run a queue named ``<queue>_<test_number>`` is created, a
    single publisher and the consumers are started, optional chaos and
    consumer actions run for ``--run-minutes`` minutes, and after a grace
    period the confirmed and received message sets are compared. Failed runs
    are collected and listed in the final summary.
    """
    print("random-test.py")
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    test_name = get_mandatory_arg(args, "--test-name")
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))

    # Reject unknown queue types here: the queue-creation retry loop below
    # only handles these two and would otherwise spin forever.
    if queue_type not in ("mirrored", "quorum"):
        console_out(
            f"Unsupported --queue-type '{queue_type}', expected 'mirrored' or 'quorum'",
            "TEST RUNNER")
        sys.exit(1)

    if queue_type == "quorum":
        qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))

    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    log_messages = is_true(get_optional_arg(args, "--log-msgs", "false"))

    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    if publisher_count > 0:
        in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
        print_mod = int(
            get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}"))
        sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    else:
        print_mod = int(get_optional_arg(args, "--print-mod", "1000"))

    # This runner only ever creates one publisher; say so instead of failing
    # later on an undefined publisher when more were requested.
    if publisher_count > 1:
        console_out(
            f"Only one publisher is supported, ignoring --publishers {publisher_count} and starting 1",
            "TEST RUNNER")

    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])
    stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash",
                                           ["crash", "close", "cancel"])

    # "crash" routes consumers through toxiproxy, "close" hard-closes them,
    # "cancel" uses neither.
    use_toxiproxy = False
    consumer_hard_close = False
    if stop_mode == "crash":
        use_toxiproxy = True
    elif stop_mode == "close":
        consumer_hard_close = True

    include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true"))
    if include_chaos:
        chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
        chaos_min_interval = int(
            get_optional_arg(args, "--chaos-min-interval", "60"))
        chaos_max_interval = int(
            get_optional_arg(args, "--chaos-max-interval", "120"))

    include_con_actions = is_true(
        get_optional_arg(args, "--consumer-actions", "true"))
    # Consumer actions need a consumer manager, which only exists when there
    # is at least one consumer.
    if include_con_actions and consumer_count <= 0:
        console_out("No consumers configured, consumer actions disabled",
                    "TEST RUNNER")
        include_con_actions = False
    if include_con_actions:
        con_action_min_interval = int(
            get_optional_arg(args, "--consumer-min-interval", "20"))
        con_action_max_interval = int(
            get_optional_arg(args, "--consumer-max-interval", "60"))

    failed_test_log = list()
    failed_tests = set()

    for test_number in range(tests):

        print("")
        subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"])
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        broker_manager = BrokerManager()
        broker_manager.deploy(cluster_size, new_cluster, rmq_version,
                              use_toxiproxy)
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        queue_created = False

        # Retry queue creation every 5 seconds until the broker accepts it.
        while queue_created == False:
            if queue_type == "mirrored":
                if sac_enabled:
                    queue_created = broker_manager.create_standard_sac_queue(
                        mgmt_node, queue_name, cluster_size)
                else:
                    queue_created = broker_manager.create_standard_queue(
                        mgmt_node, queue_name, cluster_size)
            elif queue_type == "quorum":
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(test_name, test_number, print_mod,
                                     analyze, log_messages)
        chaos = ChaosExecutor(initial_nodes)

        if include_chaos:
            if chaos_mode == "partitions":
                chaos.only_partitions()
            elif chaos_mode == "nodes":
                chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        if consumer_count > 0:
            consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                               "TEST RUNNER", use_toxiproxy)
            consumer_manager.add_consumers(consumer_count, test_number,
                                           queue_name, prefetch)
            consumer_manager.start_consumers()

        # Same condition as every later use of `publisher`, so the object
        # always exists when it is stopped or queried.
        if publisher_count > 0:
            publisher = RabbitPublisher(1, test_number, broker_manager,
                                        in_flight_max, 120, print_mod)
            publisher.configure_sequence_direct(queue_name, count, 0,
                                                sequence_count)

            pub_thread = threading.Thread(target=publisher.start_publishing)
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval,
                      consumer_hard_close))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        # Main run period: tick once per second so Ctrl-C (or the external
        # stop_please flag) ends the run early but still produces results.
        ctr = 0
        run_seconds = run_minutes * 60
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1

                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    "TEST RUNNER")
                break

        try:
            chaos.stop_random_single_action_and_repair()

            if consumer_count > 0:
                consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join(30)

            if include_con_actions:
                consumer_action_thread.join(30)
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        if publisher_count > 0:
            publisher.stop_publishing()

        if consumer_count > 0:
            console_out("Resuming consumers", "TEST RUNNER")
            consumer_manager.resume_all_consumers()

            console_out("Starting grace period for consumer to catch up",
                        "TEST RUNNER")
            ctr = 0

            # Poll once per second until everything confirmed has been
            # received, the grace period expires, or the user interrupts.
            try:
                while ctr < grace_period_sec:
                    if publisher_count > 0 and msg_monitor.get_unique_count(
                    ) >= publisher.get_pos_ack_count() and len(
                            publisher.get_msg_set().difference(
                                msg_monitor.get_msg_set())) == 0:
                        break
                    time.sleep(1)
                    ctr += 1
            except KeyboardInterrupt:
                console_out("Grace period ended", "TEST RUNNER")

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        if publisher_count > 0:
            confirmed_set = publisher.get_msg_set()
            not_consumed_msgs = confirmed_set.difference(
                msg_monitor.get_msg_set())
            console_out(
                f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")
        else:
            not_consumed_msgs = set()
            console_out(
                f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")

        success = True
        if consumer_count > 0:
            if len(not_consumed_msgs) > 0:
                if sac_enabled:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                else:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages."
                )
                failed_tests.add(test_number)

                # Print at most ~500 of the missing messages, in sorted order.
                lost_ctr = 0
                sorted_msgs = list(not_consumed_msgs)
                sorted_msgs.sort()
                for msg in sorted_msgs:
                    console_out(f"Lost? {msg}", "TEST RUNNER")
                    lost_ctr += 1
                    if lost_ctr > 500:
                        console_out("More than 500, truncated list",
                                    "TEST RUNNER")
                        break

                success = False

            if msg_monitor.get_out_of_order() == True:
                success = False
                console_out(f"FAILED TEST: Received out-of-order messages",
                            "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Received out-of-order messages"
                )
                failed_tests.add(test_number)

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        # Best-effort teardown: report failures but continue with later runs.
        try:
            if consumer_count > 0:
                consumer_manager.stop_all_consumers()

            if publisher_count > 0:
                pub_thread.join(30)
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out_exception("Failed to clean up test correctly.", e,
                                  "TEST RUNNER")

        broker_manager.zip_log_files(test_name, test_number)
        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")

    console_out("", "TEST RUNNER")
    console_out("SUMMARY", "TEST RUNNER")
    console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}",
                "TEST RUNNER")
    for line in failed_test_log:
        console_out(line, "TEST RUNNER")

    console_out("TEST RUN COMPLETE", "TEST RUNNER")
def main():
    """Run Kafka producer chaos tests, checking for loss, reordering and duplicates.

    Arguments are read from ``sys.argv``.

    Mandatory: ``--tests``, ``--run-minutes``, ``--topic``, ``--idempotence``.

    Optional (default): ``--cluster`` ("3"), ``--in-flight-max`` (10000),
    ``--buffering-max-ms`` (0), ``--rep-factor`` ("3"), ``--acks-mode``
    ("all"), ``--print-mod`` ("0", which means three times
    ``--in-flight-max``), ``--new-cluster`` ("true"), ``--group-id`` (a fresh
    UUID).

    Each test run produces to a single-partition topic named
    ``<topic>_<test_number>`` while a chaos thread runs
    ``start_kill_leader_or_connections``. Consumers are only started after
    producing and chaos have stopped, and get up to 300 seconds to read
    everything. Out-of-order and duplicate messages fail the test only when
    ``--idempotence`` is true; lost messages always fail it.
    """
    args = get_args(sys.argv)

    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = 1
    topic = get_mandatory_arg(args, "--topic")
    idempotence = is_true(get_mandatory_arg(args, "--idempotence"))
    partitions = 1

    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10000))
    buffering_max = int(get_optional_arg(args, "--buffering-max-ms", 0))
    min_insync_reps = 1
    unclean_failover = "false"
    sequence_count = 1
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))
    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    group_id = get_optional_arg(args, "--group-id", str(uuid.uuid1()))

    # A print-mod of 0 means "derive it from the in-flight limit".
    if print_mod == 0:
        print_mod = in_flight_max * 3

    for test_number in range(tests):

        print("")
        console_out(f"TEST RUN: {str(test_number)} with idempotence={idempotence}--------------------------", "TEST RUNNER")
        broker_manager = BrokerManager("confluent", True)

        if new_cluster:
            broker_manager.deploy(cluster_size, True)

        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
        broker_manager.correct_advertised_listeners()

        topic_name = topic + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic_name} using node {mgmt_node}", "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic_name, rep_factor, partitions, min_insync_reps, unclean_failover)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod, True)
        chaos = ChaosExecutor(broker_manager)

        producer = KafkaProducer(test_number, 1, broker_manager, acks_mode, in_flight_max, print_mod)

        if idempotence:
            producer.create_idempotent_producer(10000000, buffering_max)
        else:
            producer.create_producer(1000000, buffering_max)

        producer.configure_as_sequence(sequence_count)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        pub_thread = threading.Thread(target=producer.start_producing, args=(topic_name, 1000000000))
        pub_thread.start()
        console_out("producer started", "TEST RUNNER")

        init_wait_sec = 20
        console_out(f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER")
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(target=chaos.start_kill_leader_or_connections, args=(topic_name, 0))
        chaos_thread.start()
        console_out("Chaos executor started", "TEST RUNNER")

        # Run for the full --run-minutes: one 60 second sleep per minute,
        # counting from 1 so the final log line reports 0 minutes left.
        for minute in range(1, run_minutes + 1):
            time.sleep(60)
            console_out(f"Test at {minute} minute mark, {run_minutes-minute} minutes left", "TEST RUNNER")

        producer.stop_producing()

        try:
            chaos.stop_chaos_actions()
            chaos_thread.join()
            console_out(f"Chaos executor shutdown", "TEST RUNNER")
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER")

        subprocess.call(["bash", "../cluster/cluster-status.sh"])
        time.sleep(60)

        # Consumers start only now, after producing and chaos have finished.
        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic_name, group_id)
        consumer_manager.add_consumers(consumer_count, test_number)
        consumer_manager.start_consumers()

        # Give the consumer up to 300 seconds to read every confirmed message.
        ctr = 0
        while ctr < 300:
            if msg_monitor.get_unique_count() >= producer.get_pos_ack_count() and len(producer.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = producer.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())
        duplicates = msg_monitor.get_receive_count() - msg_monitor.get_unique_count()

        console_out("RESULTS------------------------------------", "TEST RUNNER")
        console_out(f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")
        console_out(f"Duplication count: {duplicates}", "TEST RUNNER")

        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
            success = False

        # Ordering and duplicate checks only apply to the idempotent producer.
        if idempotence and msg_monitor.get_out_of_order():
            success = False
            console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if idempotence and duplicates:
            success = False
            console_out(f"FAILED TEST: Duplicates", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------", "TEST RUNNER")

        # Best-effort teardown: report failures but continue with later runs.
        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")
예제 #9
0
def main():
    """Drive repeated RabbitMQ chaos test runs and print per-run results.

    Arguments are read from ``sys.argv``.

    Mandatory: ``--tests``, ``--run-minutes``, ``--consumers``,
    ``--grace-period-sec``, ``--queue``, ``--queue-type``, ``--sac``.

    Optional (default): ``--publishers`` ("1"), ``--print-mod`` ("0", which
    means five times ``--in-flight-max``), ``--new-cluster`` ("true"),
    ``--in-flight-max`` ("10"), ``--sequences`` ("1"), ``--cluster`` ("3"),
    ``--chaos-actions`` ("true"), ``--chaos-mode`` ("mixed"),
    ``--chaos-min-interval`` ("60"), ``--chaos-max-interval`` ("120"),
    ``--consumer-actions`` ("true"), ``--consumer-min-interval`` ("20"),
    ``--consumer-max-interval`` ("60").

    The three on/off flags (``--chaos-actions``, ``--consumer-actions``,
    ``--sac``) are treated as enabled unless their value is "false" in any
    letter case.
    """
    #signal.signal(signal.SIGINT, interuppt_handler)
    cli_args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(cli_args, "--tests"))
    run_minutes = int(get_mandatory_arg(cli_args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(cli_args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(cli_args, "--grace-period-sec"))
    queue = get_mandatory_arg(cli_args, "--queue")
    queue_type = get_mandatory_arg(cli_args, "--queue-type")
    sac_flag = get_mandatory_arg(cli_args, "--sac")

    publisher_count = int(get_optional_arg(cli_args, "--publishers", "1"))
    print_mod = int(get_optional_arg(cli_args, "--print-mod", "0"))
    new_cluster = get_optional_arg(cli_args, "--new-cluster", "true")
    in_flight_max = int(get_optional_arg(cli_args, "--in-flight-max", "10"))
    sequence_count = int(get_optional_arg(cli_args, "--sequences", "1"))
    cluster_size = get_optional_arg(cli_args, "--cluster", "3")
    chaos_flag = get_optional_arg(cli_args, "--chaos-actions", "true")
    chaos_mode = get_optional_arg(cli_args, "--chaos-mode", "mixed")
    chaos_min_interval = int(
        get_optional_arg(cli_args, "--chaos-min-interval", "60"))
    chaos_max_interval = int(
        get_optional_arg(cli_args, "--chaos-max-interval", "120"))
    con_actions_flag = get_optional_arg(cli_args, "--consumer-actions", "true")
    con_action_min_interval = int(
        get_optional_arg(cli_args, "--consumer-min-interval", "20"))
    con_action_max_interval = int(
        get_optional_arg(cli_args, "--consumer-max-interval", "60"))

    # A print-mod of 0 means "derive it from the in-flight limit".
    print_mod = in_flight_max * 5 if print_mod == 0 else print_mod

    # Anything other than an explicit "false" leaves the feature switched on.
    include_chaos = chaos_flag.upper() != "FALSE"
    include_con_actions = con_actions_flag.upper() != "FALSE"
    sac_enabled = sac_flag.upper() != "FALSE"

    message_type = "sequence"

    for test_number in range(tests):

        print("")
        console_out(
            f"TEST RUN: {str(test_number)} --------------------------",
            "TEST RUNNER")
        if new_cluster.upper() == "TRUE":
            subprocess.call(
                ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
            console_out(f"Waiting for cluster...", "TEST RUNNER")
            time.sleep(30)

        console_out(f"Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        # Keep retrying every 5 seconds for as long as the broker manager
        # reports False for the declare call.
        if sac_enabled:
            declare_queue = broker_manager.create_sac_queue
        else:
            declare_queue = broker_manager.create_queue
        while declare_queue(mgmt_node, queue_name, cluster_size,
                            queue_type) == False:
            time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos_executor = ChaosExecutor(initial_nodes)

        if chaos_mode == "partitions":
            chaos_executor.only_partitions()
        elif chaos_mode == "nodes":
            chaos_executor.only_kill_nodes()

        consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                           "TEST RUNNER")

        pub_node = broker_manager.get_random_init_node()
        publisher = RabbitPublisher(
            f"PUBLISHER(Test:{test_number} Id:P1)", initial_nodes, pub_node,
            in_flight_max, 120, print_mod)
        consumer_manager.add_consumers(consumer_count, test_number, queue_name)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        consumer_manager.start_consumers()

        single_publisher = publisher_count == 1
        if single_publisher:
            pub_thread = threading.Thread(
                target=publisher.publish_direct,
                args=(queue_name, count, sequence_count, 0, message_type))
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos_executor.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        # Main run period: tick once per second so Ctrl-C (or the external
        # stop_please flag) can end the run early.
        elapsed_sec = 0
        run_seconds = run_minutes * 60
        while elapsed_sec < run_seconds and not stop_please:
            try:
                time.sleep(1)
                elapsed_sec += 1

                if elapsed_sec % 60 == 0:
                    console_out(
                        f"Test at {int(elapsed_sec/60)} minute mark, {int((run_seconds-elapsed_sec)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(elapsed_sec/60)} minute mark, {int((run_seconds-elapsed_sec)/60)} minutes left)",
                    "TEST RUNNER")
                break

        try:
            chaos_executor.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join()

            if include_con_actions:
                consumer_action_thread.join()
        except Exception as err:
            console_out("Failed to stop chaos cleanly: " + str(err),
                        "TEST RUNNER")

        console_out("Resuming consumers", "TEST RUNNER")
        consumer_manager.resume_all_consumers()

        if single_publisher:
            publisher.stop(True)

        console_out("starting grace period for consumer to catch up",
                    "TEST RUNNER")

        # Poll once per second; leave early as soon as every confirmed
        # message has also been received.
        for _ in range(grace_period_sec):
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count():
                pending = publisher.get_msg_set().difference(
                    msg_monitor.get_msg_set())
                if len(pending) == 0:
                    break
            time.sleep(1)

        confirmed_set = publisher.get_msg_set()
        not_consumed_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        console_out(
            f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
            "TEST RUNNER")

        failed = False
        if len(not_consumed_msgs) > 0:
            console_out(
                f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                "TEST RUNNER")
            failed = True

        if msg_monitor.get_out_of_order() == True:
            failed = True
            console_out(f"FAILED TEST: Received out-of-order messages",
                        "TEST RUNNER")

        if not failed:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        # Best-effort teardown: report failures but continue with later runs.
        try:
            consumer_manager.stop_all_consumers()

            if single_publisher:
                pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as err:
            console_out("Failed to clean up test correctly: " + str(err),
                        "TEST RUNNER")

        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")
def main():
    """Run one interactive produce/consume test against a broker cluster.

    Options are read from ``sys.argv``. The brokers are deployed, the topic
    is created when either the cluster or the topic is new, and then the
    message monitor, the consumers and (30 seconds later) the producers are
    started.

    While the test runs the operator can type ``a`` to add a consumer or
    ``r`` to remove one. Ctrl+C -- or end-of-input on stdin -- stops the
    producers; the consumers then get up to ``--grace-period-sec`` seconds
    to receive every confirmed message before the results are printed and
    all clients are shut down.
    """
    args = get_args(sys.argv)

    cluster_size = get_optional_arg(args, "--cluster", "3")
    new_cluster = is_true(get_mandatory_arg(args, "--new-cluster"))
    use_blockade = is_true(get_optional_arg(args, "--use-blockade", "true"))
    image_version = get_optional_arg(args, "--image-version", "confluent")

    consumer_count = int(get_optional_arg(args, "--consumers", "1"))
    group_id = get_optional_arg(args, "--group-id", str(uuid.uuid1()))
    grace_period_sec = int(get_optional_arg(args, "--grace-period-sec", "300"))
    topic, is_new_topic = get_topic(new_cluster, args)

    partitions = get_optional_arg(args, "--partitions", "3")
    rep_factor = get_optional_arg(args, "--rep-factor", "3")

    analyze = is_true(get_optional_arg(args, "--analyze", "true"))
    producer_count = int(get_optional_arg(args, "--producers", 1))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    # A print-mod of 0 means "not specified": derive it from the in-flight limit.
    if print_mod == 0:
        print_mod = in_flight_max * 3

    test_number = 1
    console_out(f"Starting...", "TEST RUNNER")

    broker_manager = BrokerManager(image_version, use_blockade)
    broker_manager.deploy(cluster_size, new_cluster)

    initial_nodes = broker_manager.get_initial_nodes()
    console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

    if new_cluster or is_new_topic:
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic} using node {mgmt_node}", "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic, rep_factor, partitions,
                                    min_insync_reps, unclean_failover)

    # Fixed pause before any client is created (reason not stated in this
    # script; presumably lets the topic settle -- TODO confirm).
    time.sleep(10)

    msg_monitor = MessageMonitor(print_mod, analyze)

    prod_manager = ProducerManager(broker_manager, "TEST RUNNER", topic)
    prod_manager.add_producers(producer_count, test_number, acks_mode,
                               in_flight_max, print_mod, sequence_count)

    consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                       "TEST RUNNER", topic, group_id)
    consumer_manager.add_consumers(consumer_count, test_number)

    monitor_thread = threading.Thread(target=msg_monitor.process_messages)
    monitor_thread.start()

    # Consumers get a 30 second head start over the producers.
    consumer_manager.start_consumers()
    time.sleep(30)
    prod_manager.start_producers()

    def consumers_caught_up():
        """Return True once every confirmed message has been consumed."""
        # Compare the counts first; the set difference is only computed
        # once the counts already agree.
        if not (msg_monitor.get_unique_count()
                >= prod_manager.get_total_pos_ack_count()):
            return False
        outstanding = prod_manager.get_total_msg_set().difference(
            msg_monitor.get_msg_set())
        return len(outstanding) == 0

    while True:
        try:
            command = input("a=add consumer, r=remove consumer - then hit enter")
            if command == "a":
                consumer_manager.add_consumer_and_start_consumer(test_number)
            elif command == "r":
                consumer_manager.stop_and_remove_consumer()
            else:
                console_out("Unknown command", "TEST_RUNNER")
        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError when stdin is closed or exhausted.
            # Treat that like Ctrl+C so the run still reaches the grace
            # period, the report and the cleanup instead of crashing with
            # the worker threads left running.
            console_out("Stopping producer. Starting grace period for consumers to catch up.", "TEST_RUNNER")
            prod_manager.stop_all_producers()
            break

    if producer_count > 0:
        try:
            # Poll once per second, for at most grace_period_sec seconds.
            for _ in range(grace_period_sec):
                if consumers_caught_up():
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            # A second Ctrl+C cuts the grace period short.
            console_out("Grace period ended", "TEST RUNNER")

    confirmed_set = prod_manager.get_total_msg_set()
    lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

    console_out("RESULTS------------------------------------", "TEST RUNNER")
    console_out(f"Confirmed count: {prod_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

    if analyze:
        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
            success = False

        # Explicit comparison kept on purpose: the monitor's return type is
        # not visible from this script.
        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")
    console_out("RESULTS END------------------------------------", "TEST RUNNER")

    # Best-effort shutdown: a failure is reported, not re-raised.
    try:
        consumer_manager.stop_all_consumers()
        msg_monitor.stop_consuming()
        monitor_thread.join()
        prod_manager.stop_all_producers()
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

    console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")
예제 #11
0
def main():
    """Run a single Kafka produce/consume test until interrupted, then report.

    Steps, in order:

    1. Parse the command line (five mandatory options, the rest optional).
    2. Optionally set up a new cluster via ``setup-test-run.sh``, then print
       the cluster status.
    3. Create the topic, the message monitor, the consumers and one producer.
    4. Produce on a background thread until Ctrl+C is pressed.
    5. Give the consumers up to ``--grace-period-sec`` seconds to receive
       every confirmed message (a second Ctrl+C ends the wait early).
    6. Print the result summary and shut the clients down.
    """
    args = get_args(sys.argv)

    # Mandatory options.
    new_cluster = get_mandatory_arg(args, "--new-cluster")
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    topic = get_mandatory_arg(args, "--topic")
    partitions = get_mandatory_arg(args, "--partitions")

    # Optional options with their defaults.
    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    # Zero means "unset": fall back to three times the in-flight limit.
    if print_mod == 0:
        print_mod = in_flight_max * 3

    test_number = 1
    console_out(f"Starting...", "TEST RUNNER")

    if new_cluster.upper() != "TRUE":
        console_out(f"Using existing cluster...", "TEST RUNNER")
    else:
        # "3.8" is passed straight through to the setup script; its meaning
        # is defined there, not here.
        setup_cmd = ["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"]
        subprocess.call(setup_cmd)
        console_out(f"Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)

    console_out(f"Cluster status:", "TEST RUNNER")
    status_cmd = ["bash", "../cluster/cluster-status.sh"]
    subprocess.call(status_cmd)

    broker_manager = BrokerManager()
    broker_manager.load_initial_nodes()
    console_out(f"Initial nodes: {broker_manager.get_initial_nodes()}", "TEST RUNNER")
    broker_manager.correct_advertised_listeners()

    admin_node = broker_manager.get_random_init_node()
    console_out(f"Creating topic {topic} using node {admin_node}", "TEST RUNNER")
    broker_manager.create_topic(
        admin_node, topic, rep_factor, partitions, min_insync_reps, unclean_failover
    )

    time.sleep(10)

    msg_monitor = MessageMonitor(print_mod)
    # Neither of the next two objects is used afterwards; they are still
    # created (and kept alive) exactly as before.
    _chaos = ChaosExecutor(broker_manager)
    consumer_manager = ConsumerManager(
        broker_manager, msg_monitor, "TEST RUNNER", topic
    )

    _pub_node = broker_manager.get_random_init_node()
    producer = KafkaProducer(
        test_number, 1, broker_manager, acks_mode, in_flight_max, print_mod
    )
    producer.create_producer()
    producer.configure_as_sequence(sequence_count)
    consumer_manager.add_consumers(consumer_count, test_number)

    monitor_thread = threading.Thread(target=msg_monitor.process_messages)
    monitor_thread.start()

    consumer_manager.start_consumers()

    pub_thread = threading.Thread(
        target=producer.start_producing, args=(topic, 10_000_000)
    )
    pub_thread.start()
    console_out("producer started", "TEST RUNNER")

    # Idle until the operator presses Ctrl+C.
    producing = True
    while producing:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            console_out(
                "Stopping producer. Starting grace period for consumers to catch up.",
                "TEST_RUNNER",
            )
            producer.stop_producing()
            producing = False

    # Grace period: poll once per second, at most grace_period_sec times.
    try:
        for _ in range(grace_period_sec):
            counts_match = (
                msg_monitor.get_unique_count() >= producer.get_pos_ack_count()
            )
            # The set difference is only evaluated once the counts agree.
            if counts_match and len(
                producer.get_msg_set().difference(msg_monitor.get_msg_set())
            ) == 0:
                break
            time.sleep(1)
    except KeyboardInterrupt:
        console_out("Grace period ended", "TEST RUNNER")

    confirmed = producer.get_msg_set()
    missing = confirmed.difference(msg_monitor.get_msg_set())

    console_out("RESULTS------------------------------------", "TEST RUNNER")
    summary = f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}"
    console_out(summary, "TEST RUNNER")

    has_lost = len(missing) > 0
    if has_lost:
        console_out(f"FAILED TEST: Lost messages: {len(missing)}", "TEST RUNNER")

    out_of_order = msg_monitor.get_out_of_order() == True
    if out_of_order:
        console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER")

    if not (has_lost or out_of_order):
        console_out("TEST OK", "TEST RUNNER")

    console_out("RESULTS END------------------------------------", "TEST RUNNER")

    # Best-effort shutdown: a failure is logged rather than propagated.
    try:
        consumer_manager.stop_all_consumers()
        msg_monitor.stop_consuming()
        monitor_thread.join()
        pub_thread.join()
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

    console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
예제 #12
0
        broker_ip = ""
        broker_port = ""
    else:
        broker_ip = get_mandatory_arg(args, "--broker-ip")
        broker_port = get_mandatory_arg(args, "--broker-port")
        amqproxy_ip = ""
        amqproxy_port = ""

    publish_mode = get_mandatory_arg_validated(args, "--pub-mode", ["async", "sync", "new-conn-per-msg", "fire-and-forget"])
    delay_seconds = int(get_optional_arg(args, "--pub-delay", "0"))

    if delay_seconds > 0:
        console_out(f"Starting with delay of {delay_seconds} seconds", "TEST RUNNER")
        time.sleep(delay_seconds)

    broker_manager = BrokerManager(mgmt_ip, mgmt_port, broker_name, broker_ip, broker_port, amqproxy_ip, amqproxy_port, user, password, use_https, virtual_host)

    queue_created = False

    while queue_created == False:  
        queue_created = broker_manager.create_queue(queue, False)

        if queue_created == False:
            time.sleep(5)

    time.sleep(2)

    if use_toxiproxy:
        proxy_created = False
        while proxy_created == False: 
            proxy_created = broker_manager.add_proxy("clients")
예제 #13
0
import logging
# Raise the threshold of the 'werkzeug' logger so that only ERROR and above
# are emitted.
log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

# Command-line configuration. --queue defaults to a random "q<N>" name
# (N in 0..100000) and --print-mod defaults to 1000; all other options
# are mandatory.
args = get_args(sys.argv)
queue = get_optional_arg(args, "--queue", f"q{random.randint(0, 100000)}")
print_mod = int(get_optional_arg(args, "--print-mod", "1000"))
use_confirms = is_true(get_mandatory_arg(args, "--use-confirms"))
mgmt_ip = get_mandatory_arg(args, "--mgmt-ip")
broker_name = get_mandatory_arg(args, "--broker-name")
broker_ip = get_mandatory_arg(args, "--broker-ip")
broker_port = get_mandatory_arg(args, "--broker-port")
amqproxy_ip = get_mandatory_arg(args, "--amqproxy-ip")
amqproxy_port = get_mandatory_arg(args, "--amqproxy-port")

broker_manager = BrokerManager(mgmt_ip, broker_name, broker_ip, broker_port,
                               amqproxy_ip, amqproxy_port)

queue_created = False

# Keep calling create_queue, waiting 5 seconds between attempts, until it
# returns something that does not compare equal to False.
while queue_created == False:
    queue_created = broker_manager.create_queue(queue, False)

    if queue_created == False:
        time.sleep(5)

# Fixed 2 second pause before the publishers are constructed (the reason is
# not stated in this script).
time.sleep(2)

# Two publishers that differ only in their third positional argument
# (True vs False). Judging by the variable names this selects whether the
# publisher goes through the proxy -- TODO confirm against SimplePublisher.
proxy_publisher = SimplePublisher(broker_manager, f"PUBLISHER", True,
                                  use_confirms, True, print_mod)
nonproxy_publisher = SimplePublisher(broker_manager, f"PUBLISHER", False,
                                     use_confirms, True, print_mod)