def main():
    args = get_args(sys.argv)
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    topic = get_mandatory_arg(args, "--topic")
    print_mod = int(get_mandatory_arg(args, "--print-mod"))

    console_out("Starting...", "TEST RUNNER")
    console_out("Cluster status:", "TEST RUNNER")
    subprocess.call(["bash", "../cluster/cluster-status.sh"])

    broker_manager = BrokerManager()
    broker_manager.load_initial_nodes()
    initial_nodes = broker_manager.get_initial_nodes()
    console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

    msg_monitor = MessageMonitor(print_mod)
    consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic)
    consumer_manager.add_consumers(consumer_count, 1)

    monitor_thread = threading.Thread(target=msg_monitor.process_messages)
    monitor_thread.start()
    consumer_manager.start_consumers()

    while True:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            break

    try:
        consumer_manager.stop_all_consumers()
        msg_monitor.stop_consuming()
        monitor_thread.join()
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")
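# Hedged usage sketch: the flags below are the ones parsed in main() above; the
# script filename and values are assumptions for illustration, not taken from this file.
#
#   python consumer-runner.py --consumers 2 --topic topic1 --print-mod 1000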
def main():
    args = get_args(sys.argv)
    node_count = 3
    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    actions = int(get_mandatory_arg(args, "--actions"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    sac = get_mandatory_arg(args, "--sac")
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(get_optional_arg(args, "--chaos-min-interval", "30"))
    chaos_max_interval = int(get_optional_arg(args, "--chaos-max-interval", "120"))
    message_type = "sequence"
    queue_type = get_mandatory_arg(args, "--queue-type")

    sac_enabled = True
    if sac.upper() == "FALSE":
        sac_enabled = False

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER")
        subprocess.call(["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        pub_node = broker_manager.get_random_init_node()
        con_node = broker_manager.get_random_init_node()
        console_out(f"publish to: {pub_node}", "TEST RUNNER")
        console_out(f"consume from: {con_node}", "TEST RUNNER")

        print_mod = in_flight_max * 5
        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        queue_created = False
        while queue_created == False:
            if sac_enabled:
                queue_created = broker_manager.create_sac_queue(mgmt_node, queue_name, cluster_size, queue_type)
            else:
                queue_created = broker_manager.create_queue(mgmt_node, queue_name, cluster_size, queue_type)
            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        publisher = RabbitPublisher(f"PUBLISHER(Test:{test_number} Id:P1)", initial_nodes, pub_node, in_flight_max, 120, print_mod)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER")
        consumer_manager.add_consumers(1, test_number, queue_name)
        stats = QueueStats('jack', 'jack', queue_name)

        chaos = ChaosExecutor(initial_nodes)
        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()
        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.publish_direct, args=(queue_name, count, 1, 0, "sequence"))
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        for action_num in range(0, actions):
            wait_sec = random.randint(chaos_min_interval, chaos_max_interval)
            console_out(f"waiting for {wait_sec} seconds before next action", "TEST RUNNER")
            time.sleep(wait_sec)
            console_out(f"execute chaos action {str(action_num)} of test {str(test_number)}", "TEST RUNNER")
            chaos.execute_chaos_action()
            subprocess.call(["bash", "../cluster/cluster-status.sh"])

        time.sleep(60)
        console_out("repairing cluster", "TEST RUNNER")
        chaos.repair()
        console_out("repaired cluster", "TEST RUNNER")

        publisher.stop(True)

        console_out("starting grace period for consumer to catch up", "TEST RUNNER")
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count() and len(publisher.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------", "TEST RUNNER")
        if len(lost_msgs) > 0:
            console_out(f"Lost messages count: {len(lost_msgs)}", "TEST RUNNER")
            for msg in lost_msgs:
                console_out(f"Lost message: {msg}", "TEST RUNNER")

        console_out(f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

        success = True
        if msg_monitor.get_out_of_order() == True:
            console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER")
            success = False

        if len(lost_msgs) > 0:
            console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER")
            success = False

        if success == True:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------", "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main():
    args = get_args(sys.argv)
    node_count = 3
    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    message_type = "sequence"

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER")
        subprocess.call(["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        print_mod = 5000
        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        queue_created = False
        while queue_created == False:
            queue_created = broker_manager.create_sac_queue(mgmt_node, queue_name, cluster_size, queue_type)
            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(initial_nodes)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER")
        pub_node = broker_manager.get_random_init_node()
        publisher = RabbitPublisher(str(test_number), initial_nodes, pub_node, in_flight_max, 120, print_mod)
        consumer_manager.add_consumers(consumer_count, test_number, queue_name)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()
        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=publisher.publish_direct, args=(queue_name, count, 1, 0, "sequence"))
        pub_thread.start()
        console_out("publisher started", "TEST RUNNER")

        init_wait_sec = 20
        console_out(f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER")
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(target=chaos.start_random_single_action_and_repair, args=(90,))
        chaos_thread.start()
        console_out("Chaos executor started", "TEST RUNNER")

        consumer_action_thread = threading.Thread(target=consumer_manager.start_random_consumer_actions, args=(5, 30))
        consumer_action_thread.start()
        console_out("Consumer actions started", "TEST RUNNER")

        ctr = 0
        while ctr < run_minutes:
            time.sleep(60)
            ctr += 1
            console_out(f"Test at {ctr} minute mark, {run_minutes-ctr} minutes left", "TEST RUNNER")

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()
            chaos_thread.join()
            consumer_action_thread.join()
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER")

        console_out("Resuming consumers", "TEST RUNNER")
        consumer_manager.resume_all_consumers()

        publisher.stop(True)

        console_out("starting grace period for consumer to catch up", "TEST RUNNER")
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count() and len(publisher.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        not_consumed_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------", "TEST RUNNER")
        console_out(f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

        success = True
        if len(not_consumed_msgs) > 0:
            console_out(f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER")
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------", "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main(): print("quorum-queue-test.py") args = get_args(sys.argv) count = -1 # no limit tests = int(get_mandatory_arg(args, "--tests")) actions = int(get_mandatory_arg(args, "--actions")) in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10)) grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec")) cluster_size = get_optional_arg(args, "--cluster", "3") queue = get_mandatory_arg(args, "--queue") sac_enabled = is_true(get_mandatory_arg(args, "--sac")) chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed") chaos_min_interval = int( get_optional_arg(args, "--chaos-min-interval", "30")) chaos_max_interval = int( get_optional_arg(args, "--chaos-max-interval", "120")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) for test_number in range(1, tests + 1): print("") console_out( f"TEST RUN: {str(test_number)} of {tests}--------------------------", "TEST RUNNER") setup_complete = False while not setup_complete: broker_manager = BrokerManager() broker_manager.deploy(cluster_size, True, rmq_version, False) initial_nodes = broker_manager.get_initial_nodes() console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER") print_mod = in_flight_max * 5 queue_name = queue + "_" + str(test_number) mgmt_node = broker_manager.get_random_init_node() queue_created = False qc_ctr = 0 while queue_created == False and qc_ctr < 20: qc_ctr += 1 if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, cluster_size, 0) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, cluster_size, 0) if queue_created: setup_complete = True else: time.sleep(5) time.sleep(10) msg_monitor = MessageMonitor("qqt", test_number, print_mod, True, False) publisher = RabbitPublisher(1, test_number, broker_manager, in_flight_max, 120, print_mod) publisher.configure_sequence_direct(queue_name, count, 0, 1) consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", False) consumer_manager.add_consumers(1, test_number, queue_name, prefetch) chaos = ChaosExecutor(initial_nodes) if chaos_mode == "partitions": chaos.only_partitions() elif chaos_mode == "nodes": chaos.only_kill_nodes() monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() consumer_manager.start_consumers() pub_thread = threading.Thread(target=publisher.start_publishing) pub_thread.start() console_out("publisher started", "TEST RUNNER") for action_num in range(1, actions + 1): wait_sec = random.randint(chaos_min_interval, chaos_max_interval) console_out(f"waiting for {wait_sec} seconds before next action", "TEST RUNNER") time.sleep(wait_sec) console_out( f"execute chaos action {str(action_num)}/{actions} of test {str(test_number)}", "TEST RUNNER") chaos.execute_chaos_action() subprocess.call(["bash", "../cluster/cluster-status.sh"]) time.sleep(60) console_out("repairing cluster", "TEST RUNNER") chaos.repair() console_out("repaired cluster", "TEST RUNNER") publisher.stop_publishing() console_out("starting grace period for consumer to catch up", "TEST RUNNER") ctr = 0 while True: ms_since_last_msg = datetime.datetime.now( ) - msg_monitor.get_last_msg_time() if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count( ) and len(publisher.get_msg_set().difference( msg_monitor.get_msg_set())) == 0: break elif ctr > grace_period_sec and ms_since_last_msg.total_seconds( ) > 15: break time.sleep(1) ctr += 1 
confirmed_set = publisher.get_msg_set() lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set()) console_out("RESULTS------------------------------------", "TEST RUNNER") if len(lost_msgs) > 0: console_out(f"Lost messages count: {len(lost_msgs)}", "TEST RUNNER") for msg in lost_msgs: console_out(f"Lost message: {msg}", "TEST RUNNER") console_out( f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") success = True if msg_monitor.get_out_of_order() == True: console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER") success = False if len(lost_msgs) > 0: console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER") success = False if success == True: console_out("TEST OK", "TEST RUNNER") console_out("RESULTS END------------------------------------", "TEST RUNNER") try: consumer_manager.stop_all_consumers() pub_thread.join() except Exception as e: console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER") console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
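# Hedged usage sketch (flag names come from the argument parsing in main() above;
# the values are illustrative assumptions, not project defaults):
#
#   python quorum-queue-test.py --tests 3 --actions 5 --grace-period-sec 300 \
#       --queue qq --sac false --cluster 3 --chaos-mode mixed \
#       --in-flight-max 10 --pre-fetch 10 --rmq-version 3.8-beta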
def main(): print("publish-consume.py") args = get_args(sys.argv) # cluster new_cluster = is_true( get_optional_arg_validated(args, "--new-cluster", "false", ["true", "false"])) if new_cluster: cluster_size = int(get_mandatory_arg(args, "--cluster-size")) else: cluster_size = int(get_optional_arg(args, "--cluster-size", "3")) rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) # queues and exchanges exchanges = as_list(get_optional_arg(args, "--exchanges", "")) queue_name = get_mandatory_arg(args, "--queue") queue_type = get_optional_arg_validated(args, "--queue-type", "mirrored", ["mirrored", "quorum"]) qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0")) rep_factor = int(get_optional_arg(args, "--rep-factor", str(cluster_size))) sac_enabled = is_true( get_optional_arg_validated(args, "--sac", "false", ["true", "false"])) if rmq_version == "3.7": if sac_enabled: console_out("Cannot use SAC mode with RabbitMQ 3.7", "TEST RUNNER") exit(1) if queue_type == "quorum": console_out("Cannot use quorum queues with RabbitMQ 3.7", "TEST RUNNER") exit(1) # publisher publisher_count = int(get_optional_arg(args, "--publishers", "1")) pub_mode = get_optional_arg_validated(args, "--pub-mode", "direct", ["direct", "exchange"]) msg_mode = get_optional_arg_validated( args, "--msg-mode", "sequence", ["sequence", "partitioned-sequence", "large-msgs", "hello"]) count = int(get_mandatory_arg(args, "--msgs")) dup_rate = float(get_optional_arg(args, "--dup-rate", "0")) sequence_count = int(get_optional_arg(args, "--sequences", 1)) in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10)) # consumers consumer_count = int(get_optional_arg(args, "--consumers", "1")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) analyze = is_true( get_optional_arg_validated(args, "--analyze", "true", ["true", "false"])) print_mod = get_optional_arg(args, "--print-mod", in_flight_max * 5) broker_manager = BrokerManager() broker_manager.deploy(cluster_size, new_cluster, rmq_version, False) mgmt_node = broker_manager.get_random_init_node() queue_created = False while queue_created == False: if queue_type == "mirrored": if sac_enabled: queue_created = broker_manager.create_standard_sac_queue( mgmt_node, queue_name, rep_factor) else: queue_created = broker_manager.create_standard_queue( mgmt_node, queue_name, rep_factor) elif queue_type == "quorum": if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, rep_factor, qq_max_length) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, rep_factor, qq_max_length) if queue_created == False: time.sleep(5) broker_manager.declare_exchanges(queue_name, exchanges) time.sleep(10) if consumer_count > 0: msg_monitor = MessageMonitor("pub-con", 1, print_mod, analyze, False) consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", False) consumer_manager.add_consumers(consumer_count, 1, queue_name, prefetch) monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() consumer_manager.start_consumers() if publisher_count > 0: pub_manager = PublisherManager(broker_manager, 1, "TEST RUNNER", publisher_count, in_flight_max, print_mod) if pub_mode == "direct": if msg_mode == "sequence": pub_manager.add_sequence_direct_publishers( queue_name, count, dup_rate, sequence_count) elif pub_mode == "partitioned-sequence": print("Cannot use partitioned sequence mode with direct mode") exit(1) elif 
pub_mode == "large-msgs": msg_size = int(get_mandatory_arg(args, "--msg-size")) pub_manager.add_large_msgs_direct_publishers( queue_name, count, dup_rate, msg_size) else: pub_manager.add_hello_msgs_direct_publishers( queue_name, count, dup_rate) elif pub_mode == "exchange": if len(exchanges) == 0: console_out("No exchanges provided", "TEST RUNNER") exit(1) if msg_mode == "sequence": pub_manager.add_sequence_to_exchanges_publishers( exchanges, "", count, dup_rate, sequence_count) elif msg_mode == "partitioned-sequence": pub_manager.add_partitioned_sequence_to_exchanges_publishers( exchanges, count, dup_rate, sequence_count) elif msg_mode == "large-msgs": msg_size = int(get_mandatory_arg(args, "--msg-size")) pub_manager.add_large_msgs_to_exchanges_publishers( exchanges, "", count, dup_rate, msg_size) else: pub_manager.add_hello_msgs_to_exchanges_publishers( exchanges, "", count, dup_rate) pub_manager.start_publishers() while True: try: console_out( "Press + to add a consumer, - to remove a consumer, ! to remove the active consumer (SAC only)", "TEST_RUNNER") input_str = input() if input_str == "+": consumer_manager.add_consumer_and_start_consumer( 1, queue_name, prefetch) elif input_str == "-": consumer_manager.stop_and_remove_oldest_consumer() else: consumer_manager.stop_and_remove_specfic_consumer(input_str) except KeyboardInterrupt: if publisher_count > 0: console_out( "Stopping publishers. Starting grace period for consumers to catch up.", "TEST_RUNNER") pub_manager.stop_all_publishers() break if publisher_count > 0 and consumer_count > 0: try: ctr = 0 while ctr < 300: if msg_monitor.get_unique_count( ) >= pub_manager.get_total_pos_ack_count() and len( pub_manager.get_total_msg_set().difference( msg_monitor.get_msg_set())) == 0: break time.sleep(1) ctr += 1 except KeyboardInterrupt: console_out("Grace period ended", "TEST RUNNER") confirmed_set = pub_manager.get_total_msg_set() lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set()) console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Confirmed count: {pub_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") if analyze: success = True if len(lost_msgs) > 0: console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER") success = False if msg_monitor.get_out_of_order() == True: success = False console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER") if success: console_out("TEST OK", "TEST RUNNER") elif publisher_count > 0: console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Confirmed count: {pub_manager.get_total_pos_ack_count()}", "TEST RUNNER") elif consumer_count > 0: console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") console_out("RESULTS END------------------------------------", "TEST RUNNER") try: if consumer_count > 0: consumer_manager.stop_all_consumers() msg_monitor.stop_consuming() monitor_thread.join(10) except Exception as e: console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER") console_out(f"TEST 1 COMPLETE", "TEST RUNNER")
def main():
    args = get_args(sys.argv)
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    topic = get_mandatory_arg(args, "--topic")
    partitions = get_mandatory_arg(args, "--partitions")
    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    if print_mod == 0:
        print_mod = in_flight_max * 3

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER")
        subprocess.call(["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
        broker_manager.correct_advertised_listeners()

        topic_name = topic + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic_name} using node {mgmt_node}", "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic_name, rep_factor, partitions, min_insync_reps, unclean_failover)
        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        chaos = ChaosExecutor(broker_manager)
        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic_name)
        pub_node = broker_manager.get_random_init_node()
        producer = KafkaProducer(test_number, 1, broker_manager, acks_mode, in_flight_max, print_mod)
        producer.create_producer()
        producer.configure_as_sequence(sequence_count)
        consumer_manager.add_consumers(consumer_count, test_number)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()
        consumer_manager.start_consumers()

        pub_thread = threading.Thread(target=producer.start_producing, args=(topic_name, 10000000))
        pub_thread.start()
        console_out("producer started", "TEST RUNNER")

        init_wait_sec = 20
        console_out(f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER")
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(target=chaos.start_random_single_action_and_repair, args=(120,))
        chaos_thread.start()
        console_out("Chaos executor started", "TEST RUNNER")

        consumer_action_thread = threading.Thread(target=consumer_manager.start_random_consumer_actions, args=(60, 61))
        consumer_action_thread.start()
        console_out("Consumer actions started", "TEST RUNNER")

        ctr = 0
        while ctr < run_minutes:
            time.sleep(60)
            console_out(f"Test at {ctr} minute mark, {run_minutes-ctr} minutes left", "TEST RUNNER")
            ctr += 1

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()
            chaos_thread.join()
            consumer_action_thread.join()
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER")

        console_out("Resuming consumers", "TEST RUNNER")
        consumer_manager.resume_all_consumers()

        producer.stop_producing()

        console_out("starting grace period for consumer to catch up", "TEST RUNNER")
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= producer.get_pos_ack_count() and len(producer.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = producer.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS------------------------------------", "TEST RUNNER")
        console_out(f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------", "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main(): print("random-test.py") #signal.signal(signal.SIGINT, interuppt_handler) args = get_args(sys.argv) count = -1 # no limit test_name = get_mandatory_arg(args, "--test-name") tests = int(get_mandatory_arg(args, "--tests")) run_minutes = int(get_mandatory_arg(args, "--run-minutes")) consumer_count = int(get_mandatory_arg(args, "--consumers")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec")) queue = get_mandatory_arg(args, "--queue") queue_type = get_mandatory_arg(args, "--queue-type") analyze = is_true(get_optional_arg(args, "--analyze", "true")) if queue_type == "quorum": qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0")) sac_enabled = is_true(get_mandatory_arg(args, "--sac")) log_messages = is_true(get_optional_arg(args, "--log-msgs", "false")) publisher_count = int(get_optional_arg(args, "--publishers", "1")) if publisher_count > 0: in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10")) print_mod = int( get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}")) sequence_count = int(get_optional_arg(args, "--sequences", "1")) else: print_mod = int(get_optional_arg(args, "--print-mod", f"1000")) new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true")) cluster_size = get_optional_arg(args, "--cluster", "3") rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash", ["crash", "close", "cancel"]) use_toxiproxy = False consumer_hard_close = False if stop_mode == "crash": use_toxiproxy = True elif stop_mode == "close": consumer_hard_close = True include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true")) if include_chaos: chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed") chaos_min_interval = int( get_optional_arg(args, "--chaos-min-interval", "60")) chaos_max_interval = int( get_optional_arg(args, "--chaos-max-interval", "120")) include_con_actions = is_true( get_optional_arg(args, "--consumer-actions", "true")) if include_con_actions: con_action_min_interval = int( get_optional_arg(args, "--consumer-min-interval", "20")) con_action_max_interval = int( get_optional_arg(args, "--consumer-max-interval", "60")) failed_test_log = list() failed_tests = set() for test_number in range(tests): print("") subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"]) console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER") broker_manager = BrokerManager() broker_manager.deploy(cluster_size, new_cluster, rmq_version, use_toxiproxy) initial_nodes = broker_manager.get_initial_nodes() console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER") queue_name = queue + "_" + str(test_number) mgmt_node = broker_manager.get_random_init_node() queue_created = False while queue_created == False: if queue_type == "mirrored": if sac_enabled: queue_created = broker_manager.create_standard_sac_queue( mgmt_node, queue_name, cluster_size) else: queue_created = broker_manager.create_standard_queue( mgmt_node, queue_name, cluster_size) elif queue_type == "quorum": if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, cluster_size, qq_max_length) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, cluster_size, qq_max_length) if queue_created == False: time.sleep(5) time.sleep(10) msg_monitor = MessageMonitor(test_name, test_number, 
print_mod, analyze, log_messages) chaos = ChaosExecutor(initial_nodes) if include_chaos: if chaos_mode == "partitions": chaos.only_partitions() elif chaos_mode == "nodes": chaos.only_kill_nodes() monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() if consumer_count > 0: consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", use_toxiproxy) consumer_manager.add_consumers(consumer_count, test_number, queue_name, prefetch) consumer_manager.start_consumers() if publisher_count == 1: publisher = RabbitPublisher(1, test_number, broker_manager, in_flight_max, 120, print_mod) publisher.configure_sequence_direct(queue_name, count, 0, sequence_count) pub_thread = threading.Thread(target=publisher.start_publishing) pub_thread.start() console_out("publisher started", "TEST RUNNER") if include_con_actions or include_chaos: init_wait_sec = 20 console_out( f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER") time.sleep(init_wait_sec) if include_chaos: chaos_thread = threading.Thread( target=chaos.start_random_single_action_and_repair, args=(chaos_min_interval, chaos_max_interval)) chaos_thread.start() console_out("Chaos executor started", "TEST RUNNER") if include_con_actions: consumer_action_thread = threading.Thread( target=consumer_manager.start_random_consumer_actions, args=(con_action_min_interval, con_action_max_interval, consumer_hard_close)) consumer_action_thread.start() console_out("Consumer actions started", "TEST RUNNER") ctr = 0 run_seconds = run_minutes * 60 while ctr < run_seconds and not stop_please: try: time.sleep(1) ctr += 1 if ctr % 60 == 0: console_out( f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left", "TEST RUNNER") except KeyboardInterrupt: console_out( f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)", "TEST RUNNER") break try: chaos.stop_random_single_action_and_repair() if consumer_count > 0: consumer_manager.stop_random_consumer_actions() if include_chaos: chaos_thread.join(30) if include_con_actions: consumer_action_thread.join(30) except Exception as e: console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER") if publisher_count > 0: publisher.stop_publishing() if consumer_count > 0: console_out("Resuming consumers", "TEST RUNNER") consumer_manager.resume_all_consumers() console_out("Starting grace period for consumer to catch up", "TEST RUNNER") ctr = 0 try: while ctr < grace_period_sec: if publisher_count > 0 and msg_monitor.get_unique_count( ) >= publisher.get_pos_ack_count() and len( publisher.get_msg_set().difference( msg_monitor.get_msg_set())) == 0: break time.sleep(1) ctr += 1 except KeyboardInterrupt: console_out("Grace period ended", "TEST RUNNER") console_out("RESULTS ----------------------------------------", "TEST RUNNER") if publisher_count > 0: confirmed_set = publisher.get_msg_set() not_consumed_msgs = confirmed_set.difference( msg_monitor.get_msg_set()) console_out( f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") else: not_consumed_msgs = set() console_out( f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") success = True if consumer_count > 0: if len(not_consumed_msgs) > 0: if sac_enabled: console_out( f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to 
promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER") else: console_out( f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER") failed_test_log.append( f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages." ) failed_tests.add(test_number) lost_ctr = 0 sorted_msgs = list(not_consumed_msgs) sorted_msgs.sort() for msg in sorted_msgs: console_out(f"Lost? {msg}", "TEST RUNNER") lost_ctr += 1 if lost_ctr > 500: console_out("More than 500, truncated list", "TEST RUNNER") break success = False if msg_monitor.get_out_of_order() == True: success = False console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER") failed_test_log.append( f"Test {test_number} FAILURE: Received out-of-order messages" ) failed_tests.add(test_number) if success: console_out("TEST OK", "TEST RUNNER") console_out("RESULTS END ------------------------------------", "TEST RUNNER") try: if consumer_count > 0: consumer_manager.stop_all_consumers() if publisher_count == 1: pub_thread.join(30) msg_monitor.stop_consuming() monitor_thread.join(30) except Exception as e: console_out_exception("Failed to clean up test correctly.", e, "TEST RUNNER") broker_manager.zip_log_files(test_name, test_number) console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER") console_out("", "TEST RUNNER") console_out("SUMMARY", "TEST RUNNER") console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}", "TEST RUNNER") for line in failed_test_log: console_out(line, "TEST RUNNER") console_out("TEST RUN COMPLETE", "TEST RUNNER")
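# Hedged usage sketch (flag names from the parsing above; values are illustrative):
#
#   python random-test.py --test-name rt1 --tests 5 --run-minutes 10 \
#       --consumers 3 --publishers 1 --queue rt --queue-type quorum \
#       --qq-max-length 0 --sac false --grace-period-sec 300 \
#       --stop-mode crash --chaos-actions true --consumer-actions true
#
# Per-test logs are written under logs/<test-name>/<test-number> and zipped via
# broker_manager.zip_log_files() at the end of each run.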
def main():
    args = get_args(sys.argv)
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = 1
    topic = get_mandatory_arg(args, "--topic")
    idempotence = is_true(get_mandatory_arg(args, "--idempotence"))
    partitions = 1
    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10000))
    buffering_max = int(get_optional_arg(args, "--buffering-max-ms", 0))
    min_insync_reps = 1
    unclean_failover = "false"
    sequence_count = 1
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))
    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    group_id = get_optional_arg(args, "--group-id", str(uuid.uuid1()))

    if print_mod == 0:
        print_mod = in_flight_max * 3

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} with idempotence={idempotence}--------------------------", "TEST RUNNER")

        broker_manager = BrokerManager("confluent", True)
        if new_cluster:
            broker_manager.deploy(cluster_size, True)
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
        broker_manager.correct_advertised_listeners()

        topic_name = topic + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic_name} using node {mgmt_node}", "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic_name, rep_factor, partitions, min_insync_reps, unclean_failover)
        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod, True)
        chaos = ChaosExecutor(broker_manager)
        pub_node = broker_manager.get_random_init_node()

        producer = KafkaProducer(test_number, 1, broker_manager, acks_mode, in_flight_max, print_mod)
        if idempotence:
            producer.create_idempotent_producer(10000000, buffering_max)
        else:
            producer.create_producer(1000000, buffering_max)
        producer.configure_as_sequence(sequence_count)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        pub_thread = threading.Thread(target=producer.start_producing, args=(topic_name, 1000000000))
        pub_thread.start()
        console_out("producer started", "TEST RUNNER")

        init_wait_sec = 20
        console_out(f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER")
        time.sleep(init_wait_sec)

        chaos_thread = threading.Thread(target=chaos.start_kill_leader_or_connections, args=(topic_name, 0))
        chaos_thread.start()
        console_out("Chaos executor started", "TEST RUNNER")

        ctr = 1
        while ctr < run_minutes:
            time.sleep(60)
            console_out(f"Test at {ctr} minute mark, {run_minutes-ctr} minutes left", "TEST RUNNER")
            ctr += 1

        producer.stop_producing()

        try:
            chaos.stop_chaos_actions()
            chaos_thread.join()
            console_out("Chaos executor shutdown", "TEST RUNNER")
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER")

        subprocess.call(["bash", "../cluster/cluster-status.sh"])
        time.sleep(60)

        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic_name, group_id)
        consumer_manager.add_consumers(consumer_count, test_number)
        consumer_manager.start_consumers()

        ctr = 0
        while ctr < 300:
            if msg_monitor.get_unique_count() >= producer.get_pos_ack_count() and len(producer.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = producer.get_msg_set()
        lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())
        duplicates = msg_monitor.get_receive_count() - msg_monitor.get_unique_count()

        console_out("RESULTS------------------------------------", "TEST RUNNER")
        console_out(f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")
        console_out(f"Duplication count: {duplicates}", "TEST RUNNER")

        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
            success = False

        if idempotence and msg_monitor.get_out_of_order():
            success = False
            console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if idempotence and duplicates:
            success = False
            console_out("FAILED TEST: Duplicates", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END------------------------------------", "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
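# Hedged usage sketch (the script filename is assumed for illustration; the flags
# are the ones parsed in main() above):
#
#   python idempotence-test.py --tests 3 --run-minutes 5 --topic idem \
#       --idempotence true --acks-mode all --new-cluster true
#
# With --idempotence true the run also fails on duplicates and out-of-order
# delivery, not just on message loss.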
def main():
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    sac = get_mandatory_arg(args, "--sac")
    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))
    new_cluster = get_optional_arg(args, "--new-cluster", "true")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    chaos = get_optional_arg(args, "--chaos-actions", "true")
    chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
    chaos_min_interval = int(get_optional_arg(args, "--chaos-min-interval", "60"))
    chaos_max_interval = int(get_optional_arg(args, "--chaos-max-interval", "120"))
    consumer_actions = get_optional_arg(args, "--consumer-actions", "true")
    con_action_min_interval = int(get_optional_arg(args, "--consumer-min-interval", "20"))
    con_action_max_interval = int(get_optional_arg(args, "--consumer-max-interval", "60"))

    if print_mod == 0:
        print_mod = in_flight_max * 5

    include_chaos = True
    if chaos.upper() == "FALSE":
        include_chaos = False

    include_con_actions = True
    if consumer_actions.upper() == "FALSE":
        include_con_actions = False

    sac_enabled = True
    if sac.upper() == "FALSE":
        sac_enabled = False

    message_type = "sequence"

    for test_number in range(tests):
        print("")
        console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER")

        if new_cluster.upper() == "TRUE":
            subprocess.call(["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
            console_out("Waiting for cluster...", "TEST RUNNER")
            time.sleep(30)

        console_out("Cluster status:", "TEST RUNNER")
        subprocess.call(["bash", "../cluster/cluster-status.sh"])

        broker_manager = BrokerManager()
        broker_manager.load_initial_nodes()
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()

        queue_created = False
        while queue_created == False:
            if sac_enabled:
                queue_created = broker_manager.create_sac_queue(mgmt_node, queue_name, cluster_size, queue_type)
            else:
                queue_created = broker_manager.create_queue(mgmt_node, queue_name, cluster_size, queue_type)
            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(print_mod)
        stats = QueueStats('jack', 'jack', queue_name)
        chaos = ChaosExecutor(initial_nodes)
        if chaos_mode == "partitions":
            chaos.only_partitions()
        elif chaos_mode == "nodes":
            chaos.only_kill_nodes()

        consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER")
        pub_node = broker_manager.get_random_init_node()
        publisher = RabbitPublisher(f"PUBLISHER(Test:{test_number} Id:P1)", initial_nodes, pub_node, in_flight_max, 120, print_mod)
        consumer_manager.add_consumers(consumer_count, test_number, queue_name)

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()
        consumer_manager.start_consumers()

        if publisher_count == 1:
            pub_thread = threading.Thread(target=publisher.publish_direct, args=(queue_name, count, sequence_count, 0, "sequence"))
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(target=chaos.start_random_single_action_and_repair, args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if include_con_actions:
            consumer_action_thread = threading.Thread(target=consumer_manager.start_random_consumer_actions, args=(con_action_min_interval, con_action_max_interval))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        ctr = 0
        run_seconds = run_minutes * 60
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1
                if ctr % 60 == 0:
                    console_out(f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left", "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)", "TEST RUNNER")
                break

        try:
            chaos.stop_random_single_action_and_repair()
            consumer_manager.stop_random_consumer_actions()
            if include_chaos:
                chaos_thread.join()
            if include_con_actions:
                consumer_action_thread.join()
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER")

        console_out("Resuming consumers", "TEST RUNNER")
        consumer_manager.resume_all_consumers()

        if publisher_count == 1:
            publisher.stop(True)

        console_out("starting grace period for consumer to catch up", "TEST RUNNER")
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count() and len(publisher.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1

        confirmed_set = publisher.get_msg_set()
        not_consumed_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

        console_out("RESULTS ----------------------------------------", "TEST RUNNER")
        console_out(f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

        success = True
        if len(not_consumed_msgs) > 0:
            console_out(f"FAILED TEST: Potential failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER")
            success = False

        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------", "TEST RUNNER")

        try:
            consumer_manager.stop_all_consumers()
            if publisher_count == 1:
                pub_thread.join()
            msg_monitor.stop_consuming()
            monitor_thread.join()
        except Exception as e:
            console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

        console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main():
    args = get_args(sys.argv)
    cluster_size = get_optional_arg(args, "--cluster", "3")
    new_cluster = is_true(get_mandatory_arg(args, "--new-cluster"))
    use_blockade = is_true(get_optional_arg(args, "--use-blockade", "true"))
    image_version = get_optional_arg(args, "--image-version", "confluent")
    consumer_count = int(get_optional_arg(args, "--consumers", "1"))
    group_id = get_optional_arg(args, "--group-id", str(uuid.uuid1()))
    grace_period_sec = int(get_optional_arg(args, "--grace-period-sec", "300"))
    topic, is_new_topic = get_topic(new_cluster, args)
    partitions = get_optional_arg(args, "--partitions", "3")
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))
    producer_count = int(get_optional_arg(args, "--producers", 1))
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    if print_mod == 0:
        print_mod = in_flight_max * 3

    test_number = 1
    console_out("Starting...", "TEST RUNNER")

    broker_manager = BrokerManager(image_version, use_blockade)
    broker_manager.deploy(cluster_size, new_cluster)
    initial_nodes = broker_manager.get_initial_nodes()
    console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

    topic_name = topic
    if new_cluster or is_new_topic:
        mgmt_node = broker_manager.get_random_init_node()
        console_out(f"Creating topic {topic_name} using node {mgmt_node}", "TEST RUNNER")
        broker_manager.create_topic(mgmt_node, topic_name, rep_factor, partitions, min_insync_reps, unclean_failover)
        time.sleep(10)

    msg_monitor = MessageMonitor(print_mod, analyze)
    prod_manager = ProducerManager(broker_manager, "TEST RUNNER", topic_name)
    prod_manager.add_producers(producer_count, test_number, acks_mode, in_flight_max, print_mod, sequence_count)
    consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic_name, group_id)
    consumer_manager.add_consumers(consumer_count, test_number)

    monitor_thread = threading.Thread(target=msg_monitor.process_messages)
    monitor_thread.start()
    consumer_manager.start_consumers()
    time.sleep(30)
    prod_manager.start_producers()

    while True:
        try:
            command = input("a=add consumer, r=remove consumer - then hit enter")
            if command == "a":
                consumer_manager.add_consumer_and_start_consumer(test_number)
            elif command == "r":
                consumer_manager.stop_and_remove_consumer()
            else:
                console_out("Unknown command", "TEST_RUNNER")
        except KeyboardInterrupt:
            console_out("Stopping producer. Starting grace period for consumers to catch up.", "TEST_RUNNER")
            prod_manager.stop_all_producers()
            break

    if producer_count > 0:
        try:
            ctr = 0
            while ctr < grace_period_sec:
                if msg_monitor.get_unique_count() >= prod_manager.get_total_pos_ack_count() and len(prod_manager.get_total_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                    break
                time.sleep(1)
                ctr += 1
        except KeyboardInterrupt:
            console_out("Grace period ended", "TEST RUNNER")

    confirmed_set = prod_manager.get_total_msg_set()
    lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

    console_out("RESULTS------------------------------------", "TEST RUNNER")
    console_out(f"Confirmed count: {prod_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

    if analyze:
        success = True
        if len(lost_msgs) > 0:
            console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
            success = False
        if msg_monitor.get_out_of_order() == True:
            success = False
            console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")
        if success:
            console_out("TEST OK", "TEST RUNNER")

    console_out("RESULTS END------------------------------------", "TEST RUNNER")

    try:
        consumer_manager.stop_all_consumers()
        msg_monitor.stop_consuming()
        monitor_thread.join()
        prod_manager.stop_all_producers()
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

    console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
def main():
    args = get_args(sys.argv)
    new_cluster = get_mandatory_arg(args, "--new-cluster")
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    topic = get_mandatory_arg(args, "--topic")
    partitions = get_mandatory_arg(args, "--partitions")
    cluster_size = get_optional_arg(args, "--cluster", "3")
    in_flight_max = int(get_optional_arg(args, "--in-flight-max", 100))
    min_insync_reps = int(get_optional_arg(args, "--min-insync-replicas", "1"))
    unclean_failover = get_optional_arg(args, "--unclean-failover", "false")
    sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    rep_factor = get_optional_arg(args, "--rep-factor", "3")
    acks_mode = get_optional_arg(args, "--acks-mode", "all")
    print_mod = int(get_optional_arg(args, "--print-mod", "0"))

    if print_mod == 0:
        print_mod = in_flight_max * 3

    test_number = 1
    console_out("Starting...", "TEST RUNNER")

    if new_cluster.upper() == "TRUE":
        subprocess.call(["bash", "../automated/setup-test-run.sh", cluster_size, "3.8"])
        console_out("Waiting for cluster...", "TEST RUNNER")
        time.sleep(30)
    else:
        console_out("Using existing cluster...", "TEST RUNNER")

    console_out("Cluster status:", "TEST RUNNER")
    subprocess.call(["bash", "../cluster/cluster-status.sh"])

    broker_manager = BrokerManager()
    broker_manager.load_initial_nodes()
    initial_nodes = broker_manager.get_initial_nodes()
    console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")
    broker_manager.correct_advertised_listeners()

    topic_name = topic
    mgmt_node = broker_manager.get_random_init_node()
    console_out(f"Creating topic {topic_name} using node {mgmt_node}", "TEST RUNNER")
    broker_manager.create_topic(mgmt_node, topic_name, rep_factor, partitions, min_insync_reps, unclean_failover)
    time.sleep(10)

    msg_monitor = MessageMonitor(print_mod)
    chaos = ChaosExecutor(broker_manager)
    consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", topic_name)
    pub_node = broker_manager.get_random_init_node()
    producer = KafkaProducer(test_number, 1, broker_manager, acks_mode, in_flight_max, print_mod)
    producer.create_producer()
    producer.configure_as_sequence(sequence_count)
    consumer_manager.add_consumers(consumer_count, test_number)

    monitor_thread = threading.Thread(target=msg_monitor.process_messages)
    monitor_thread.start()
    consumer_manager.start_consumers()

    pub_thread = threading.Thread(target=producer.start_producing, args=(topic_name, 10000000))
    pub_thread.start()
    console_out("producer started", "TEST RUNNER")

    while True:
        try:
            time.sleep(1)
        except KeyboardInterrupt:
            console_out("Stopping producer. Starting grace period for consumers to catch up.", "TEST_RUNNER")
            producer.stop_producing()
            break

    try:
        ctr = 0
        while ctr < grace_period_sec:
            if msg_monitor.get_unique_count() >= producer.get_pos_ack_count() and len(producer.get_msg_set().difference(msg_monitor.get_msg_set())) == 0:
                break
            time.sleep(1)
            ctr += 1
    except KeyboardInterrupt:
        console_out("Grace period ended", "TEST RUNNER")

    confirmed_set = producer.get_msg_set()
    lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set())

    console_out("RESULTS------------------------------------", "TEST RUNNER")
    console_out(f"Confirmed count: {producer.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER")

    success = True
    if len(lost_msgs) > 0:
        console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER")
        success = False

    if msg_monitor.get_out_of_order() == True:
        success = False
        console_out("FAILED TEST: Received out-of-order messages", "TEST RUNNER")

    if success:
        console_out("TEST OK", "TEST RUNNER")

    console_out("RESULTS END------------------------------------", "TEST RUNNER")

    try:
        consumer_manager.stop_all_consumers()
        msg_monitor.stop_consuming()
        monitor_thread.join()
        pub_thread.join()
    except Exception as e:
        console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER")

    console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
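# Hedged usage sketch (script filename assumed; flags as parsed in main() above).
# Runs a single sequence producer against the topic until Ctrl+C, then gives the
# consumers a grace period before checking for loss and ordering violations:
#
#   python kafka-publish-consume.py --new-cluster true --consumers 2 \
#       --topic seq1 --partitions 1 --grace-period-sec 300 --acks-mode all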
        broker_ip = ""
        broker_port = ""
    else:
        broker_ip = get_mandatory_arg(args, "--broker-ip")
        broker_port = get_mandatory_arg(args, "--broker-port")
        amqproxy_ip = ""
        amqproxy_port = ""

    publish_mode = get_mandatory_arg_validated(args, "--pub-mode", ["async", "sync", "new-conn-per-msg", "fire-and-forget"])
    delay_seconds = int(get_optional_arg(args, "--pub-delay", "0"))
    if delay_seconds > 0:
        console_out(f"Starting with delay of {delay_seconds} seconds", "TEST RUNNER")
        time.sleep(delay_seconds)

    broker_manager = BrokerManager(mgmt_ip, mgmt_port, broker_name, broker_ip, broker_port, amqproxy_ip, amqproxy_port, user, password, use_https, virtual_host)

    queue_created = False
    while queue_created == False:
        queue_created = broker_manager.create_queue(queue, False)
        if queue_created == False:
            time.sleep(5)

    time.sleep(2)

    if use_toxiproxy:
        proxy_created = False
        while proxy_created == False:
            proxy_created = broker_manager.add_proxy("clients")
import logging

log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

args = get_args(sys.argv)
queue = get_optional_arg(args, "--queue", f"q{random.randint(0, 100000)}")
print_mod = int(get_optional_arg(args, "--print-mod", "1000"))
use_confirms = is_true(get_mandatory_arg(args, "--use-confirms"))
mgmt_ip = get_mandatory_arg(args, "--mgmt-ip")
broker_name = get_mandatory_arg(args, "--broker-name")
broker_ip = get_mandatory_arg(args, "--broker-ip")
broker_port = get_mandatory_arg(args, "--broker-port")
amqproxy_ip = get_mandatory_arg(args, "--amqproxy-ip")
amqproxy_port = get_mandatory_arg(args, "--amqproxy-port")

broker_manager = BrokerManager(mgmt_ip, broker_name, broker_ip, broker_port, amqproxy_ip, amqproxy_port)

queue_created = False
while queue_created == False:
    queue_created = broker_manager.create_queue(queue, False)
    if queue_created == False:
        time.sleep(5)

time.sleep(2)

proxy_publisher = SimplePublisher(broker_manager, "PUBLISHER", True, use_confirms, True, print_mod)
nonproxy_publisher = SimplePublisher(broker_manager, "PUBLISHER", False, use_confirms, True, print_mod)