def __init__(self, args, suffix):
    self.suffix = suffix
    self.config_tag = get_mandatory_arg(args, "--config-tag", self.suffix)
    self.technology = get_mandatory_arg_validated(args, "--technology", self.suffix,
                                                  ["rabbitmq"])
    self.cluster_size = int(get_optional_arg(args, "--cluster-size", self.suffix, "1"))
    self.broker_version = get_mandatory_arg(args, "--version", self.suffix)
    self.volume_size = get_optional_arg(args, "--volume-size", self.suffix, "50")  # for GCP deployment only
    self.filesystem = get_mandatory_arg_validated(args, "--filesystem", self.suffix,
                                                  ["ext4", "xfs"])
    self.tenancy = get_mandatory_arg_validated(args, "--tenancy", self.suffix,
                                               ["default", "dedicated"])
    self.core_count = get_mandatory_arg(args, "--core-count", self.suffix)
    self.threads_per_core = get_mandatory_arg(args, "--threads-per-core", self.suffix)
    self.vars_file = get_optional_arg(args, "--vars-file", self.suffix,
                                      f".variables/{self.technology}-generic-vars.yml")
    self.no_tcp_delay = get_optional_arg(args, "--no-tcp-delay", self.suffix, "true")
    self.policies_file = get_optional_arg(args, "--policies-file", self.suffix, "none")
    self.pub_connect_to_node = get_optional_arg_validated(
        args, "--pub-connect-to-node", self.suffix,
        ["roundrobin", "local", "non-local", "random"], "roundrobin")
    self.con_connect_to_node = get_optional_arg_validated(
        args, "--con-connect-to-node", self.suffix,
        ["roundrobin", "local", "non-local", "random"], "roundrobin")
    self.node_number = -1
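The get_*_arg helpers used throughout these constructors are defined elsewhere in the repo and are not shown here. Below is a minimal sketch of the assumed contract, for reference only: args is taken to be a sys.argv-style token list, and the suffix lets the same flag be repeated per configuration (e.g. --technology1 for configuration 1). Note that the standalone test scripts further down use suffix-free variants with slightly different parameter orders.

import sys

def get_optional_arg(args, name, suffix, default):
    # Hypothetical sketch: a flag's value is the token that follows it.
    key = name + suffix
    for i, token in enumerate(args):
        if token == key and i + 1 < len(args):
            return args[i + 1]
    return default

def get_mandatory_arg(args, name, suffix):
    value = get_optional_arg(args, name, suffix, None)
    if value is None:
        print(f"Missing mandatory argument {name}{suffix}")
        sys.exit(1)
    return value

def get_mandatory_arg_validated(args, name, suffix, allowed):
    value = get_mandatory_arg(args, name, suffix)
    if value not in allowed:
        print(f"Invalid value '{value}' for {name}{suffix}; allowed: {allowed}")
        sys.exit(1)
    return value

def get_optional_arg_validated(args, name, suffix, allowed, default):
    value = get_optional_arg(args, name, suffix, default)
    if value not in allowed:
        print(f"Invalid value '{value}' for {name}{suffix}; allowed: {allowed}")
        sys.exit(1)
    return value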
def __init__(self, args, suffix):
    super().__init__(args, suffix)
    self.generic_unix_url = get_optional_arg(args, "--generic-unix-url", self.suffix,
                                             "must-be-using-eks")
    self.instance = get_mandatory_arg(args, "--instance", self.suffix)
    self.volume1_iops_per_gb = get_optional_arg(
        args, "--volume1-iops-per-gb", self.suffix, "50")  # only applicable to io1, else ignored
    self.volume2_iops_per_gb = get_optional_arg(
        args, "--volume2-iops-per-gb", self.suffix, "50")  # only applicable to io1, else ignored
    self.volume3_iops_per_gb = get_optional_arg(
        args, "--volume3-iops-per-gb", self.suffix, "50")  # only applicable to io1, else ignored
    self.volume1_size = get_optional_arg(args, "--volume1-size", self.suffix, "50")
    self.volume2_size = get_optional_arg(args, "--volume2-size", self.suffix, "0")
    self.volume3_size = get_optional_arg(args, "--volume3-size", self.suffix, "0")
    volume_types = ["ebs-io1", "ebs-st1", "ebs-sc1", "ebs-gp2", "local-nvme", "pd-ssd"]
    self.volume1_type = get_optional_arg_validated(args, "--volume1-type", self.suffix,
                                                   volume_types, "ebs-gp2")
    self.volume2_type = get_optional_arg_validated(args, "--volume2-type", self.suffix,
                                                   volume_types, "ebs-gp2")
    self.volume3_type = get_optional_arg_validated(args, "--volume3-type", self.suffix,
                                                   volume_types, "ebs-gp2")
    self.volume1_mountpoint = get_optional_arg(args, "--volume1-mountpoint", self.suffix, "/volume1")
    self.volume2_mountpoint = get_optional_arg(args, "--volume2-mountpoint", self.suffix, "/volume2")
    self.volume3_mountpoint = get_optional_arg(args, "--volume3-mountpoint", self.suffix, "/volume3")
    self.data_volume = get_optional_arg(args, "--data-volume", self.suffix, "volume1")
    self.logs_volume = get_optional_arg(args, "--logs-volume", self.suffix, "volume1")
    self.quorum_volume = get_optional_arg(args, "--quorum-volume", self.suffix, "volume1")
    self.wal_volume = get_optional_arg(args, "--wal-volume", self.suffix, "volume1")
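For illustration, a hypothetical instantiation of this AWS configuration as configuration #1. AwsInstanceConfig stands in for whatever name the subclass actually has in the repo, and the flag values are invented:

cli = ["--config-tag1", "c1", "--technology1", "rabbitmq", "--version1", "3.8.2",
       "--filesystem1", "xfs", "--tenancy1", "default",
       "--core-count1", "8", "--threads-per-core1", "2",
       "--instance1", "r5.2xlarge", "--volume1-size1", "200"]
config = AwsInstanceConfig(cli, "1")  # hypothetical class name
print(config.instance, config.volume1_type)  # r5.2xlarge ebs-gp2 (the default)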
def __init__(self, args):
    self.run_id = str(uuid.uuid4())
    self.tags = get_mandatory_arg(args, "--tags", "")
    self.mode = get_optional_arg_validated(args, "--mode", "",
                                           ["logged-benchmark", "model"],
                                           "logged-benchmark")
    self.config_count = int(get_optional_arg(args, "--config-count", "", "1"))
    self.new_instance_per_run = is_true(get_optional_arg(args, "--new-instance-per-run", "", "false"))
    self.no_destroy = is_true(get_optional_arg(args, "--no-destroy", "", "false"))
    self.no_deploy = is_true(get_optional_arg(args, "--no-deploy", "", "false"))
    self.run_tag = get_optional_arg(args, "--run-tag", "", "none")
    self.playlist_file = get_mandatory_arg(args, "--playlist-file", "")
    self.background_policies_file = get_optional_arg(args, "--bg-policies-file", "", "none")
    self.background_topology_file = get_optional_arg(args, "--bg-topology-file", "", "none")
    self.background_delay = int(get_optional_arg(args, "--bg-delay", "", "0"))
    self.background_step_seconds = int(get_optional_arg(args, "--bg-step-seconds", "", "0"))
    self.background_step_repeat = int(get_optional_arg(args, "--bg-step-repeat", "", "0"))
    self.gap_seconds = int(get_mandatory_arg(args, "--gap-seconds", ""))
    self.repeat_count = int(get_optional_arg(args, "--repeat", "", "1"))
    self.parallel_count = int(get_optional_arg(args, "--parallel", "", "1"))
    self.override_step_seconds = int(get_optional_arg(args, "--override-step-seconds", "", "0"))
    self.override_step_repeat = int(get_optional_arg(args, "--override-step-repeat", "", "0"))
    self.override_step_msg_limit = int(get_optional_arg(args, "--override-step-msg-limit", "", "0"))
    self.override_broker_hosts = get_optional_arg(args, "--override-broker-hosts", "", "")
    self.ami = get_mandatory_arg(args, "--ami", "")
    self.broker_sg = get_mandatory_arg(args, "--broker-sg", "")
    self.loadgen_sg = get_mandatory_arg(args, "--loadgen-sg", "")
    self.loadgen_instance = get_mandatory_arg(args, "--loadgen-instance", "")
    self.subnet = get_mandatory_arg(args, "--subnet", "")
    self.key_pair = get_mandatory_arg(args, "--keypair", "")
    self.username = "******"
    self.password = get_mandatory_arg(args, "--password", "")
    self.postgres_url = get_mandatory_arg(args, "--postgres-jdbc-url", "")
    self.postgres_user = get_mandatory_arg(args, "--postgres-user", "")
    self.postgres_pwd = get_mandatory_arg_no_print(args, "--postgres-password", "")
    self.node_counter = int(get_optional_arg(args, "--start-node-num-from", "", "1"))
    self.hosting = "aws"
    self.log_level = get_optional_arg(args, "--log-level", "", "info")
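Two more helpers are assumed by this constructor: is_true for boolean flags, and a no-echo variant for secrets. Plausible sketches, inferred from the call sites (the real implementations may differ):

def is_true(value):
    # Boolean flags arrive as strings such as "true"/"false".
    return str(value).lower() in ("true", "1", "yes")

def get_mandatory_arg_no_print(args, name, suffix):
    # Presumably identical to get_mandatory_arg except that the value is
    # never echoed to the console (used for --postgres-password).
    return get_mandatory_arg(args, name, suffix)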
def __init__(self, args):
    self.run_id = str(uuid.uuid4())
    self.tags = get_mandatory_arg(args, "--tags", "")
    self.mode = get_optional_arg_validated(args, "--mode", "",
                                           ["benchmark", "model"], "benchmark")
    self.config_count = int(get_optional_arg(args, "--config-count", "", "1"))
    self.new_instance_per_run = is_true(get_optional_arg(args, "--new-instance-per-run", "", "false"))
    self.no_destroy = is_true(get_optional_arg(args, "--no-destroy", "", "false"))
    self.no_deploy = is_true(get_optional_arg(args, "--no-deploy", "", "false"))
    self.restart_brokers = is_true(get_optional_arg(args, "--restart-brokers", "", "true"))
    self.run_tag = get_optional_arg(args, "--run-tag", "", "none")
    self.playlist_file = get_mandatory_arg(args, "--playlist-file", "")
    # Note: for AWS, background load has been moved to playlists. TODO: do the same for GCP.
    self.background_policies_file = get_optional_arg(args, "--bg-policies-file", "", "none")  # GCP only
    self.background_topology_file = get_optional_arg(args, "--bg-topology-file", "", "none")  # GCP only
    self.background_delay = int(get_optional_arg(args, "--bg-delay-seconds", "", "0"))  # GCP only
    self.background_step_seconds = int(get_optional_arg(args, "--bg-step-seconds", "", "0"))  # GCP only
    self.background_step_repeat = int(get_optional_arg(args, "--bg-step-repeat", "", "0"))  # GCP only
    self.gap_seconds = int(get_mandatory_arg(args, "--gap-seconds", ""))
    self.start_allowance_ms = int(get_optional_arg(args, "--start-allowance-seconds", "", "60"))  # the flag value is in seconds despite the field name
    self.repeat_count = int(get_optional_arg(args, "--repeat", "", "1"))
    self.parallel_count = int(get_optional_arg(args, "--parallel", "", "1"))
    self.override_step_seconds = int(get_optional_arg(args, "--override-step-seconds", "", "0"))
    self.override_step_repeat = int(get_optional_arg(args, "--override-step-repeat", "", "0"))
    self.override_step_msg_limit = int(get_optional_arg(args, "--override-step-msg-limit", "", "0"))
    self.override_broker_hosts = get_optional_arg(args, "--override-broker-hosts", "", "")
    self.federation_enabled = is_true(get_optional_arg(args, "--federation-enabled", "", "false"))
    self.attempts = get_optional_arg(args, "--attempts", "", "1")
    self.warmUpSeconds = get_optional_arg(args, "--warm-up-seconds", "", "0")
    # Model mode only. Values: dataloss, duplicates, ordering, consumption, connectivity.
    # Don't use ordering unless there is one consumer per queue.
    self.checks = get_optional_arg(args, "--checks", "", "dataloss,duplicates,connectivity")
    self.grace_period_sec = get_optional_arg(args, "--grace-period-sec", "", "60")
    self.username = "******"
    self.password = get_mandatory_arg(args, "--password", "")
    self.postgres_url = get_mandatory_arg(args, "--postgres-jdbc-url", "")
    self.postgres_user = get_mandatory_arg(args, "--postgres-user", "")
    self.postgres_pwd = get_mandatory_arg_no_print(args, "--postgres-password", "")
    self.node_counter = int(get_optional_arg(args, "--start-node-num-from", "", "1"))
    self.log_level = get_optional_arg(args, "--log-level", "", "info")
    self.influx_subpath = get_mandatory_arg(args, "--influx-subpath", "")
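A hypothetical minimal invocation of this variant, covering only its mandatory flags. The script name and placeholder values are assumptions:

# python orchestrator.py \
#     --tags "qq-baseline" \
#     --playlist-file playlists/my-playlist.json \
#     --gap-seconds 120 \
#     --password <broker-password> \
#     --postgres-jdbc-url <jdbc-url> \
#     --postgres-user <user> \
#     --postgres-password <password> \
#     --influx-subpath <subpath>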
def main(): print("quorum-queue-test.py") args = get_args(sys.argv) count = -1 # no limit tests = int(get_mandatory_arg(args, "--tests")) actions = int(get_mandatory_arg(args, "--actions")) in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10)) grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec")) cluster_size = get_optional_arg(args, "--cluster", "3") queue = get_mandatory_arg(args, "--queue") sac_enabled = is_true(get_mandatory_arg(args, "--sac")) chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed") chaos_min_interval = int( get_optional_arg(args, "--chaos-min-interval", "30")) chaos_max_interval = int( get_optional_arg(args, "--chaos-max-interval", "120")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) for test_number in range(1, tests + 1): print("") console_out( f"TEST RUN: {str(test_number)} of {tests}--------------------------", "TEST RUNNER") setup_complete = False while not setup_complete: broker_manager = BrokerManager() broker_manager.deploy(cluster_size, True, rmq_version, False) initial_nodes = broker_manager.get_initial_nodes() console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER") print_mod = in_flight_max * 5 queue_name = queue + "_" + str(test_number) mgmt_node = broker_manager.get_random_init_node() queue_created = False qc_ctr = 0 while queue_created == False and qc_ctr < 20: qc_ctr += 1 if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, cluster_size, 0) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, cluster_size, 0) if queue_created: setup_complete = True else: time.sleep(5) time.sleep(10) msg_monitor = MessageMonitor("qqt", test_number, print_mod, True, False) publisher = RabbitPublisher(1, test_number, broker_manager, in_flight_max, 120, print_mod) publisher.configure_sequence_direct(queue_name, count, 0, 1) consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", False) consumer_manager.add_consumers(1, test_number, queue_name, prefetch) chaos = ChaosExecutor(initial_nodes) if chaos_mode == "partitions": chaos.only_partitions() elif chaos_mode == "nodes": chaos.only_kill_nodes() monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() consumer_manager.start_consumers() pub_thread = threading.Thread(target=publisher.start_publishing) pub_thread.start() console_out("publisher started", "TEST RUNNER") for action_num in range(1, actions + 1): wait_sec = random.randint(chaos_min_interval, chaos_max_interval) console_out(f"waiting for {wait_sec} seconds before next action", "TEST RUNNER") time.sleep(wait_sec) console_out( f"execute chaos action {str(action_num)}/{actions} of test {str(test_number)}", "TEST RUNNER") chaos.execute_chaos_action() subprocess.call(["bash", "../cluster/cluster-status.sh"]) time.sleep(60) console_out("repairing cluster", "TEST RUNNER") chaos.repair() console_out("repaired cluster", "TEST RUNNER") publisher.stop_publishing() console_out("starting grace period for consumer to catch up", "TEST RUNNER") ctr = 0 while True: ms_since_last_msg = datetime.datetime.now( ) - msg_monitor.get_last_msg_time() if msg_monitor.get_unique_count() >= publisher.get_pos_ack_count( ) and len(publisher.get_msg_set().difference( msg_monitor.get_msg_set())) == 0: break elif ctr > grace_period_sec and ms_since_last_msg.total_seconds( ) > 15: break time.sleep(1) ctr += 1 
confirmed_set = publisher.get_msg_set() lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set()) console_out("RESULTS------------------------------------", "TEST RUNNER") if len(lost_msgs) > 0: console_out(f"Lost messages count: {len(lost_msgs)}", "TEST RUNNER") for msg in lost_msgs: console_out(f"Lost message: {msg}", "TEST RUNNER") console_out( f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") success = True if msg_monitor.get_out_of_order() == True: console_out("FAILED TEST: OUT OF ORDER MESSAGES", "TEST RUNNER") success = False if len(lost_msgs) > 0: console_out("FAILED TEST: LOST MESSAGES", "TEST RUNNER") success = False if success == True: console_out("TEST OK", "TEST RUNNER") console_out("RESULTS END------------------------------------", "TEST RUNNER") try: consumer_manager.stop_all_consumers() pub_thread.join() except Exception as e: console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER") console_out(f"TEST {str(test_number)} COMPLETE", "TEST RUNNER")
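The script presumably ends with the usual entry-point guard; shown here with a hypothetical invocation as a comment (the values are examples, not defaults):

if __name__ == "__main__":
    main()

# Example (hypothetical values):
# python quorum-queue-test.py --tests 5 --actions 10 --grace-period-sec 300 \
#     --queue qq --sac false --cluster 3 --rmq-version 3.8-beta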
def main(): print("publish-consume.py") args = get_args(sys.argv) # cluster new_cluster = is_true( get_optional_arg_validated(args, "--new-cluster", "false", ["true", "false"])) if new_cluster: cluster_size = int(get_mandatory_arg(args, "--cluster-size")) else: cluster_size = int(get_optional_arg(args, "--cluster-size", "3")) rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) # queues and exchanges exchanges = as_list(get_optional_arg(args, "--exchanges", "")) queue_name = get_mandatory_arg(args, "--queue") queue_type = get_optional_arg_validated(args, "--queue-type", "mirrored", ["mirrored", "quorum"]) qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0")) rep_factor = int(get_optional_arg(args, "--rep-factor", str(cluster_size))) sac_enabled = is_true( get_optional_arg_validated(args, "--sac", "false", ["true", "false"])) if rmq_version == "3.7": if sac_enabled: console_out("Cannot use SAC mode with RabbitMQ 3.7", "TEST RUNNER") exit(1) if queue_type == "quorum": console_out("Cannot use quorum queues with RabbitMQ 3.7", "TEST RUNNER") exit(1) # publisher publisher_count = int(get_optional_arg(args, "--publishers", "1")) pub_mode = get_optional_arg_validated(args, "--pub-mode", "direct", ["direct", "exchange"]) msg_mode = get_optional_arg_validated( args, "--msg-mode", "sequence", ["sequence", "partitioned-sequence", "large-msgs", "hello"]) count = int(get_mandatory_arg(args, "--msgs")) dup_rate = float(get_optional_arg(args, "--dup-rate", "0")) sequence_count = int(get_optional_arg(args, "--sequences", 1)) in_flight_max = int(get_optional_arg(args, "--in-flight-max", 10)) # consumers consumer_count = int(get_optional_arg(args, "--consumers", "1")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) analyze = is_true( get_optional_arg_validated(args, "--analyze", "true", ["true", "false"])) print_mod = get_optional_arg(args, "--print-mod", in_flight_max * 5) broker_manager = BrokerManager() broker_manager.deploy(cluster_size, new_cluster, rmq_version, False) mgmt_node = broker_manager.get_random_init_node() queue_created = False while queue_created == False: if queue_type == "mirrored": if sac_enabled: queue_created = broker_manager.create_standard_sac_queue( mgmt_node, queue_name, rep_factor) else: queue_created = broker_manager.create_standard_queue( mgmt_node, queue_name, rep_factor) elif queue_type == "quorum": if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, rep_factor, qq_max_length) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, rep_factor, qq_max_length) if queue_created == False: time.sleep(5) broker_manager.declare_exchanges(queue_name, exchanges) time.sleep(10) if consumer_count > 0: msg_monitor = MessageMonitor("pub-con", 1, print_mod, analyze, False) consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", False) consumer_manager.add_consumers(consumer_count, 1, queue_name, prefetch) monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() consumer_manager.start_consumers() if publisher_count > 0: pub_manager = PublisherManager(broker_manager, 1, "TEST RUNNER", publisher_count, in_flight_max, print_mod) if pub_mode == "direct": if msg_mode == "sequence": pub_manager.add_sequence_direct_publishers( queue_name, count, dup_rate, sequence_count) elif pub_mode == "partitioned-sequence": print("Cannot use partitioned sequence mode with direct mode") exit(1) elif 
pub_mode == "large-msgs": msg_size = int(get_mandatory_arg(args, "--msg-size")) pub_manager.add_large_msgs_direct_publishers( queue_name, count, dup_rate, msg_size) else: pub_manager.add_hello_msgs_direct_publishers( queue_name, count, dup_rate) elif pub_mode == "exchange": if len(exchanges) == 0: console_out("No exchanges provided", "TEST RUNNER") exit(1) if msg_mode == "sequence": pub_manager.add_sequence_to_exchanges_publishers( exchanges, "", count, dup_rate, sequence_count) elif msg_mode == "partitioned-sequence": pub_manager.add_partitioned_sequence_to_exchanges_publishers( exchanges, count, dup_rate, sequence_count) elif msg_mode == "large-msgs": msg_size = int(get_mandatory_arg(args, "--msg-size")) pub_manager.add_large_msgs_to_exchanges_publishers( exchanges, "", count, dup_rate, msg_size) else: pub_manager.add_hello_msgs_to_exchanges_publishers( exchanges, "", count, dup_rate) pub_manager.start_publishers() while True: try: console_out( "Press + to add a consumer, - to remove a consumer, ! to remove the active consumer (SAC only)", "TEST_RUNNER") input_str = input() if input_str == "+": consumer_manager.add_consumer_and_start_consumer( 1, queue_name, prefetch) elif input_str == "-": consumer_manager.stop_and_remove_oldest_consumer() else: consumer_manager.stop_and_remove_specfic_consumer(input_str) except KeyboardInterrupt: if publisher_count > 0: console_out( "Stopping publishers. Starting grace period for consumers to catch up.", "TEST_RUNNER") pub_manager.stop_all_publishers() break if publisher_count > 0 and consumer_count > 0: try: ctr = 0 while ctr < 300: if msg_monitor.get_unique_count( ) >= pub_manager.get_total_pos_ack_count() and len( pub_manager.get_total_msg_set().difference( msg_monitor.get_msg_set())) == 0: break time.sleep(1) ctr += 1 except KeyboardInterrupt: console_out("Grace period ended", "TEST RUNNER") confirmed_set = pub_manager.get_total_msg_set() lost_msgs = confirmed_set.difference(msg_monitor.get_msg_set()) console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Confirmed count: {pub_manager.get_total_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") if analyze: success = True if len(lost_msgs) > 0: console_out(f"FAILED TEST: Lost messages: {len(lost_msgs)}", "TEST RUNNER") success = False if msg_monitor.get_out_of_order() == True: success = False console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER") if success: console_out("TEST OK", "TEST RUNNER") elif publisher_count > 0: console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Confirmed count: {pub_manager.get_total_pos_ack_count()}", "TEST RUNNER") elif consumer_count > 0: console_out("RESULTS------------------------------------", "TEST RUNNER") console_out( f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") console_out("RESULTS END------------------------------------", "TEST RUNNER") try: if consumer_count > 0: consumer_manager.stop_all_consumers() msg_monitor.stop_consuming() monitor_thread.join(10) except Exception as e: console_out("Failed to clean up test correctly: " + str(e), "TEST RUNNER") console_out(f"TEST 1 COMPLETE", "TEST RUNNER")
def main(): print("random-test.py") #signal.signal(signal.SIGINT, interuppt_handler) args = get_args(sys.argv) count = -1 # no limit test_name = get_mandatory_arg(args, "--test-name") tests = int(get_mandatory_arg(args, "--tests")) run_minutes = int(get_mandatory_arg(args, "--run-minutes")) consumer_count = int(get_mandatory_arg(args, "--consumers")) prefetch = int(get_optional_arg(args, "--pre-fetch", "10")) grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec")) queue = get_mandatory_arg(args, "--queue") queue_type = get_mandatory_arg(args, "--queue-type") analyze = is_true(get_optional_arg(args, "--analyze", "true")) if queue_type == "quorum": qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0")) sac_enabled = is_true(get_mandatory_arg(args, "--sac")) log_messages = is_true(get_optional_arg(args, "--log-msgs", "false")) publisher_count = int(get_optional_arg(args, "--publishers", "1")) if publisher_count > 0: in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10")) print_mod = int( get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}")) sequence_count = int(get_optional_arg(args, "--sequences", "1")) else: print_mod = int(get_optional_arg(args, "--print-mod", f"1000")) new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true")) cluster_size = get_optional_arg(args, "--cluster", "3") rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta", ["3.7", "3.8-beta", "3.8-alpha"]) stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash", ["crash", "close", "cancel"]) use_toxiproxy = False consumer_hard_close = False if stop_mode == "crash": use_toxiproxy = True elif stop_mode == "close": consumer_hard_close = True include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true")) if include_chaos: chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed") chaos_min_interval = int( get_optional_arg(args, "--chaos-min-interval", "60")) chaos_max_interval = int( get_optional_arg(args, "--chaos-max-interval", "120")) include_con_actions = is_true( get_optional_arg(args, "--consumer-actions", "true")) if include_con_actions: con_action_min_interval = int( get_optional_arg(args, "--consumer-min-interval", "20")) con_action_max_interval = int( get_optional_arg(args, "--consumer-max-interval", "60")) failed_test_log = list() failed_tests = set() for test_number in range(tests): print("") subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"]) console_out(f"TEST RUN: {str(test_number)} --------------------------", "TEST RUNNER") broker_manager = BrokerManager() broker_manager.deploy(cluster_size, new_cluster, rmq_version, use_toxiproxy) initial_nodes = broker_manager.get_initial_nodes() console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER") queue_name = queue + "_" + str(test_number) mgmt_node = broker_manager.get_random_init_node() queue_created = False while queue_created == False: if queue_type == "mirrored": if sac_enabled: queue_created = broker_manager.create_standard_sac_queue( mgmt_node, queue_name, cluster_size) else: queue_created = broker_manager.create_standard_queue( mgmt_node, queue_name, cluster_size) elif queue_type == "quorum": if sac_enabled: queue_created = broker_manager.create_quorum_sac_queue( mgmt_node, queue_name, cluster_size, qq_max_length) else: queue_created = broker_manager.create_quorum_queue( mgmt_node, queue_name, cluster_size, qq_max_length) if queue_created == False: time.sleep(5) time.sleep(10) msg_monitor = MessageMonitor(test_name, test_number, 
print_mod, analyze, log_messages) chaos = ChaosExecutor(initial_nodes) if include_chaos: if chaos_mode == "partitions": chaos.only_partitions() elif chaos_mode == "nodes": chaos.only_kill_nodes() monitor_thread = threading.Thread(target=msg_monitor.process_messages) monitor_thread.start() if consumer_count > 0: consumer_manager = ConsumerManager(broker_manager, msg_monitor, "TEST RUNNER", use_toxiproxy) consumer_manager.add_consumers(consumer_count, test_number, queue_name, prefetch) consumer_manager.start_consumers() if publisher_count == 1: publisher = RabbitPublisher(1, test_number, broker_manager, in_flight_max, 120, print_mod) publisher.configure_sequence_direct(queue_name, count, 0, sequence_count) pub_thread = threading.Thread(target=publisher.start_publishing) pub_thread.start() console_out("publisher started", "TEST RUNNER") if include_con_actions or include_chaos: init_wait_sec = 20 console_out( f"Will start chaos and consumer actions in {init_wait_sec} seconds", "TEST RUNNER") time.sleep(init_wait_sec) if include_chaos: chaos_thread = threading.Thread( target=chaos.start_random_single_action_and_repair, args=(chaos_min_interval, chaos_max_interval)) chaos_thread.start() console_out("Chaos executor started", "TEST RUNNER") if include_con_actions: consumer_action_thread = threading.Thread( target=consumer_manager.start_random_consumer_actions, args=(con_action_min_interval, con_action_max_interval, consumer_hard_close)) consumer_action_thread.start() console_out("Consumer actions started", "TEST RUNNER") ctr = 0 run_seconds = run_minutes * 60 while ctr < run_seconds and not stop_please: try: time.sleep(1) ctr += 1 if ctr % 60 == 0: console_out( f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left", "TEST RUNNER") except KeyboardInterrupt: console_out( f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)", "TEST RUNNER") break try: chaos.stop_random_single_action_and_repair() if consumer_count > 0: consumer_manager.stop_random_consumer_actions() if include_chaos: chaos_thread.join(30) if include_con_actions: consumer_action_thread.join(30) except Exception as e: console_out("Failed to stop chaos cleanly: " + str(e), "TEST RUNNER") if publisher_count > 0: publisher.stop_publishing() if consumer_count > 0: console_out("Resuming consumers", "TEST RUNNER") consumer_manager.resume_all_consumers() console_out("Starting grace period for consumer to catch up", "TEST RUNNER") ctr = 0 try: while ctr < grace_period_sec: if publisher_count > 0 and msg_monitor.get_unique_count( ) >= publisher.get_pos_ack_count() and len( publisher.get_msg_set().difference( msg_monitor.get_msg_set())) == 0: break time.sleep(1) ctr += 1 except KeyboardInterrupt: console_out("Grace period ended", "TEST RUNNER") console_out("RESULTS ----------------------------------------", "TEST RUNNER") if publisher_count > 0: confirmed_set = publisher.get_msg_set() not_consumed_msgs = confirmed_set.difference( msg_monitor.get_msg_set()) console_out( f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") else: not_consumed_msgs = set() console_out( f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}", "TEST RUNNER") success = True if consumer_count > 0: if len(not_consumed_msgs) > 0: if sac_enabled: console_out( f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to 
promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER") else: console_out( f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}", "TEST RUNNER") failed_test_log.append( f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages." ) failed_tests.add(test_number) lost_ctr = 0 sorted_msgs = list(not_consumed_msgs) sorted_msgs.sort() for msg in sorted_msgs: console_out(f"Lost? {msg}", "TEST RUNNER") lost_ctr += 1 if lost_ctr > 500: console_out("More than 500, truncated list", "TEST RUNNER") break success = False if msg_monitor.get_out_of_order() == True: success = False console_out(f"FAILED TEST: Received out-of-order messages", "TEST RUNNER") failed_test_log.append( f"Test {test_number} FAILURE: Received out-of-order messages" ) failed_tests.add(test_number) if success: console_out("TEST OK", "TEST RUNNER") console_out("RESULTS END ------------------------------------", "TEST RUNNER") try: if consumer_count > 0: consumer_manager.stop_all_consumers() if publisher_count == 1: pub_thread.join(30) msg_monitor.stop_consuming() monitor_thread.join(30) except Exception as e: console_out_exception("Failed to clean up test correctly.", e, "TEST RUNNER") broker_manager.zip_log_files(test_name, test_number) console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER") console_out("", "TEST RUNNER") console_out("SUMMARY", "TEST RUNNER") console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}", "TEST RUNNER") for line in failed_test_log: console_out(line, "TEST RUNNER") console_out("TEST RUN COMPLETE", "TEST RUNNER")
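main() reads a module-level stop_please flag, and the commented-out signal.signal line at its top references an interuppt_handler. A plausible sketch of both, plus the assumed entry point (the handler body is a guess; the misspelled name matches the reference):

stop_please = False

def interuppt_handler(signum, frame):
    # Assumed behavior: ask the main loop to wind down instead of dying mid-test.
    global stop_please
    stop_please = True

if __name__ == "__main__":
    main()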