Example No. 1
    def disconnect(self):
        try:
            if not self.hard_close and self.channel is not None and self.channel.is_open:
                self.channel.stop_consuming()
                console_out(f"Cancelled consumer", self.get_actor())
                self.connection.sleep(2)

            if self.is_connection_open():
                self.connection.close()
                console_out(f"Closed connection", self.get_actor())

            return True
        except AttributeError:
            console_out(
                f"Closed connection (with internal pika attribute error)",
                self.get_actor())
        except TypeError:
            console_out(f"Closed connection (with internal pika type error)",
                        self.get_actor())
        except pika.exceptions.ConnectionWrongStateError:
            console_out(f"Cannot close connection, already closed",
                        self.get_actor())
        except pika.exceptions.StreamLostError:
            console_out(f"Closed connection (stream lost)", self.get_actor())
        except Exception as e:
            console_out_exception("Failed trying to disconnect.", e,
                                  self.get_actor())
            return False
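Every snippet in this listing writes through console_out and console_out_exception, which are project helpers not shown on this page. A minimal sketch of what they might look like, assuming the (message, actor) argument order used in most of the examples (a few snippets pass the actor first, so the real helpers may differ):

import datetime
import traceback


def console_out(message, actor):
    # Hypothetical helper: prefix each line with a timestamp and the acting component.
    print(f"{datetime.datetime.now()} [{actor}] {message}")


def console_out_exception(message, exception, actor):
    # Hypothetical helper: log the message plus the exception and its traceback.
    console_out(f"{message} Exception: {exception}", actor)
    traceback.print_exc()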
Example No. 2
    def teardown_all(self, configurations, key_pair, run_tag, no_destroy):
        try:
            console_out(self.actor, f"Getting logs")
            start_node, end_node = self.get_start_end_nodes(configurations)
            self.get_logs(key_pair, run_tag, start_node, end_node)
            console_out(self.actor, f"Logs retrieved")
        except Exception as e:
            console_out_exception(self.actor, "Failed retrieving logs", e)

        if no_destroy:
            console_out(self.actor, "No teardown as --no-destroy set to true")
        else:
            console_out(self.actor, "Terminating all servers")

            for config_tag in configurations:
                console_out(self.actor,
                            f"TEARDOWN FOR configuration {config_tag}")
                unique_conf_list = configurations[config_tag]
                for p in range(len(unique_conf_list)):
                    unique_conf = unique_conf_list[p]

                    for n in range(0, unique_conf.cluster_size):
                        node_num = int(unique_conf.node_number) + n
                        console_out(self.actor,
                                    f"TEARDOWN FOR node {node_num}")
                        self.teardown(unique_conf.technology, str(node_num),
                                      run_tag, no_destroy)
                console_out(self.actor, "All servers terminated")
            exit(1)
Example No. 3
    def stop_start_consumer(self, con_index, hard_close):
        con = self.consumers[con_index]
        try:
            if self.use_toxiproxy:
                console_out(
                    f"SIMULATING CRASH OF CONSUMER {con_index+1} --------------------------------------",
                    self.actor)
                self.broker_manager.disable_consumer_proxy(
                    con.get_consumer_id())
                time.sleep(1)
                con.perform_hard_close()
                time.sleep(1)
                self.broker_manager.enable_consumer_proxy(
                    con.get_consumer_id())
            else:
                console_out(
                    f"STOPPING CONSUMER {con_index+1} --------------------------------------",
                    self.actor)
                if hard_close:
                    con.perform_hard_close()
                else:
                    con.stop_consuming()
            self.consumer_threads[con_index].join(15)

            con.connect()
            self.consumer_threads[con_index] = threading.Thread(
                target=con.consume)
            self.consumer_threads[con_index].start()
        except Exception as e:
            console_out_exception("Failed to stop/start consumer correctly", e,
                                  self.actor)
Example No. 4
def send_to_broker():
    try:
        nonproxy_publisher.publish_msg_with_new_conn(
            "", "", f"Hello at {datetime.datetime.now()}")
        return ""
    except Exception as e:
        console_out_exception("NoProxy", e, "WEB")
        return str(e)
Example No. 5
    def start_random_single_action_and_repair(self, min_duration_seconds,
                                              max_duration_seconds):
        while self.stop_random == False:
            try:
                self.single_action_and_repair(min_duration_seconds,
                                              max_duration_seconds)
            except Exception as e:
                console_out_exception("Failed performing action and repair", e,
                                      "TEST RUNNER")
Example No. 6
    def close_connection(self):
        if self.connection is not None and self.connection.is_open:
            try:
                console_out("Closing connection...", self.get_actor())
                self.connection.close()
                console_out("Connection closed", self.get_actor())
            except pika.exceptions.ConnectionWrongStateError:
                console_out("Cannot close connection, already closed", self.get_actor())
            except Exception as e:
                console_out_exception("Failed closing connection", e, self.get_actor())
Example No. 7
    def is_connection_open(self):
        try:
            if self.connection is None:
                return False

            return self.connection.is_open
        except Exception as e:
            console_out_exception("Failed checking if connection is open", e,
                                  self.get_actor())
            return False
Example No. 8
    def get_logs_of_all_configs(self, common_conf, configurations):
        for config_tag in configurations:
            unique_conf_list = configurations[config_tag]
            for p in range(len(unique_conf_list)):
                unique_conf = unique_conf_list[p]

                try:
                    start_node, end_node = self.get_start_end_nodes_of_config(unique_conf)
                    self.get_logs(common_conf, unique_conf.logs_volume, start_node, end_node)
                except Exception as e:
                    console_out_exception(self.actor, "Failed retrieving logs", e)
Example No. 9
    def add_proxy(self, name):
        try:
            r = requests.post(
                "http://toxiproxy:8474/proxies",
                data="{\"name\":\"" + name +
                "\",\"listen\":\"0.0.0.0:5672\",\"upstream\":\"" +
                self.mgmt_ip + ":5672\"}")

            console_out(f"Proxy add response: {r}", "TEST RUNNER")
            return r.status_code == 201 or r.status_code == 204 or r.status_code == 409
        except Exception as e:
            console_out_exception("Could not add proxy", e, "TEST RUNNER")
            return False
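The request body above is assembled by hand with escaped quotes. For readability, the same Toxiproxy payload could be built from a dict; this sketch is an illustrative alternative (not the project's code) and assumes the API accepts an application/json Content-Type, which requests sets automatically when the json= parameter is used:

import requests


def add_proxy(name, mgmt_ip):
    # Same fields as the string-concatenation version, built from a dict.
    payload = {
        "name": name,
        "listen": "0.0.0.0:5672",
        "upstream": f"{mgmt_ip}:5672",
    }
    r = requests.post("http://toxiproxy:8474/proxies", json=payload)
    # Treat created (201), no content (204) and already exists (409) as success,
    # mirroring the status-code check in the example above.
    return r.status_code in (201, 204, 409)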
Example No. 10
    def open_persistent_connection(self):
        url = self.broker_manager.get_url(self.use_proxy)
        console_out(f"Attempting to connect to {url}", self.get_actor())

        try:
            parameters = pika.URLParameters(url)
            self.connection = pika.BlockingConnection(parameters)
            self.channel = self.connection.channel()
            if self.use_confirms:
                self.channel.confirm_delivery()

            return True
        except Exception as e:
            console_out_exception("Connection failed", e, self.get_actor())
            return False
Example No. 11
    def start_random_consumer_actions(self, min_seconds_interval,
                                      max_seconds_interval, hard_close):
        while self.stop_random == False:
            wait_sec = random.randint(min_seconds_interval,
                                      max_seconds_interval)
            console_out(f"Will execute consumer action in {wait_sec} seconds",
                        self.actor)
            self.wait_for(wait_sec)

            if self.stop_random == False:
                try:
                    self.do_consumer_action(hard_close)
                except Exception as e:
                    console_out_exception("Failed performing consumer action",
                                          e, "TEST RUNNER")
Example No. 12
    def start_random_stop_starts(self, min_seconds_interval,
                                 max_seconds_interval, hard_close):
        while self.stop_random == False:
            wait_sec = random.randint(min_seconds_interval,
                                      max_seconds_interval)
            console_out(
                f"Will execute stop/start consumer action in {wait_sec} seconds",
                self.actor)
            self.wait_for(wait_sec)

            if self.stop_random == False:
                try:
                    self.stop_start_consumers(hard_close)
                except Exception as e:
                    console_out_exception("Failed stopping/starting consumers",
                                          e, self.actor)
Example No. 13
    def get_logs_of_all_configs(self, common_conf, configurations):
        for config_tag in configurations:
            unique_conf_list = configurations[config_tag]
            for p in range(len(unique_conf_list)):
                unique_conf = unique_conf_list[p]

                if unique_conf.deployment == "ec2":
                    try:
                        start_node, end_node = self.get_start_end_nodes_of_config(
                            unique_conf)
                        self.get_logs(common_conf, unique_conf.logs_volume,
                                      start_node, end_node)
                    except Exception as e:
                        console_out_exception(self.actor,
                                              "Failed retrieving logs", e)
                elif unique_conf.deployment == "eks" or unique_conf.deployment == "gke":
                    console_out(self.actor,
                                "Log gathering not yet supported for EKS/GKE")
                else:
                    raise Exception(
                        f"Invalid deployment type: {unique_conf.deployment}")
Example No. 14
    def publish_msg_with_new_conn(self, send_to_exchange, rk, body):
        url = self.broker_manager.get_url(self.use_proxy)

        try:
            parameters = pika.URLParameters(url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()

            mandatory = False
            if self.use_confirms:
                channel.confirm_delivery()
                mandatory = True
            corr_id = str(uuid.uuid4())

            try:
                channel.basic_publish(exchange=send_to_exchange,
                                      routing_key=rk,
                                      body=body,
                                      mandatory=mandatory,
                                      properties=pika.BasicProperties(
                                          content_type='text/plain',
                                          delivery_mode=2,
                                          correlation_id=corr_id))
                self.pos_acks += 1
            except exceptions.UnroutableError:
                self.undeliverable += 1
                if self.undeliverable % 100 == 0:
                    console_out(
                        f"{str(self.undeliverable)} messages could not be delivered",
                        self.get_actor())
            except exceptions.NackError:
                self.neg_acks += 1

            connection.close()
        except Exception as e:
            console_out_exception(f"Connection to {url} failed", e,
                                  self.get_actor())
Example No. 15
    def connect(self):
        try:
            self.connected_node = self.broker_manager.get_current_node(
                self.consumer_id)
            ip = self.broker_manager.get_node_ip(self.connected_node)
            console_out(f"Connecting to {self.connected_node}",
                        self.get_actor())
            credentials = pika.PlainCredentials('jack', 'jack')
            parameters = pika.ConnectionParameters(
                ip,
                self.broker_manager.get_consumer_port(self.connected_node,
                                                      self.consumer_id), '/',
                credentials)
            self.connection = pika.BlockingConnection(parameters)
            self.channel = self.connection.channel()

            if self.prefetch > 0:
                self.channel.basic_qos(prefetch_count=self.prefetch)

            return True
        except Exception as e:
            console_out_exception("Failed trying to connect.", e,
                                  self.get_actor())
            return False
Example No. 16
def main():
    print("random-test.py")
    #signal.signal(signal.SIGINT, interuppt_handler)
    args = get_args(sys.argv)

    count = -1  # no limit
    test_name = get_mandatory_arg(args, "--test-name")
    tests = int(get_mandatory_arg(args, "--tests"))
    run_minutes = int(get_mandatory_arg(args, "--run-minutes"))
    consumer_count = int(get_mandatory_arg(args, "--consumers"))
    prefetch = int(get_optional_arg(args, "--pre-fetch", "10"))
    grace_period_sec = int(get_mandatory_arg(args, "--grace-period-sec"))
    queue = get_mandatory_arg(args, "--queue")
    queue_type = get_mandatory_arg(args, "--queue-type")
    analyze = is_true(get_optional_arg(args, "--analyze", "true"))

    if queue_type == "quorum":
        qq_max_length = int(get_optional_arg(args, "--qq-max-length", "0"))

    sac_enabled = is_true(get_mandatory_arg(args, "--sac"))
    log_messages = is_true(get_optional_arg(args, "--log-msgs", "false"))

    publisher_count = int(get_optional_arg(args, "--publishers", "1"))
    if publisher_count > 0:
        in_flight_max = int(get_optional_arg(args, "--in-flight-max", "10"))
        print_mod = int(
            get_optional_arg(args, "--print-mod", f"{in_flight_max * 5}"))
        sequence_count = int(get_optional_arg(args, "--sequences", "1"))
    else:
        print_mod = int(get_optional_arg(args, "--print-mod", f"1000"))

    new_cluster = is_true(get_optional_arg(args, "--new-cluster", "true"))
    cluster_size = get_optional_arg(args, "--cluster", "3")
    rmq_version = get_optional_arg_validated(args, "--rmq-version", "3.8-beta",
                                             ["3.7", "3.8-beta", "3.8-alpha"])
    stop_mode = get_optional_arg_validated(args, "--stop-mode", "crash",
                                           ["crash", "close", "cancel"])

    use_toxiproxy = False
    consumer_hard_close = False
    if stop_mode == "crash":
        use_toxiproxy = True
    elif stop_mode == "close":
        consumer_hard_close = True

    include_chaos = is_true(get_optional_arg(args, "--chaos-actions", "true"))
    if include_chaos:
        chaos_mode = get_optional_arg(args, "--chaos-mode", "mixed")
        chaos_min_interval = int(
            get_optional_arg(args, "--chaos-min-interval", "60"))
        chaos_max_interval = int(
            get_optional_arg(args, "--chaos-max-interval", "120"))

    include_con_actions = is_true(
        get_optional_arg(args, "--consumer-actions", "true"))
    if include_con_actions:
        con_action_min_interval = int(
            get_optional_arg(args, "--consumer-min-interval", "20"))
        con_action_max_interval = int(
            get_optional_arg(args, "--consumer-max-interval", "60"))

    failed_test_log = list()
    failed_tests = set()

    for test_number in range(tests):

        print("")
        subprocess.call(["mkdir", f"logs/{test_name}/{str(test_number)}"])
        console_out(f"TEST RUN: {str(test_number)} --------------------------",
                    "TEST RUNNER")
        broker_manager = BrokerManager()
        broker_manager.deploy(cluster_size, new_cluster, rmq_version,
                              use_toxiproxy)
        initial_nodes = broker_manager.get_initial_nodes()
        console_out(f"Initial nodes: {initial_nodes}", "TEST RUNNER")

        queue_name = queue + "_" + str(test_number)
        mgmt_node = broker_manager.get_random_init_node()
        queue_created = False

        while queue_created == False:
            if queue_type == "mirrored":
                if sac_enabled:
                    queue_created = broker_manager.create_standard_sac_queue(
                        mgmt_node, queue_name, cluster_size)
                else:
                    queue_created = broker_manager.create_standard_queue(
                        mgmt_node, queue_name, cluster_size)
            elif queue_type == "quorum":
                if sac_enabled:
                    queue_created = broker_manager.create_quorum_sac_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)
                else:
                    queue_created = broker_manager.create_quorum_queue(
                        mgmt_node, queue_name, cluster_size, qq_max_length)

            if queue_created == False:
                time.sleep(5)

        time.sleep(10)

        msg_monitor = MessageMonitor(test_name, test_number, print_mod,
                                     analyze, log_messages)
        chaos = ChaosExecutor(initial_nodes)

        if include_chaos:
            if chaos_mode == "partitions":
                chaos.only_partitions()
            elif chaos_mode == "nodes":
                chaos.only_kill_nodes()

        monitor_thread = threading.Thread(target=msg_monitor.process_messages)
        monitor_thread.start()

        if consumer_count > 0:
            consumer_manager = ConsumerManager(broker_manager, msg_monitor,
                                               "TEST RUNNER", use_toxiproxy)
            consumer_manager.add_consumers(consumer_count, test_number,
                                           queue_name, prefetch)
            consumer_manager.start_consumers()

        if publisher_count == 1:
            publisher = RabbitPublisher(1, test_number, broker_manager,
                                        in_flight_max, 120, print_mod)
            publisher.configure_sequence_direct(queue_name, count, 0,
                                                sequence_count)

            pub_thread = threading.Thread(target=publisher.start_publishing)
            pub_thread.start()
            console_out("publisher started", "TEST RUNNER")

        if include_con_actions or include_chaos:
            init_wait_sec = 20
            console_out(
                f"Will start chaos and consumer actions in {init_wait_sec} seconds",
                "TEST RUNNER")
            time.sleep(init_wait_sec)

        if include_chaos:
            chaos_thread = threading.Thread(
                target=chaos.start_random_single_action_and_repair,
                args=(chaos_min_interval, chaos_max_interval))
            chaos_thread.start()
            console_out("Chaos executor started", "TEST RUNNER")

        if include_con_actions:
            consumer_action_thread = threading.Thread(
                target=consumer_manager.start_random_consumer_actions,
                args=(con_action_min_interval, con_action_max_interval,
                      consumer_hard_close))
            consumer_action_thread.start()
            console_out("Consumer actions started", "TEST RUNNER")

        ctr = 0
        run_seconds = run_minutes * 60
        while ctr < run_seconds and not stop_please:
            try:
                time.sleep(1)
                ctr += 1

                if ctr % 60 == 0:
                    console_out(
                        f"Test at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left",
                        "TEST RUNNER")
            except KeyboardInterrupt:
                console_out(
                    f"Test forced to stop at {int(ctr/60)} minute mark, {int((run_seconds-ctr)/60)} minutes left)",
                    "TEST RUNNER")
                break

        try:
            chaos.stop_random_single_action_and_repair()

            if consumer_count > 0:
                consumer_manager.stop_random_consumer_actions()

            if include_chaos:
                chaos_thread.join(30)

            if include_con_actions:
                consumer_action_thread.join(30)
        except Exception as e:
            console_out("Failed to stop chaos cleanly: " + str(e),
                        "TEST RUNNER")

        if publisher_count > 0:
            publisher.stop_publishing()

        if consumer_count > 0:
            console_out("Resuming consumers", "TEST RUNNER")
            consumer_manager.resume_all_consumers()

            console_out("Starting grace period for consumer to catch up",
                        "TEST RUNNER")
            ctr = 0

            try:
                while ctr < grace_period_sec:
                    if (publisher_count > 0
                            and msg_monitor.get_unique_count() >= publisher.get_pos_ack_count()
                            and len(publisher.get_msg_set().difference(msg_monitor.get_msg_set())) == 0):
                        break
                    time.sleep(1)
                    ctr += 1
            except KeyboardInterrupt:
                console_out("Grace period ended", "TEST RUNNER")

        console_out("RESULTS ----------------------------------------",
                    "TEST RUNNER")
        if publisher_count > 0:
            confirmed_set = publisher.get_msg_set()
            not_consumed_msgs = confirmed_set.difference(
                msg_monitor.get_msg_set())
            console_out(
                f"Confirmed count: {publisher.get_pos_ack_count()} Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")
        else:
            not_consumed_msgs = set()
            console_out(
                f"Received count: {msg_monitor.get_receive_count()} Unique received: {msg_monitor.get_unique_count()}",
                "TEST RUNNER")

        success = True
        if consumer_count > 0:
            if len(not_consumed_msgs) > 0:
                if sac_enabled:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume or failure to promote Waiting to Active. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                else:
                    console_out(
                        f"FAILED TEST: Potential message loss or failure of consumers to consume. Not consumed count: {len(not_consumed_msgs)}",
                        "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Potential Message Loss. {len(not_consumed_msgs)} messsages."
                )
                failed_tests.add(test_number)

                lost_ctr = 0
                sorted_msgs = list(not_consumed_msgs)
                sorted_msgs.sort()
                for msg in sorted_msgs:
                    console_out(f"Lost? {msg}", "TEST RUNNER")
                    lost_ctr += 1
                    if lost_ctr > 500:
                        console_out("More than 500, truncated list",
                                    "TEST RUNNER")
                        break

                success = False

            if msg_monitor.get_out_of_order() == True:
                success = False
                console_out(f"FAILED TEST: Received out-of-order messages",
                            "TEST RUNNER")
                failed_test_log.append(
                    f"Test {test_number} FAILURE: Received out-of-order messages"
                )
                failed_tests.add(test_number)

        if success:
            console_out("TEST OK", "TEST RUNNER")

        console_out("RESULTS END ------------------------------------",
                    "TEST RUNNER")

        try:
            if consumer_count > 0:
                consumer_manager.stop_all_consumers()

            if publisher_count == 1:
                pub_thread.join(30)
            msg_monitor.stop_consuming()
            monitor_thread.join(30)
        except Exception as e:
            console_out_exception("Failed to clean up test correctly.", e,
                                  "TEST RUNNER")

        broker_manager.zip_log_files(test_name, test_number)
        console_out(f"TEST {str(test_number )} COMPLETE", "TEST RUNNER")

    console_out("", "TEST RUNNER")
    console_out("SUMMARY", "TEST RUNNER")
    console_out(f"OK {tests - len(failed_tests)} FAIL {len(failed_tests)}",
                "TEST RUNNER")
    for line in failed_test_log:
        console_out(line, "TEST RUNNER")

    console_out("TEST RUN COMPLETE", "TEST RUNNER")
Example No. 17
    def consume(self):
        self.terminate = False
        self.hard_close = False
        self.last_msg = ""
        while True:
            try:
                if self.terminate == True:
                    break

                if self.connection is None or self.connection.is_closed or self.channel is None or self.channel.is_closed:
                    if self.reconnect() == False:
                        self.wait_for(5)
                        continue

                self.consumer_tag = self.channel.basic_consume(
                    self.queue_name, self.callback)

                console_out(
                    f"Consuming queue: {self.queue_name} with consumer tag: {self.consumer_tag}",
                    self.get_actor())

                self.set_actor()
                self.channel.start_consuming()
            except pika.exceptions.ConnectionClosed as e:
                if self.terminate == True:
                    break

                console_out_exception(
                    f"Connection was closed. Last msg acked: {self.last_msg}",
                    e, self.get_actor())
                self.wait_for(5)
                continue
            except pika.exceptions.AMQPChannelError as e:
                if self.terminate == True:
                    break

                console_out_exception(
                    f"Caught a channel error. Last msg acked: {self.last_msg}",
                    e, self.get_actor())
                self.wait_for(5)
                if self.disconnect():
                    self.connected_node = "none"
                    continue
                else:
                    self.terminate = True
                    console_out("Aborting consumer", self.get_actor())
                    break
            except pika.exceptions.AMQPConnectionError as e:
                if self.terminate == True:
                    break

                console_out_exception(
                    f"Connection error. Last msg acked: {self.last_msg}", e,
                    self.get_actor())
                self.wait_for(5)
                continue
            except Exception as e:
                if self.terminate == True:
                    break

                console_out_exception(
                    f"Unexpected error. Last msg acked: {self.last_msg}", e,
                    self.get_actor())
                self.wait_for(5)
                if self.disconnect():
                    self.connected_node = "none"
                    continue
                else:
                    self.terminate = True
                    console_out("Aborting consumer", self.get_actor())
                    break
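The consume loop above registers self.callback with basic_consume, but the callback itself is not part of this listing. A minimal sketch of a compatible handler, assuming pika's standard (channel, method, properties, body) callback signature and manual acknowledgement; the real consumer's callback presumably does more (for example, reporting each message to the message monitor):

    def callback(self, channel, method, properties, body):
        # Hypothetical handler: remember the message, then ack it on the delivering channel.
        self.last_msg = body.decode("utf-8", errors="replace")
        channel.basic_ack(delivery_tag=method.delivery_tag)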