def deploy(self, cluster_size, new_cluster):
    if new_cluster:
        if self.use_blockade:
            subprocess.call([
                "bash", "../cluster/deploy-blockade-cluster.sh",
                cluster_size, self.image_version
            ])
            console_out("Waiting for cluster to establish itself...", "TEST RUNNER")
            time.sleep(30)
            console_out("Cluster status:", "TEST RUNNER")
            subprocess.call(["bash", "../cluster/cluster-status.sh"])
        else:
            subprocess.call([
                "bash", "../cluster/deploy-compose-cluster.sh",
                cluster_size, self.image_version
            ])
            console_out("Waiting for cluster to establish itself...", "TEST RUNNER")
            time.sleep(30)
            console_out("Cluster status:", "TEST RUNNER")
            subprocess.call(["bash", "../cluster/cluster-status-dc.sh"])

        self.load_initial_nodes()
        self.correct_advertised_listeners()
    else:
        console_out("Using existing cluster...", "TEST RUNNER")
        if self.use_blockade:
            subprocess.call(["bash", "../cluster/cluster-status.sh"])
        else:
            subprocess.call(["bash", "../cluster/cluster-status-dc.sh"])

        self.load_initial_nodes()
def create_topic(self, broker, topic_name, replication_factor, partitions, min_insync_reps, unclean_failover):
    try:
        if self.image_version == "confluent":
            subprocess.call([
                "bash", "../cluster/cp-create-topic.sh", broker, topic_name,
                str(replication_factor), str(partitions),
                str(min_insync_reps), str(unclean_failover)
            ])
        elif self.image_version == "wurstmeister":
            subprocess.call([
                "bash", "../cluster/wm-create-topic.sh", broker, topic_name,
                str(replication_factor), str(partitions),
                str(min_insync_reps), str(unclean_failover)
            ])
        else:
            raise ValueError("Unsupported Kafka image")

        return True
    except Exception as e:
        console_out(f"Could not create topic: {e}", "TEST RUNNER")
        return False
def start_producing(self, topic, msg_count):
    console_out(f"{self.key_count} sequences of a possible {len(self.keys)} to be sent", self.get_actor())
    for msg_index in range(0, msg_count):
        if self.terminate:
            break

        self.producer.poll(0)

        # use a per-message target topic so the partitioned-sequence suffix is not
        # appended to the same variable again on every iteration
        target_topic = topic
        if self.message_type == "partitioned-sequence":
            target_topic = f"{topic}-{self.keys[self.key_index]}"
            body = f"{self.keys[self.key_index]}={self.val}"
        elif self.message_type == "sequence":
            body = f"{self.keys[self.key_index]}={self.val}"
        else:
            body = str(uuid.uuid4())

        # apply back-pressure while too many messages are awaiting acknowledgement
        while len(self.pending_ack) > self.in_flight_limit:
            time.sleep(0.2)
            if self.terminate:
                break
            self.producer.poll(0)

        self.producer.produce(target_topic,
                              value=body.encode('utf-8'),
                              key=self.keys[self.key_index],
                              callback=self.delivery_report)
        self.pending_ack.add(body)
        self.curr_pos += 1
        self.key_index += 1
        if self.key_index == self.key_count:
            self.key_index = 0
            self.val += 1
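# A minimal sketch of the delivery_report callback referenced above, assuming the
# standard confluent_kafka Producer callback signature (err, msg). The exact body
# is an assumption; the real implementation may also record offsets or duplicates.
def delivery_report(self, err, msg):
    value = msg.value().decode('utf-8')
    if err is not None:
        console_out(f"Delivery failed for {value}: {err}", self.get_actor())
    else:
        # the message is no longer in flight, so release it from the back-pressure set
        self.pending_ack.discard(value)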
def correct_advertised_listeners(self):
    for broker in self.init_live_nodes:
        broker_ip = self.get_node_ip(broker)
        kf_index = broker.index("kafka")
        broker_id = int(broker[kf_index + 5:kf_index + 6])
        port = self.get_node_port(broker)
        try:
            if self.image_version == "confluent":
                subprocess.call([
                    "bash", "../cluster/cp-correct-adv-listener.sh",
                    broker, str(broker_id), broker_ip, port
                ])
            elif self.image_version == "wurstmeister":
                subprocess.call([
                    "bash", "../cluster/wm-correct-adv-listener.sh",
                    broker, str(broker_id), broker_ip
                ])
            else:
                raise ValueError("Unsupported Kafka image")
        except Exception as e:
            console_out(f"Could not correct advertised listener: {e}", "TEST RUNNER")
            return False

    return True
def run_benchmark(self, unique_conf, common_conf, playlist_entry, policies, run_ordinal):
    status_id = unique_conf.technology + unique_conf.node_number

    federation_args = ""
    if common_conf.federation_enabled:
        # the downstream cluster is assumed to be deployed at an offset of 100
        # from the upstream node number
        ds_node_number = int(unique_conf.node_number) + 100
        ds_broker_ips = self.get_broker_ips(unique_conf.technology, ds_node_number,
                                            unique_conf.cluster_size, common_conf.run_tag)
        federation_args += f"--downstream-broker-hosts {ds_broker_ips}"

    script = "run-logged-aws-benchmark.sh"
    # TODO: make these contexts not use hard-coded regions and cluster names
    if unique_conf.deployment == "eks":
        context = f"{unique_conf.deployment_user}@benchmarking-eks.eu-west-1.eksctl.io"
        script = "run-logged-aws-k8s-benchmark.sh"
    elif unique_conf.deployment == "gke":
        context = f"gke_{unique_conf.deployment_user}_europe-west4-a_benchmarking-gke"
        script = "run-logged-aws-k8s-benchmark.sh"
    else:
        context = "none"

    cluster_name = f"rmq-{unique_conf.deployment}"

    self._benchmark_status[status_id] = "started"
    exit_code = subprocess.call([
        "bash", script,
        unique_conf.node_number, common_conf.key_pair, unique_conf.technology,
        unique_conf.broker_version, unique_conf.instance, unique_conf.volume1_type,
        unique_conf.filesystem, common_conf.hosting, unique_conf.tenancy,
        common_conf.password, common_conf.postgres_url, common_conf.postgres_user,
        common_conf.postgres_pwd, playlist_entry.topology, common_conf.run_id,
        common_conf.username, common_conf.password, common_conf.run_tag,
        unique_conf.core_count, unique_conf.threads_per_core, unique_conf.config_tag,
        str(unique_conf.cluster_size), unique_conf.no_tcp_delay, policies,
        str(common_conf.override_step_seconds), str(common_conf.override_step_repeat),
        str(common_conf.override_step_msg_limit), common_conf.override_broker_hosts,
        unique_conf.pub_connect_to_node, unique_conf.con_connect_to_node,
        str(unique_conf.pub_heartbeat_sec), str(unique_conf.con_heartbeat_sec),
        common_conf.mode, str(common_conf.grace_period_sec), common_conf.warmUpSeconds,
        common_conf.checks, str(run_ordinal), common_conf.tags, common_conf.attempts,
        common_conf.influx_subpath, playlist_entry.get_topology_variables(),
        playlist_entry.get_policy_variables(), federation_args, context, cluster_name,
        unique_conf.memory_gb
    ])

    if exit_code != 0:
        console_out(self.actor, f"Benchmark {unique_conf.node_number} failed")
        self._benchmark_status[status_id] = "failed"
    else:
        self._benchmark_status[status_id] = "success"
def run_background_load(self, unique_conf, common_conf, topology, policies, step_seconds, step_repeat, delay_seconds):
    if delay_seconds > 0:
        console_out(self.actor,
                    f"Delaying start of background load by {delay_seconds} seconds for {unique_conf.node_number}")
        time.sleep(delay_seconds)

    console_out(self.actor, f"Starting background load for {unique_conf.node_number}")
    status_id = unique_conf.technology + unique_conf.node_number

    broker_user = "******"
    broker_password = common_conf.password

    if policies == "":
        policies = "none"

    subprocess.call([
        "bash", "run-background-load-aws.sh",
        broker_user, broker_password, str(unique_conf.cluster_size),
        common_conf.key_pair, unique_conf.node_number, policies,
        str(step_seconds), str(step_repeat), common_conf.run_tag,
        unique_conf.technology, topology, unique_conf.broker_version
    ])
def stop_start_consumer(self, con_index, hard_close):
    con = self.consumers[con_index]
    try:
        if self.use_toxiproxy:
            console_out(
                f"SIMULATING CRASH OF CONSUMER {con_index+1} --------------------------------------",
                self.actor)
            self.broker_manager.disable_consumer_proxy(con.get_consumer_id())
            time.sleep(1)
            con.perform_hard_close()
            time.sleep(1)
            self.broker_manager.enable_consumer_proxy(con.get_consumer_id())
        else:
            console_out(
                f"STOPPING CONSUMER {con_index+1} --------------------------------------",
                self.actor)
            if hard_close:
                con.perform_hard_close()
            else:
                con.stop_consuming()

        self.consumer_threads[con_index].join(15)
        con.connect()
        self.consumer_threads[con_index] = threading.Thread(target=con.consume)
        self.consumer_threads[con_index].start()
    except Exception as e:
        console_out_exception("Failed to stop/start consumer correctly", e, self.actor)
def on_connection_closed(self, connection, reason):
    self.channel = None
    if self.stopping:
        self.connection.ioloop.stop()
    else:
        console_out(f"Connection closed. Reason: {reason}. Reopening in 5 seconds.", self.get_actor())
        self.connection.ioloop.call_later(5, self.connection.ioloop.stop)
def parallel_deploy(self, configurations, common_conf):
    d_threads = list()
    for config_tag in configurations:
        unique_conf_list = configurations[config_tag]
        for i in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[i]

            if common_conf.no_deploy:
                deploy = threading.Thread(target=self.update_single, args=(unique_conf, common_conf,))
            else:
                deploy = threading.Thread(target=self.deploy_rabbitmq_cluster, args=(unique_conf, common_conf,))
            # if unique_conf.cluster_size == 1:
            #     deploy = threading.Thread(target=self.deploy_single, args=(unique_conf, common_conf,))
            # else:
            #     deploy = threading.Thread(target=self.deploy_rabbitmq_cluster, args=(unique_conf, common_conf,))

            d_threads.append(deploy)

    for dt in d_threads:
        dt.start()

    for dt in d_threads:
        dt.join()

    for config_tag in configurations:
        unique_conf_list = configurations[config_tag]
        for p in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[p]
            status_id1 = unique_conf.technology + unique_conf.node_number

            if self._deploy_status[status_id1] != "success":
                console_out(self.actor,
                            f"Deployment failed for node {unique_conf.technology}{unique_conf.node_number}")
                if not common_conf.no_deploy:
                    self.teardown_all(configurations, common_conf, False)
def deploy_joinee(self, status_id, unique_conf, common_conf, node, node_range_start, node_range_end):
    exit_code = subprocess.call([
        "bash", "deploy-rmq-cluster-broker.sh",
        unique_conf.broker_version, unique_conf.core_count, unique_conf.filesystem,
        unique_conf.generic_unix_url, common_conf.influx_subpath, unique_conf.instance,
        common_conf.key_pair, common_conf.log_level, str(node), str(node_range_end),
        str(node_range_start), "joinee", common_conf.run_tag, common_conf.broker_sg,
        common_conf.subnet, unique_conf.tenancy, unique_conf.threads_per_core,
        unique_conf.vars_file, unique_conf.data_volume, unique_conf.logs_volume,
        unique_conf.quorum_volume, unique_conf.wal_volume, unique_conf.volume1_size,
        unique_conf.volume1_mountpoint, unique_conf.volume2_size, unique_conf.volume2_mountpoint,
        unique_conf.volume3_size, unique_conf.volume3_mountpoint
    ], cwd="../deploy/aws")

    if exit_code != 0:
        console_out(self.actor,
                    f"deploy of joinee rabbitmq{node} failed with exit code {exit_code}")
        self._deploy_status[status_id] = "failed"
def configure_large_msgs_direct(self, queue, count, dup_rate, msg_size):
    self.large_msg = self.repeat_to_length("1234567890", msg_size)
    self.routing_key = queue
    self.exchanges = [""]
    console_out(f"Will publish large messages to queue {queue}", self.get_actor())
    self.configure(count, dup_rate, "large-msgs")
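# A minimal sketch of the repeat_to_length helper used above; this exact body is an
# assumption, but any implementation only needs to tile the pattern out to msg_size
# characters and truncate.
def repeat_to_length(self, string_to_expand, length):
    # repeat the pattern enough times to cover the target length, then cut it off
    return (string_to_expand * (length // len(string_to_expand) + 1))[:length]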
def start_consumers(self):
    for con_id in range(1, len(self.consumers) + 1):
        con_thread = threading.Thread(target=self.consumers[con_id - 1].start_consuming)
        con_thread.start()
        self.consumer_threads.append(con_thread)
        console_out(f"consumer {con_id} started", self.actor)
def restart_all_brokers(self, configurations, common_conf):
    r_threads = list()
    for config_tag in configurations:
        console_out(self.actor, f"BROKER RESTART FOR configuration {config_tag}")
        unique_conf_list = configurations[config_tag]

        # iterate over configurations
        for p in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[p]

            # iterate over nodes of this configuration
            for n in range(unique_conf.cluster_size):
                node = int(unique_conf.node_number) + n
                restart = threading.Thread(target=self.restart_broker,
                                           args=(unique_conf.technology, str(node), common_conf))
                r_threads.append(restart)

    for rt in r_threads:
        rt.start()

    for rt in r_threads:
        rt.join()

    for config_tag in configurations:
        unique_conf_list = configurations[config_tag]
        for p in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[p]
            for n in range(unique_conf.cluster_size):
                node = int(unique_conf.node_number) + n
                status_id = f"{unique_conf.technology}{node}"

                if self._action_status[status_id] != "success":
                    console_out(self.actor,
                                f"Broker restart failed for node {unique_conf.technology}{node}")
                    if not common_conf.no_deploy:
                        self._deployer.teardown_all(configurations, common_conf, False)
def deploy_single(self, unique_conf, common_conf):
    status_id = unique_conf.technology + unique_conf.node_number
    self._deploy_status[status_id] = "started"

    volume_type = unique_conf.volume.split("-")[1]

    exit_code = subprocess.call([
        "bash", "deploy-single-broker.sh",
        common_conf.ami, unique_conf.broker_version, unique_conf.core_count,
        unique_conf.filesystem, unique_conf.generic_unix_url, unique_conf.instance,
        common_conf.key_pair, common_conf.loadgen_instance, common_conf.loadgen_sg,
        common_conf.log_level, unique_conf.node_number, common_conf.run_tag,
        common_conf.broker_sg, common_conf.subnet, unique_conf.technology,
        unique_conf.tenancy, unique_conf.threads_per_core, unique_conf.vars_file,
        unique_conf.volume_size, volume_type
    ], cwd="../deploy/aws")

    if exit_code != 0:
        console_out(self.actor,
                    f"deploy {unique_conf.node_number} failed with exit code {exit_code}")
        self._deploy_status[status_id] = "failed"
    else:
        self._deploy_status[status_id] = "success"
def on_channel_closed(self, channel, reason):
    console_out(f"Channel {channel} was closed. Reason: {reason}", self.get_actor())
    self._channel = None
    if not self._stopping:
        if self._connection.is_open:
            self._connection.close()
def stop_one_broker(self, configurations, common_conf):
    r_threads = list()
    for config_tag in configurations:
        console_out(self.actor, f"BROKER SHUTDOWN FOR configuration {config_tag}")
        unique_conf_list = configurations[config_tag]

        # iterate over configurations
        for p in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[p]
            restart = threading.Thread(target=self.stop_broker,
                                       args=(unique_conf.technology, str(unique_conf.node_number), common_conf))
            r_threads.append(restart)

    for rt in r_threads:
        rt.start()

    for rt in r_threads:
        rt.join()

    for config_tag in configurations:
        unique_conf_list = configurations[config_tag]
        for p in range(len(unique_conf_list)):
            unique_conf = unique_conf_list[p]
            status_id = f"{unique_conf.technology}{unique_conf.node_number}"

            if self._action_status[status_id] != "success":
                console_out(self.actor,
                            f"Broker shutdown failed for node {unique_conf.technology}{unique_conf.node_number}")
                if not common_conf.no_deploy:
                    self._deployer.teardown_all(configurations, common_conf, False)
def start_publishers(self):
    for prod_id in range(1, len(self.publishers) + 1):
        pub_thread = threading.Thread(target=self.publishers[prod_id - 1].start_publishing)
        pub_thread.start()
        self.publisher_threads.append(pub_thread)
        console_out(f"Publisher {prod_id} started", self.actor)
def create_standard_queue(self, mgmt_node, queue_name, replication_factor):
    try:
        mgmt_node_ip = self.get_mgmt_node_ip(mgmt_node)
        queue_node = "rabbit@" + mgmt_node

        r = requests.put(
            'http://' + mgmt_node_ip + ':15672/api/queues/%2F/' + queue_name,
            data="{\"auto_delete\":false,\"durable\":true,\"arguments\":{\"x-single-active-consumer\": false},\"node\":\"" + queue_node + "\"}",
            auth=('jack', 'jack'))

        r = requests.put(
            'http://' + mgmt_node_ip + ':15672/api/policies/%2F/ha-queues',
            data="{\"pattern\":\"" + queue_name + "\", \"definition\": {\"ha-mode\":\"exactly\", \"ha-params\": " + str(replication_factor) + ",\"ha-sync-mode\":\"automatic\" }, \"priority\":0, \"apply-to\": \"queues\"}",
            auth=('jack', 'jack'))

        console_out(f"Created {queue_name} with response code {r.status_code}", "TEST RUNNER")
        return r.status_code == 201 or r.status_code == 204
    except Exception as e:
        console_out("Could not create queue. Will retry. " + str(e), "TEST RUNNER")
        return False
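# A hypothetical caller-side retry loop for create_standard_queue, illustrating how
# its boolean return value is intended to be used; the attempt limit and pause are
# assumptions, not values taken from the original code.
def create_standard_queue_with_retry(self, mgmt_node, queue_name, replication_factor):
    for attempt in range(20):
        if self.create_standard_queue(mgmt_node, queue_name, replication_factor):
            return True
        # the broker or its management API may not be ready yet, so back off and retry
        time.sleep(5)
    return False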
def run_background_load(self, unique_conf, common_conf):
    console_out(self.actor, f"Starting background load for {unique_conf.node_number}")
    status_id = unique_conf.technology + unique_conf.node_number

    broker_user = "******"
    broker_password = common_conf.password
    topology = common_conf.background_topology_file
    policies = common_conf.background_policies_file
    step_seconds = str(common_conf.background_step_seconds)
    step_repeat = str(common_conf.background_step_repeat)

    nodes = ""
    for x in range(int(unique_conf.cluster_size)):
        comma = ","
        if x == 0:
            comma = ""

        node_number = int(unique_conf.node_number) + x
        nodes = f"{nodes}{comma}{node_number}"

    self._benchmark_status[status_id] = "started"
    subprocess.Popen([
        "bash", "run-background-load-aws.sh",
        broker_user, broker_password, str(unique_conf.cluster_size),
        common_conf.key_pair, unique_conf.node_number, nodes, policies,
        step_seconds, step_repeat, common_conf.run_tag,
        unique_conf.technology, topology, unique_conf.broker_version
    ])
def publish_msg_with_existing_conn(self, send_to_exchange, rk, body):
    mandatory = False
    if self.use_confirms:
        mandatory = True

    corr_id = str(uuid.uuid4())
    try:
        self.channel.basic_publish(exchange=send_to_exchange,
                                   routing_key=rk,
                                   body=body,
                                   mandatory=mandatory,
                                   properties=pika.BasicProperties(
                                       content_type='text/plain',
                                       delivery_mode=2,
                                       correlation_id=corr_id))
        self.pos_acks += 1
    except exceptions.UnroutableError:
        self.undeliverable += 1
        if self.undeliverable % 100 == 0:
            console_out(f"{str(self.undeliverable)} messages could not be delivered",
                        self.get_actor())
    except exceptions.NackError:
        self.neg_acks += 1
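# For UnroutableError and NackError to be raised by basic_publish on a pika
# BlockingConnection channel, publisher confirms must be enabled on that channel
# beforehand. A minimal sketch of that setup step; the method name and where it is
# called from are assumptions, not taken from the original code.
def enable_publisher_confirms(self):
    if self.use_confirms:
        # switch the channel to confirm mode so publishes are acked/nacked synchronously
        self.channel.confirm_delivery()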
def restart_broker(self, technology, node, common_conf):
    status_id = technology + node
    node_name = f"{common_conf.run_tag}-rmq{node}-server"

    command_args = [
        "gcloud", "compute", "ssh", node_name, "--",
        "docker exec $(docker container ls | awk '/rabbitmq/ { print $1 }') rabbitmqctl -l stop_app"
    ]
    result = subprocess.run(command_args)
    if result.returncode != 0:
        console_out(self.actor,
                    f"Restart (1/2) of broker on node {node} failed with exit code {result.returncode}")
        self._action_status[status_id] = "failed"
        return

    command_args = [
        "gcloud", "compute", "ssh", node_name, "--",
        "docker restart --time 30 $(docker container ls | awk '/rabbitmq/ { print $1 }')"
    ]
    result = subprocess.run(command_args)
    if result.returncode != 0:
        console_out(self.actor,
                    f"Restart (2/2) of broker on node {node} failed with exit code {result.returncode}")
        self._action_status[status_id] = "failed"
        return

    self._action_status[status_id] = "success"
def reconnect(self): self.connection = None self.channel = None console_out("Connection is closed. Opening new connection", self.get_actor()) self.broker_manager.next_node(self.consumer_id) return self.connect()
def on_undeliverable(self, channel, method, properties, body):
    body_str = str(body, "utf-8")
    self.undeliverable += 1
    if self.undeliverable % 100 == 0:
        console_out(f"{str(self.undeliverable)} messages could not be delivered",
                    self.get_actor())
def reconnect(self): self.connection = None self.channel = None console_out("Connection is closed. Opening new connection", self.get_actor()) self.next_node() return self.connect()
def get_playlist_entries(playlist_file):
    with open(playlist_file, "r") as pl_file:
        playlist_json = json.loads(pl_file.read())

    common_attr = playlist_json["commonAttributes"]
    playlist_entries = list()

    # load topologies to run and check topology and policy files exist
    for playlist_entry in playlist_json['benchmarks']:
        entry = PlaylistEntry()
        entry.topology = get_entry_mandatory_field(playlist_entry, common_attr, "topology")
        entry.topology_variables = get_variables(playlist_entry, common_attr, "topologyVariables")
        entry.policy = get_entry_mandatory_field(playlist_entry, common_attr, "policy")
        entry.policy_variables = get_variables(playlist_entry, common_attr, "policyVariables")
        entry.has_broker_actions = get_entry_optional_field(playlist_entry, common_attr, "hasBrokerActions", False)

        if entry.has_broker_actions:
            entry.broker_action = get_entry_mandatory_field(playlist_entry, common_attr, "brokerAction")
            entry.trigger_type = get_entry_mandatory_field(playlist_entry, common_attr, "triggerType")
            entry.trigger_at = get_entry_mandatory_field(playlist_entry, common_attr, "triggerAt")

        if not os.path.exists("../benchmark/topologies/" + entry.topology):
            console_out("RUNNER", f"The topology file {entry.topology} does not exist")
            exit(1)

        if len(entry.policy) > 0 and not os.path.exists("../benchmark/policies/" + entry.policy):
            console_out("RUNNER", f"The policy file {entry.policy} does not exist")
            exit(1)

        playlist_entries.append(entry)

    return playlist_entries
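# A minimal sketch of the get_entry_mandatory_field helper used above, assuming the
# intended behaviour is: a per-benchmark value overrides the commonAttributes value,
# and a field missing from both is a fatal playlist error. The real implementation
# may differ.
def get_entry_mandatory_field(playlist_entry, common_attr, field):
    if field in playlist_entry:
        return playlist_entry[field]
    if field in common_attr:
        return common_attr[field]

    console_out("RUNNER", f"The mandatory field {field} is missing from the playlist entry")
    exit(1)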
def configure_hello_msgs_to_exchanges(self, exchanges, routing_key, count, dup_rate):
    self.exchanges = exchanges
    self.routing_key = routing_key
    console_out(f"Will publish hello msgs to exchanges {exchanges}", self.get_actor())
    self.configure(count, dup_rate, "hello")
def teardown(self, technology, node, run_tag, no_destroy):
    if no_destroy:
        console_out(self.actor, "No teardown as --no-destroy set to true")
    else:
        try:
            self.__delete_instance(f"{run_tag}-rmq{node}-server")
        except Exception as e:
            console_out(self.actor, f"{e}, ignoring")
def on_channel_closed(self, channel, reply_code, reply_text):
    console_out(f"Channel {channel} was closed. Code: {reply_code} Text: {reply_text}",
                self.get_actor())
    self._channel = None
    if not self._stopping:
        if self._connection.is_open:
            self._connection.close()
def on_channel_open(self, channel):
    self._channel = channel
    self.add_on_channel_close_callback()
    console_out('Channel opened, publishing to commence', self.get_actor())
    self.reset_ack_tracking()
    self.seq_no = 0
    self.start_publishing()
def interuppt_handler(signum, frame):
    global stop_please, stop_requests
    console_out("STOP REQUESTED", "TEST RUNNER")
    stop_please = True
    stop_requests += 1
    if stop_requests >= 2:
        sys.exit(-2)
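# A minimal usage sketch showing how the handler above would typically be registered
# with the signal module, so the first Ctrl+C requests a graceful stop and a second
# one forces an exit; the registration point is an assumption, not taken from the
# original runner.
import signal

signal.signal(signal.SIGINT, interuppt_handler)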