def create_user_and_wait(redpanda, admin: Admin, creds: SaslCredentials):
    admin.create_user(*creds)

    def user_exists_everywhere():
        for node in redpanda.nodes:
            users = redpanda._admin.list_users(node=node)
            if creds.username not in users:
                redpanda.logger.info(f"{creds.username} not in {users}")
                return False
        return True

    # It should only take milliseconds for the raft0 write to propagate
    wait_until(user_exists_everywhere, timeout_sec=5, backoff_sec=0.5)
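# A minimal usage sketch of the helper above, as it might appear inside a
# test method. The surrounding harness (self.redpanda, Admin, wait_until)
# is assumed; per admin.create_user(*creds) above, SaslCredentials is taken
# to unpack as (username, password, mechanism).
creds = SaslCredentials("alice", "alice-secret", "SCRAM-SHA-256")
admin = Admin(self.redpanda)

# Blocks until every node's admin API reports the new user, or raises once
# the 5 second wait_until timeout above expires.
create_user_and_wait(self.redpanda, admin, creds)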
def prepare_users(self):
    """
    Create users and ACLs

    TODO:
      - wait for users to propagate
    """
    admin = Admin(self.redpanda)
    client = self.get_super_client()

    # base case: the user is not a superuser and has no configured ACLs
    admin.create_user("base", self.password, self.algorithm)

    # the cluster_describe user is only granted cluster describe permission
    admin.create_user("cluster_describe", self.password, self.algorithm)
    client.create_cluster_acls("cluster_describe", "describe")
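# The TODO above could be closed with the same polling pattern used by
# create_user_and_wait: a sketch, reusing the Admin.list_users(node=...) call
# shown earlier, intended to run at the end of prepare_users().
def users_propagated():
    for node in self.redpanda.nodes:
        users = admin.list_users(node=node)
        if "base" not in users or "cluster_describe" not in users:
            return False
    return True

wait_until(users_propagated, timeout_sec=10, backoff_sec=1)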
class RedpandaService(Service): PERSISTENT_ROOT = "/var/lib/redpanda" DATA_DIR = os.path.join(PERSISTENT_ROOT, "data") CONFIG_FILE = "/etc/redpanda/redpanda.yaml" STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "redpanda.log") WASM_STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "wasm_engine.log") COVERAGE_PROFRAW_CAPTURE = os.path.join(PERSISTENT_ROOT, "redpanda.profraw") CLUSTER_NAME = "my_cluster" READY_TIMEOUT_SEC = 10 LOG_LEVEL_KEY = "redpanda_log_level" DEFAULT_LOG_LEVEL = "info" SUPERUSER_CREDENTIALS = ("admin", "admin", "SCRAM-SHA-256") COV_KEY = "enable_cov" DEFAULT_COV_OPT = False logs = { "redpanda_start_stdout_stderr": { "path": STDOUT_STDERR_CAPTURE, "collect_default": True }, "wasm_engine_start_stdout_stderr": { "path": WASM_STDOUT_STDERR_CAPTURE, "collect_default": True }, "code_coverage_profraw_file": { "path": COVERAGE_PROFRAW_CAPTURE, "collect_default": True } } def __init__(self, context, num_brokers, client_type, enable_rp=True, extra_rp_conf=None, enable_pp=False, enable_sr=False, topics=None, num_cores=3): super(RedpandaService, self).__init__(context, num_nodes=num_brokers) self._context = context self._client_type = client_type self._enable_rp = enable_rp self._extra_rp_conf = extra_rp_conf or dict() self._enable_pp = enable_pp self._enable_sr = enable_sr self._log_level = self._context.globals.get(self.LOG_LEVEL_KEY, self.DEFAULT_LOG_LEVEL) self._topics = topics or () self._num_cores = num_cores self._admin = Admin(self) self._started = [] # client is intiialized after service starts self._client = None self.config_file_lock = threading.Lock() def sasl_enabled(self): return self._extra_rp_conf and self._extra_rp_conf.get( "enable_sasl", False) def start(self, nodes=None, clean_nodes=True): """Start the service on all nodes.""" to_start = nodes if nodes is not None else self.nodes assert all((node in self.nodes for node in to_start)) self.logger.info("%s: starting service" % self.who_am_i()) if self._start_time < 0: # Set self._start_time only the first time self.start is invoked self._start_time = time.time() self.logger.debug( self.who_am_i() + ": killing processes and attempting to clean up before starting") for node in to_start: try: self.stop_node(node) except Exception: pass try: if clean_nodes: self.clean_node(node) else: self.logger.debug("%s: skip cleaning node" % self.who_am_i(node)) except Exception as e: self.logger.exception( f"Error cleaning data files on {node.account.hostname}:") raise for node in to_start: self.logger.debug("%s: starting node" % self.who_am_i(node)) self.start_node(node) if self._start_duration_seconds < 0: self._start_duration_seconds = time.time() - self._start_time self._admin.create_user(*self.SUPERUSER_CREDENTIALS) self.logger.info("Waiting for all brokers to join cluster") expected = set(self._started) wait_until(lambda: {n for n in self._started if self.registered(n)} == expected, timeout_sec=30, backoff_sec=1, err_msg="Cluster membership did not stabilize") self.logger.info("Verifying storage is in expected state") storage = self.storage() for node in storage.nodes: if not set(node.ns) == {"redpanda"} or not set( node.ns["redpanda"].topics) == {"controller", "kvstore"}: self.logger.error( f"Unexpected files: ns={node.ns} redpanda topics={node.ns['redpanda'].topics}" ) raise RuntimeError("Unexpected files in data directory") security_settings = dict() if self.sasl_enabled(): username, password, algorithm = self.SUPERUSER_CREDENTIALS security_settings = dict(security_protocol='SASL_PLAINTEXT', 
sasl_mechanism=algorithm, sasl_plain_username=username, sasl_plain_password=password, request_timeout_ms=30000, api_version_auto_timeout_ms=3000) self._client = KafkaAdminClient(bootstrap_servers=self.brokers_list(), **security_settings) self._create_initial_topics(security_settings) def _create_initial_topics(self, security_settings): user = security_settings.get("sasl_plain_username") passwd = security_settings.get("sasl_plain_password") client = self._client_type(self, user=user, passwd=passwd) for spec in self._topics: self.logger.debug(f"Creating initial topic {spec}") client.create_topic(spec) def start_redpanda(self, node): cmd = ( f"nohup {self.find_binary('redpanda')}" f" --redpanda-cfg {RedpandaService.CONFIG_FILE}" f" --default-log-level {self._log_level}" f" --logger-log-level=exception=debug:archival=debug:io=debug:cloud_storage=debug " f" --kernel-page-cache=true " f" --overprovisioned " f" --smp {self._num_cores} " f" --memory 6G " f" --reserve-memory 0M " f" >> {RedpandaService.STDOUT_STDERR_CAPTURE} 2>&1 &") # set llvm_profile var for code coverage # each node will create its own copy of the .profraw file # since each node creates a redpanda broker. if self.cov_enabled(): cmd = f"LLVM_PROFILE_FILE=\"{RedpandaService.COVERAGE_PROFRAW_CAPTURE}\" " + cmd node.account.ssh(cmd) def signal_redpanda(self, node, signal=signal.SIGKILL, idempotent=False): """ :param idempotent: if true, then kill-like signals are ignored if the process is already gone. """ pid = self.redpanda_pid(node) if pid is None: if idempotent and signal in {signal.SIGKILL, signal.SIGTERM}: return else: raise RuntimeError( f"Can't signal redpanda on node {node.name}, it isn't running" ) node.account.signal(pid, signal, allow_fail=False) def start_node(self, node, override_cfg_params=None): """ Start a single instance of redpanda. This function will not return until redpanda appears to have started successfully. If redpanda does not start within a timeout period the service will fail to start. Thus this function also acts as an implicit test that redpanda starts quickly. 
""" node.account.mkdirs(RedpandaService.DATA_DIR) node.account.mkdirs(os.path.dirname(RedpandaService.CONFIG_FILE)) self.write_conf_file(node, override_cfg_params) if self.coproc_enabled(): self.start_wasm_engine(node) self.start_redpanda(node) wait_until( lambda: Admin.ready(node).get("status") == "ready", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, err_msg=f"Redpanda service {node.account.hostname} failed to start", retry_on_exc=True) self._started.append(node) def coproc_enabled(self): coproc = self._extra_rp_conf.get('enable_coproc') dev_mode = self._extra_rp_conf.get('developer_mode') return coproc is True and dev_mode is True def start_wasm_engine(self, node): wcmd = (f"nohup {self.find_binary('node')}" f" {self.find_wasm_root()}/main.js" f" {RedpandaService.CONFIG_FILE} " f" >> {RedpandaService.WASM_STDOUT_STDERR_CAPTURE} 2>&1 &") self.logger.info( f"Starting wasm engine on {node.account} with command: {wcmd}") # wait until the wasm engine has finished booting up wasm_port = 43189 conf_value = self._extra_rp_conf.get('coproc_supervisor_server') if conf_value is not None: wasm_port = conf_value['port'] with node.account.monitor_log( RedpandaService.WASM_STDOUT_STDERR_CAPTURE) as mon: node.account.ssh(wcmd) mon.wait_until( f"Starting redpanda wasm service on port: {wasm_port}", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, backoff_sec=0.5, err_msg= f"Wasm engine didn't finish startup in {RedpandaService.READY_TIMEOUT_SEC} seconds", ) def monitor_log(self, node): assert node in self._started return node.account.monitor_log(RedpandaService.STDOUT_STDERR_CAPTURE) def find_wasm_root(self): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/opt/wasm" def find_binary(self, name): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/bin/{name}" def stop_node(self, node): pids = self.pids(node) for pid in pids: node.account.signal(pid, signal.SIGTERM, allow_fail=False) timeout_sec = 30 wait_until(lambda: len(self.pids(node)) == 0, timeout_sec=timeout_sec, err_msg="Redpanda node failed to stop in %d seconds" % timeout_sec) if node in self._started: self._started.remove(node) def clean_node(self, node, preserve_logs=False): node.account.kill_process("redpanda", clean_shutdown=False) if node.account.exists(RedpandaService.PERSISTENT_ROOT): if node.account.sftp_client.listdir( RedpandaService.PERSISTENT_ROOT): if not preserve_logs: node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/*") else: node.account.remove( f"{RedpandaService.PERSISTENT_ROOT}/data/*") if node.account.exists(RedpandaService.CONFIG_FILE): node.account.remove(f"{RedpandaService.CONFIG_FILE}") def remove_local_data(self, node): node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/data/*") def redpanda_pid(self, node): # we need to look for redpanda pid. 
pids() method returns pids of both # nodejs server and redpanda try: cmd = "ps ax | grep -i 'redpanda' | grep -v grep | awk '{print $1}'" for p in node.account.ssh_capture(cmd, allow_fail=True, callback=int): return p except (RemoteCommandError, ValueError): return None def pids(self, node): """Return process ids associated with running processes on the given node.""" try: cmd = "ps ax | grep -i 'redpanda\|node' | grep -v grep | awk '{print $1}'" pid_arr = [ pid for pid in node.account.ssh_capture( cmd, allow_fail=True, callback=int) ] return pid_arr except (RemoteCommandError, ValueError): return [] def started_nodes(self): return self._started def write_conf_file(self, node, override_cfg_params): node_info = {self.idx(n): n for n in self.nodes} conf = self.render("redpanda.yaml", node=node, data_dir=RedpandaService.DATA_DIR, cluster=RedpandaService.CLUSTER_NAME, nodes=node_info, node_id=self.idx(node), enable_rp=self._enable_rp, enable_pp=self._enable_pp, enable_sr=self._enable_sr, superuser=self.SUPERUSER_CREDENTIALS, sasl_enabled=self.sasl_enabled()) if self._extra_rp_conf: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda configuration options: {}".format( self._extra_rp_conf)) doc["redpanda"].update(self._extra_rp_conf) conf = yaml.dump(doc) if override_cfg_params: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda node configuration options: {}". format(override_cfg_params)) doc["redpanda"].update(override_cfg_params) conf = yaml.dump(doc) self.logger.info("Writing Redpanda config file: {}".format( RedpandaService.CONFIG_FILE)) self.logger.debug(conf) node.account.create_file(RedpandaService.CONFIG_FILE, conf) def restart_nodes(self, nodes, override_cfg_params=None): nodes = [nodes] if isinstance(nodes, ClusterNode) else nodes for node in nodes: self.stop_node(node) for node in nodes: self.start_node(node, override_cfg_params) def registered(self, node): """ Check if a newly added node is fully registered with the cluster, such that a kafka metadata request to any node in the cluster will include it. We first check the admin API to do a kafka-independent check, and then verify that kafka clients see the same thing. """ idx = self.idx(node) self.logger.debug( f"registered: checking if broker {idx} ({node.name} is registered..." ) # Query all nodes' admin APIs, so that we don't advance during setup until # the node is stored in raft0 AND has been replayed on all nodes. Otherwise # a kafka metadata request to the last node to join could return incomplete # metadata and cause strange issues within a test. admin = Admin(self) for peer in self._started: try: admin_brokers = admin.get_brokers(node=peer) except requests.exceptions.RequestException as e: # We run during startup, when admin API may not even be listening yet: tolerate # API errors but presume that if some APIs are not up yet, then node registration # is also not complete. 
self.logger.debug( f"registered: peer {peer.name} admin API not yet available ({e})" ) return False found = idx in [b['node_id'] for b in admin_brokers] if not found: self.logger.info( f"registered: node {node.name} not yet found in peer {peer.name}'s broker list ({admin_brokers})" ) return False else: self.logger.debug( f"registered: node {node.name} now visible in peer {peer.name}'s broker list ({admin_brokers})" ) client = PythonLibrdkafka(self) brokers = client.brokers() broker = brokers.get(idx, None) if broker is None: # This should never happen, because we already checked via the admin API # that the node of interest had become visible to all peers. self.logger.error( f"registered: node {node.name} not found in kafka metadata!") assert broker is not None self.logger.debug(f"registered: found broker info: {broker}") return True def controller(self): """ :return: the ClusterNode that is currently controller leader, or None if no leader exists """ for node in self.nodes: try: r = requests.request( "get", f"http://{node.account.hostname}:9644/v1/partitions/redpanda/controller/0", timeout=10) except requests.exceptions.RequestException: continue if r.status_code != 200: continue else: resp_leader_id = r.json()['leader_id'] if resp_leader_id != -1: return self.get_node(resp_leader_id) return None def node_storage(self, node): """ Retrieve a summary of storage on a node. """ def listdir(path, only_dirs=False): try: ents = node.account.sftp_client.listdir(path) except FileNotFoundError: # Perhaps the directory has been deleted since we saw it. # This is normal if doing a listing concurrently with topic deletion. return [] if not only_dirs: return ents paths = map(lambda fn: (fn, os.path.join(path, fn)), ents) def safe_isdir(path): try: return node.account.isdir(path) except FileNotFoundError: # Things that no longer exist are also no longer directories return False return [p[0] for p in paths if safe_isdir(p[1])] store = NodeStorage(RedpandaService.DATA_DIR) for ns in listdir(store.data_dir, True): if ns == '.coprocessor_offset_checkpoints': continue ns = store.add_namespace(ns, os.path.join(store.data_dir, ns)) for topic in listdir(ns.path): topic = ns.add_topic(topic, os.path.join(ns.path, topic)) for num in listdir(topic.path): partition = topic.add_partition( num, node, os.path.join(topic.path, num)) partition.add_files(listdir(partition.path)) return store def storage(self): store = ClusterStorage() for node in self._started: s = self.node_storage(node) store.add_node(s) return store def copy_data(self, dest, node): # after copying, move all files up a directory level so the caller does # not need to know what the name of the storage directory is. with tempfile.TemporaryDirectory() as d: node.account.copy_from(RedpandaService.DATA_DIR, d) data_dir = os.path.basename(RedpandaService.DATA_DIR) data_dir = os.path.join(d, data_dir) for fn in os.listdir(data_dir): shutil.move(os.path.join(data_dir, fn), dest) def data_checksum(self, node): """Run command that computes MD5 hash of every file in redpanda data directory. The results of the command are turned into a map from path to hash-size tuples.""" cmd = f"find {RedpandaService.DATA_DIR} -type f -exec md5sum -z '{{}}' \; -exec stat -c ' %s' '{{}}' \;" lines = node.account.ssh_output(cmd) lines = lines.decode().split("\n") # there is a race between `find` iterating over file names and passing # those to an invocation of `md5sum` in which the file may be deleted. # here we log these instances for debugging, but otherwise ignore them. 
found = [] for line in lines: if "No such file or directory" in line: self.logger.debug(f"Skipping file that disappeared: {line}") continue found.append(line) lines = found # the `find` command will stick a newline at the end of the results # which gets parsed as an empty line by `split` above if lines[-1] == "": lines.pop() return { tokens[1].rstrip("\x00"): (tokens[0], int(tokens[2])) for tokens in map(lambda l: l.split(), lines) } def broker_address(self, node): assert node in self._started cfg = self.read_configuration(node) return f"{node.account.hostname}:{one_or_many(cfg['redpanda']['kafka_api'])['port']}" def brokers(self, limit=None): return ",".join(self.brokers_list(limit)) def brokers_list(self, limit=None): brokers = [self.broker_address(n) for n in self._started[:limit]] random.shuffle(brokers) return brokers def schema_reg(self, limit=None): schema_reg = [ f"http://{n.account.hostname}:8081" for n in self._started[:limit] ] return ",".join(schema_reg) def metrics(self, node): assert node in self._started url = f"http://{node.account.hostname}:9644/metrics" resp = requests.get(url) assert resp.status_code == 200 return text_string_to_metric_families(resp.text) def metrics_sample(self, sample_pattern, nodes=None): """ Query metrics for a single sample using fuzzy name matching. This interface matches the sample pattern against sample names, and requires that exactly one (family, sample) match the query. All values for the sample across the requested set of nodes are returned in a flat array. None will be returned if less than one (family, sample) matches. An exception will be raised if more than one (family, sample) matches. Example: The query: redpanda.metrics_sample("under_replicated") will return an array containing MetricSample instances for each node and core/shard in the cluster. Each entry will correspond to a value from: family = vectorized_cluster_partition_under_replicated_replicas sample = vectorized_cluster_partition_under_replicated_replicas """ nodes = nodes or self.nodes found_sample = None sample_values = [] for node in nodes: metrics = self.metrics(node) for family in metrics: for sample in family.samples: if sample_pattern not in sample.name: continue if not found_sample: found_sample = (family.name, sample.name) if found_sample != (family.name, sample.name): raise Exception( f"More than one metric matched '{sample_pattern}'. Found {found_sample} and {(family.name, sample.name)}" ) sample_values.append( MetricSample(family.name, sample.name, node, sample.value, sample.labels)) if not sample_values: return None return MetricSamples(sample_values) def read_configuration(self, node): assert node in self._started with self.config_file_lock: with node.account.open(RedpandaService.CONFIG_FILE) as f: return yaml.full_load(f.read()) def shards(self): """ Fetch the max shard id for each node. """ shards_per_node = {} for node in self._started: num_shards = 0 metrics = self.metrics(node) for family in metrics: for sample in family.samples: if sample.name == "vectorized_reactor_utilization": num_shards = max(num_shards, int(sample.labels["shard"])) assert num_shards > 0 shards_per_node[self.idx(node)] = num_shards return shards_per_node def healthy(self): """ A primitive health check on all the nodes which returns True when all nodes report that no under replicated partitions exist. This should later be replaced by a proper / official start-up probe type check on the health of a node after a restart. 
""" counts = {self.idx(node): None for node in self.nodes} for node in self.nodes: metrics = self.metrics(node) idx = self.idx(node) for family in metrics: for sample in family.samples: if sample.name == "vectorized_cluster_partition_under_replicated_replicas": if counts[idx] is None: counts[idx] = 0 counts[idx] += int(sample.value) return all(map(lambda count: count == 0, counts.values())) def describe_topics(self, topics=None): """ Describe topics. Pass topics=None to describe all topics, or a pass a list of topic names to restrict the call to a set of specific topics. Sample return value: [ {'error_code': 0, 'topic': 'topic-kabn', 'is_internal': False, 'partitions': [ {'error_code': 0, 'partition': 0, 'leader': 1, 'replicas': [1], 'isr': [1], 'offline_replicas': []} } ] """ return self._client.describe_topics(topics) def partitions(self, topic): """ Return partition metadata for the topic. """ kc = KafkaCat(self) md = kc.metadata() topic = next(filter(lambda t: t["topic"] == topic, md["topics"])) def make_partition(p): index = p["partition"] leader_id = p["leader"] leader = None if leader_id == -1 else self.get_node(leader_id) replicas = [self.get_node(r["id"]) for r in p["replicas"]] return Partition(index, leader, replicas) return [make_partition(p) for p in topic["partitions"]] def create_topic(self, specs): if isinstance(specs, TopicSpec): specs = [specs] client = self._client_type(self) for spec in specs: self.logger.info(f"Creating topic {spec}") client.create_topic(spec) def delete_topic(self, name): client = self._client_type(self) self.logger.debug(f"Deleting topic {name}") client.delete_topic(name) def cov_enabled(self): return self._context.globals.get(self.COV_KEY, self.DEFAULT_COV_OPT)
class RedpandaService(Service): PERSISTENT_ROOT = "/var/lib/redpanda" DATA_DIR = os.path.join(PERSISTENT_ROOT, "data") CONFIG_FILE = "/etc/redpanda/redpanda.yaml" STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "redpanda.log") WASM_STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "wasm_engine.log") CLUSTER_NAME = "my_cluster" READY_TIMEOUT_SEC = 20 SUPERUSER_CREDENTIALS = ("admin", "admin", "SCRAM-SHA-256") logs = { "redpanda_start_stdout_stderr": { "path": STDOUT_STDERR_CAPTURE, "collect_default": True }, "wasm_engine_start_stdout_stderr": { "path": WASM_STDOUT_STDERR_CAPTURE, "collect_default": True } } def __init__(self, context, num_brokers, client_type, enable_rp=True, extra_rp_conf=None, enable_pp=False, topics=None, log_level='info'): super(RedpandaService, self).__init__(context, num_nodes=num_brokers) self._context = context self._client_type = client_type self._enable_rp = enable_rp self._extra_rp_conf = extra_rp_conf self._enable_pp = enable_pp self._log_level = log_level self._topics = topics or () self.v_build_dir = self._context.globals.get("v_build_dir", None) self._admin = Admin(self) def sasl_enabled(self): return self._extra_rp_conf and self._extra_rp_conf.get( "enable_sasl", False) def start(self): super(RedpandaService, self).start() self._admin.create_user(*self.SUPERUSER_CREDENTIALS) self.logger.info("Waiting for all brokers to join cluster") expected = set(self.nodes) wait_until(lambda: {n for n in self.nodes if self.registered(n)} == expected, timeout_sec=30, backoff_sec=1, err_msg="Cluster membership did not stabilize") # verify storage is in an expected initial state storage = self.storage() for node in storage.nodes: assert set(node.ns) == {"redpanda"} assert set(node.ns["redpanda"].topics) == {"controller", "kvstore"} self._create_initial_topics() def _create_initial_topics(self): client = self._client_type(self) for spec in self._topics: self.logger.debug(f"Creating initial topic {spec}") client.create_topic(spec) def start_node(self, node, override_cfg_params=None): node.account.mkdirs(RedpandaService.DATA_DIR) node.account.mkdirs(os.path.dirname(RedpandaService.CONFIG_FILE)) self.write_conf_file(node, override_cfg_params) if self.coproc_enabled(): self.start_wasm_engine(node) cmd = (f"nohup {self.find_binary('redpanda')}" f" --redpanda-cfg {RedpandaService.CONFIG_FILE}" f" --default-log-level {self._log_level}" f" --logger-log-level=exception=debug:archival=debug " f" --kernel-page-cache=true " f" --overprovisioned " f" --smp 3 " f" --memory 6G " f" --reserve-memory 0M " f" >> {RedpandaService.STDOUT_STDERR_CAPTURE} 2>&1 &") self.logger.info( f"Starting Redpanda service on {node.account} with command: {cmd}") # wait until redpanda has finished booting up with node.account.monitor_log( RedpandaService.STDOUT_STDERR_CAPTURE) as mon: node.account.ssh(cmd) mon.wait_until( "Successfully started Redpanda!", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, backoff_sec=0.5, err_msg= f"Redpanda didn't finish startup in {RedpandaService.READY_TIMEOUT_SEC} seconds", ) def coproc_enabled(self): coproc = self._extra_rp_conf.get('enable_coproc') dev_mode = self._extra_rp_conf.get('developer_mode') return coproc is True and dev_mode is True def start_wasm_engine(self, node): wcmd = (f"nohup {self.find_binary('node')}" f" {self.find_wasm_root()}/main.js" f" {RedpandaService.CONFIG_FILE} " f" >> {RedpandaService.WASM_STDOUT_STDERR_CAPTURE} 2>&1 &") self.logger.info( f"Starting wasm engine on {node.account} with command: {wcmd}") # wait until the wasm engine has 
finished booting up wasm_port = 43189 conf_value = self._extra_rp_conf.get('coproc_supervisor_server') if conf_value is not None: wasm_port = conf_value['port'] with node.account.monitor_log( RedpandaService.WASM_STDOUT_STDERR_CAPTURE) as mon: node.account.ssh(wcmd) mon.wait_until( f"Starting redpanda wasm service on port: {wasm_port}", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, backoff_sec=0.5, err_msg= f"Wasm engine didn't finish startup in {RedpandaService.READY_TIMEOUT_SEC} seconds", ) def find_wasm_root(self): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/opt/wasm" def find_binary(self, name): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/bin/{name}" def stop_node(self, node): pids = self.pids(node) for pid in pids: node.account.signal(pid, signal.SIGTERM, allow_fail=False) timeout_sec = 30 wait_until(lambda: len(self.pids(node)) == 0, timeout_sec=timeout_sec, err_msg="Redpanda node failed to stop in %d seconds" % timeout_sec) def clean_node(self, node): node.account.kill_process("redpanda", clean_shutdown=False) node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/*") node.account.remove(f"{RedpandaService.CONFIG_FILE}") def pids(self, node): """Return process ids associated with running processes on the given node.""" try: cmd = "ps ax | grep -i 'redpanda\|node' | grep -v grep | awk '{print $1}'" pid_arr = [ pid for pid in node.account.ssh_capture( cmd, allow_fail=True, callback=int) ] return pid_arr except (RemoteCommandError, ValueError): return [] def write_conf_file(self, node, override_cfg_params): node_info = {self.idx(n): n for n in self.nodes} conf = self.render("redpanda.yaml", node=node, data_dir=RedpandaService.DATA_DIR, cluster=RedpandaService.CLUSTER_NAME, nodes=node_info, node_id=self.idx(node), enable_rp=self._enable_rp, enable_pp=self._enable_pp, superuser=self.SUPERUSER_CREDENTIALS, sasl_enabled=self.sasl_enabled()) if self._extra_rp_conf: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda configuration options: {}".format( self._extra_rp_conf)) doc["redpanda"].update(self._extra_rp_conf) conf = yaml.dump(doc) if override_cfg_params: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda node configuration options: {}". format(override_cfg_params)) doc["redpanda"].update(override_cfg_params) conf = yaml.dump(doc) self.logger.info("Writing Redpanda config file: {}".format( RedpandaService.CONFIG_FILE)) self.logger.debug(conf) node.account.create_file(RedpandaService.CONFIG_FILE, conf) def restart_nodes(self, nodes): nodes = [nodes] if isinstance(nodes, ClusterNode) else nodes for node in nodes: self.stop_node(node) for node in nodes: self.start_node(node) def registered(self, node): idx = self.idx(node) self.logger.debug( f"Checking if broker {idx} ({node.name} is registered") client = PythonLibrdkafka(self) brokers = client.brokers() broker = brokers.get(idx, None) self.logger.debug(f"Found broker info: {broker}") return broker is not None def controller(self): kc = KafkaCat(self) cid = kc.metadata()["controllerid"] self.logger.debug("Controller reported with id: {}".format(cid)) if cid != -1: node = self.get_node(cid) self.logger.debug("Controller node found: {}".format(node)) return node def node_storage(self, node): """ Retrieve a summary of storage on a node. 
""" def listdir(path, only_dirs=False): ents = node.account.sftp_client.listdir(path) if not only_dirs: return ents paths = map(lambda fn: (fn, os.path.join(path, fn)), ents) return [p[0] for p in paths if node.account.isdir(p[1])] store = NodeStorage(RedpandaService.DATA_DIR) for ns in listdir(store.data_dir, True): if ns == '.coprocessor_offset_checkpoints': continue ns = store.add_namespace(ns, os.path.join(store.data_dir, ns)) for topic in listdir(ns.path): topic = ns.add_topic(topic, os.path.join(ns.path, topic)) for num in listdir(topic.path): partition = topic.add_partition( num, node, os.path.join(topic.path, num)) partition.add_files(listdir(partition.path)) return store def storage(self): store = ClusterStorage() for node in self.nodes: s = self.node_storage(node) store.add_node(s) return store def copy_data(self, dest, node): # after copying, move all files up a directory level so the caller does # not need to know what the name of the storage directory is. with tempfile.TemporaryDirectory() as d: node.account.copy_from(RedpandaService.DATA_DIR, d) data_dir = os.path.basename(RedpandaService.DATA_DIR) data_dir = os.path.join(d, data_dir) for fn in os.listdir(data_dir): shutil.move(os.path.join(data_dir, fn), dest) def data_checksum(self, node): """Run command that computes MD5 hash of every file in redpanda data directory. The results of the command are turned into a map from path to hash-size tuples.""" cmd = f"find {RedpandaService.DATA_DIR} -type f -exec md5sum '{{}}' \; -exec stat -c %s '{{}}' \;" lines = node.account.ssh_output(cmd) tokens = lines.split() return { tokens[ix + 1].decode(): (tokens[ix].decode(), int(tokens[ix + 2])) for ix in range(0, len(tokens), 3) } def broker_address(self, node): assert node in self.nodes cfg = self.read_configuration(node) return f"{node.account.hostname}:{cfg['redpanda']['kafka_api']['port']}" def brokers(self, limit=None): brokers = ",".join( map(lambda n: self.broker_address(n), self.nodes[:limit])) return brokers def metrics(self, node): assert node in self.nodes url = f"http://{node.account.hostname}:9644/metrics" resp = requests.get(url) assert resp.status_code == 200 return text_string_to_metric_families(resp.text) def read_configuration(self, node): assert node in self.nodes with node.account.open(RedpandaService.CONFIG_FILE) as f: cfg = yaml.full_load(f.read()) return cfg def shards(self): """ Fetch the max shard id for each node. """ shards_per_node = {} for node in self.nodes: num_shards = 0 metrics = self.metrics(node) for family in metrics: for sample in family.samples: if sample.name == "vectorized_reactor_utilization": num_shards = max(num_shards, int(sample.labels["shard"])) assert num_shards > 0 shards_per_node[self.idx(node)] = num_shards return shards_per_node def partitions(self, topic): """ Return partition metadata for the topic. """ kc = KafkaCat(self) md = kc.metadata() topic = next(filter(lambda t: t["topic"] == topic, md["topics"])) def make_partition(p): index = p["partition"] leader_id = p["leader"] leader = None if leader_id == -1 else self.get_node(leader_id) replicas = [self.get_node(r["id"]) for r in p["replicas"]] return Partition(index, leader, replicas) return [make_partition(p) for p in topic["partitions"]]
def prepare_cluster(self, use_tls, use_sasl):
    self.security = SecurityConfig()
    self.security.enable_sasl = use_sasl
    self.security.enable_mtls_identity = use_tls and not use_sasl

    if use_tls:
        self.tls = tls.TLSCertManager(self.logger)

        # cert for a principal with no explicitly granted permissions
        self.base_user_cert = self.tls.create_cert(socket.gethostname(),
                                                   common_name="morty",
                                                   name="base_client")

        # cert for a principal with cluster describe permissions
        self.cluster_describe_user_cert = self.tls.create_cert(
            socket.gethostname(),
            common_name="cluster_describe",
            name="cluster_describe_client")

        # cert for the admin user used to bootstrap
        self.admin_user_cert = self.tls.create_cert(
            socket.gethostname(),
            common_name="admin",
            name="test_admin_client")

        self.security.tls_provider = MTLSProvider(self.tls)

    self.redpanda.set_security_settings(self.security)
    self.redpanda.start()

    admin = Admin(self.redpanda)

    if self.security.enable_mtls_identity:
        feature_name = "mtls_authentication"
        admin.put_feature(feature_name, {"state": "active"})

        # wait for the feature to be active so that tests don't have to retry
        def check_feature_active():
            for f in admin.get_features()["features"]:
                if f["name"] == feature_name and f["state"] == "active":
                    return True
            return False

        wait_until(check_feature_active, timeout_sec=10, backoff_sec=1)

    # base case: the user is not a superuser and has no configured ACLs
    if use_sasl:
        admin.create_user("base", self.password, self.algorithm)

    # only grant cluster describe permission to the cluster_describe user
    if use_sasl:
        admin.create_user("cluster_describe", self.password, self.algorithm)
    client = self.get_super_client()
    client.acl_create_allow_cluster("cluster_describe", "describe")

    # There is no convenient interface for waiting for ACLs to propagate to
    # all nodes, so when we are using mTLS only for identity we inject a
    # sleep here to try to avoid any ACL propagation races.
    if self.security.enable_mtls_identity:
        time.sleep(5)
        return

    # wait for users to propagate to all nodes
    def users_propagated():
        for node in self.redpanda.nodes:
            users = admin.list_users(node=node)
            if "base" not in users or "cluster_describe" not in users:
                return False
        return True

    wait_until(users_propagated, timeout_sec=10, backoff_sec=1)
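# A sketch of a test entry point driving prepare_cluster(); the @cluster and
# @matrix decorators are the usual ducktape markers and are assumptions about
# the surrounding test file, as is the test body itself.
@cluster(num_nodes=3)
@matrix(use_tls=[True, False], use_sasl=[True, False])
def test_cluster_describe_permission(self, use_tls, use_sasl):
    self.prepare_cluster(use_tls, use_sasl)
    # ...exercise cluster describe as "base" (expect denial) and as
    # "cluster_describe" (expect success)...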
class RedpandaService(Service): PERSISTENT_ROOT = "/var/lib/redpanda" DATA_DIR = os.path.join(PERSISTENT_ROOT, "data") CONFIG_FILE = "/etc/redpanda/redpanda.yaml" STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "redpanda.log") WASM_STDOUT_STDERR_CAPTURE = os.path.join(PERSISTENT_ROOT, "wasm_engine.log") CLUSTER_NAME = "my_cluster" READY_TIMEOUT_SEC = 10 LOG_LEVEL_KEY = "redpanda_log_level" DEFAULT_LOG_LEVEL = "info" SUPERUSER_CREDENTIALS = ("admin", "admin", "SCRAM-SHA-256") logs = { "redpanda_start_stdout_stderr": { "path": STDOUT_STDERR_CAPTURE, "collect_default": True }, "wasm_engine_start_stdout_stderr": { "path": WASM_STDOUT_STDERR_CAPTURE, "collect_default": True } } def __init__(self, context, num_brokers, client_type, enable_rp=True, extra_rp_conf=None, enable_pp=False, enable_sr=False, topics=None, num_cores=3): super(RedpandaService, self).__init__(context, num_nodes=num_brokers) self._context = context self._client_type = client_type self._enable_rp = enable_rp self._extra_rp_conf = extra_rp_conf or dict() self._enable_pp = enable_pp self._enable_sr = enable_sr self._log_level = self._context.globals.get(self.LOG_LEVEL_KEY, self.DEFAULT_LOG_LEVEL) self._topics = topics or () self._num_cores = num_cores self._admin = Admin(self) self._started = [] # client is intiialized after service starts self._client = None self.config_file_lock = threading.Lock() def sasl_enabled(self): return self._extra_rp_conf and self._extra_rp_conf.get( "enable_sasl", False) def start(self, nodes=None, clean_nodes=True): """Start the service on all nodes.""" to_start = nodes if nodes is not None else self.nodes assert all((node in self.nodes for node in to_start)) self.logger.info("%s: starting service" % self.who_am_i()) if self._start_time < 0: # Set self._start_time only the first time self.start is invoked self._start_time = time.time() self.logger.debug( self.who_am_i() + ": killing processes and attempting to clean up before starting") for node in to_start: try: self.stop_node(node) except Exception: pass try: if clean_nodes: self.clean_node(node) else: self.logger.debug("%s: skip cleaning node" % self.who_am_i(node)) except Exception: pass for node in to_start: self.logger.debug("%s: starting node" % self.who_am_i(node)) self.start_node(node) if self._start_duration_seconds < 0: self._start_duration_seconds = time.time() - self._start_time self._admin.create_user(*self.SUPERUSER_CREDENTIALS) self.logger.info("Waiting for all brokers to join cluster") expected = set(self._started) wait_until(lambda: {n for n in self._started if self.registered(n)} == expected, timeout_sec=30, backoff_sec=1, err_msg="Cluster membership did not stabilize") self.logger.info("Verifying storage is in expected state") storage = self.storage() for node in storage.nodes: assert set(node.ns) == {"redpanda"} assert set(node.ns["redpanda"].topics) == {"controller", "kvstore"} self._create_initial_topics() security_settings = dict() if self.sasl_enabled(): username, password, algorithm = self.SUPERUSER_CREDENTIALS security_settings = dict(security_protocol='SASL_PLAINTEXT', sasl_mechanism=algorithm, sasl_plain_username=username, sasl_plain_password=password, request_timeout_ms=30000, api_version_auto_timeout_ms=3000) self._client = KafkaAdminClient(bootstrap_servers=self.brokers_list(), **security_settings) def _create_initial_topics(self): client = self._client_type(self) for spec in self._topics: self.logger.debug(f"Creating initial topic {spec}") client.create_topic(spec) def start_node(self, node, 
override_cfg_params=None): """ Start a single instance of redpanda. This function will not return until redpanda appears to have started successfully. If redpanda does not start within a timeout period the service will fail to start. Thus this function also acts as an implicit test that redpanda starts quickly. """ node.account.mkdirs(RedpandaService.DATA_DIR) node.account.mkdirs(os.path.dirname(RedpandaService.CONFIG_FILE)) self.write_conf_file(node, override_cfg_params) if self.coproc_enabled(): self.start_wasm_engine(node) cmd = (f"nohup {self.find_binary('redpanda')}" f" --redpanda-cfg {RedpandaService.CONFIG_FILE}" f" --default-log-level {self._log_level}" f" --logger-log-level=exception=debug:archival=debug " f" --kernel-page-cache=true " f" --overprovisioned " f" --smp {self._num_cores} " f" --memory 6G " f" --reserve-memory 0M " f" >> {RedpandaService.STDOUT_STDERR_CAPTURE} 2>&1 &") node.account.ssh(cmd) wait_until( lambda: Admin.ready(node).get("status") == "ready", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, err_msg=f"Redpanda service {node.account.hostname} failed to start", retry_on_exc=True) self._started.append(node) def coproc_enabled(self): coproc = self._extra_rp_conf.get('enable_coproc') dev_mode = self._extra_rp_conf.get('developer_mode') return coproc is True and dev_mode is True def start_wasm_engine(self, node): wcmd = (f"nohup {self.find_binary('node')}" f" {self.find_wasm_root()}/main.js" f" {RedpandaService.CONFIG_FILE} " f" >> {RedpandaService.WASM_STDOUT_STDERR_CAPTURE} 2>&1 &") self.logger.info( f"Starting wasm engine on {node.account} with command: {wcmd}") # wait until the wasm engine has finished booting up wasm_port = 43189 conf_value = self._extra_rp_conf.get('coproc_supervisor_server') if conf_value is not None: wasm_port = conf_value['port'] with node.account.monitor_log( RedpandaService.WASM_STDOUT_STDERR_CAPTURE) as mon: node.account.ssh(wcmd) mon.wait_until( f"Starting redpanda wasm service on port: {wasm_port}", timeout_sec=RedpandaService.READY_TIMEOUT_SEC, backoff_sec=0.5, err_msg= f"Wasm engine didn't finish startup in {RedpandaService.READY_TIMEOUT_SEC} seconds", ) def monitor_log(self, node): assert node in self._started return node.account.monitor_log(RedpandaService.STDOUT_STDERR_CAPTURE) def find_wasm_root(self): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/opt/wasm" def find_binary(self, name): rp_install_path_root = self._context.globals.get( "rp_install_path_root", None) return f"{rp_install_path_root}/bin/{name}" def stop_node(self, node): pids = self.pids(node) for pid in pids: node.account.signal(pid, signal.SIGTERM, allow_fail=False) timeout_sec = 30 wait_until(lambda: len(self.pids(node)) == 0, timeout_sec=timeout_sec, err_msg="Redpanda node failed to stop in %d seconds" % timeout_sec) def clean_node(self, node): node.account.kill_process("redpanda", clean_shutdown=False) node.account.remove(f"{RedpandaService.PERSISTENT_ROOT}/*") node.account.remove(f"{RedpandaService.CONFIG_FILE}") def pids(self, node): """Return process ids associated with running processes on the given node.""" try: cmd = "ps ax | grep -i 'redpanda\|node' | grep -v grep | awk '{print $1}'" pid_arr = [ pid for pid in node.account.ssh_capture( cmd, allow_fail=True, callback=int) ] return pid_arr except (RemoteCommandError, ValueError): return [] def started_nodes(self): return self._started def write_conf_file(self, node, override_cfg_params): node_info = {self.idx(n): n for n in self.nodes} conf = 
self.render("redpanda.yaml", node=node, data_dir=RedpandaService.DATA_DIR, cluster=RedpandaService.CLUSTER_NAME, nodes=node_info, node_id=self.idx(node), enable_rp=self._enable_rp, enable_pp=self._enable_pp, enable_sr=self._enable_sr, superuser=self.SUPERUSER_CREDENTIALS, sasl_enabled=self.sasl_enabled()) if self._extra_rp_conf: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda configuration options: {}".format( self._extra_rp_conf)) doc["redpanda"].update(self._extra_rp_conf) conf = yaml.dump(doc) if override_cfg_params: doc = yaml.full_load(conf) self.logger.debug( "Setting custom Redpanda node configuration options: {}". format(override_cfg_params)) doc["redpanda"].update(override_cfg_params) conf = yaml.dump(doc) self.logger.info("Writing Redpanda config file: {}".format( RedpandaService.CONFIG_FILE)) self.logger.debug(conf) node.account.create_file(RedpandaService.CONFIG_FILE, conf) def restart_nodes(self, nodes, override_cfg_params=None): nodes = [nodes] if isinstance(nodes, ClusterNode) else nodes for node in nodes: self.stop_node(node) for node in nodes: self.start_node(node, override_cfg_params) def registered(self, node): idx = self.idx(node) self.logger.debug( f"Checking if broker {idx} ({node.name} is registered") client = PythonLibrdkafka(self) brokers = client.brokers() broker = brokers.get(idx, None) self.logger.debug(f"Found broker info: {broker}") return broker is not None def controller(self): kc = KafkaCat(self) cid = kc.metadata()["controllerid"] self.logger.debug("Controller reported with id: {}".format(cid)) if cid != -1: node = self.get_node(cid) self.logger.debug("Controller node found: {}".format(node)) return node def node_storage(self, node): """ Retrieve a summary of storage on a node. """ def listdir(path, only_dirs=False): ents = node.account.sftp_client.listdir(path) if not only_dirs: return ents paths = map(lambda fn: (fn, os.path.join(path, fn)), ents) return [p[0] for p in paths if node.account.isdir(p[1])] store = NodeStorage(RedpandaService.DATA_DIR) for ns in listdir(store.data_dir, True): if ns == '.coprocessor_offset_checkpoints': continue ns = store.add_namespace(ns, os.path.join(store.data_dir, ns)) for topic in listdir(ns.path): topic = ns.add_topic(topic, os.path.join(ns.path, topic)) for num in listdir(topic.path): partition = topic.add_partition( num, node, os.path.join(topic.path, num)) partition.add_files(listdir(partition.path)) return store def storage(self): store = ClusterStorage() for node in self._started: s = self.node_storage(node) store.add_node(s) return store def copy_data(self, dest, node): # after copying, move all files up a directory level so the caller does # not need to know what the name of the storage directory is. with tempfile.TemporaryDirectory() as d: node.account.copy_from(RedpandaService.DATA_DIR, d) data_dir = os.path.basename(RedpandaService.DATA_DIR) data_dir = os.path.join(d, data_dir) for fn in os.listdir(data_dir): shutil.move(os.path.join(data_dir, fn), dest) def data_checksum(self, node): """Run command that computes MD5 hash of every file in redpanda data directory. 
The results of the command are turned into a map from path to hash-size tuples.""" cmd = f"find {RedpandaService.DATA_DIR} -type f -exec md5sum '{{}}' \; -exec stat -c %s '{{}}' \;" lines = node.account.ssh_output(cmd) tokens = lines.split() return { tokens[ix + 1].decode(): (tokens[ix].decode(), int(tokens[ix + 2])) for ix in range(0, len(tokens), 3) } def broker_address(self, node): assert node in self._started cfg = self.read_configuration(node) return f"{node.account.hostname}:{cfg['redpanda']['kafka_api']['port']}" def brokers(self, limit=None): return ",".join(self.brokers_list(limit)) def brokers_list(self, limit=None): brokers = [self.broker_address(n) for n in self._started[:limit]] random.shuffle(brokers) return brokers def metrics(self, node): assert node in self._started url = f"http://{node.account.hostname}:9644/metrics" resp = requests.get(url) assert resp.status_code == 200 return text_string_to_metric_families(resp.text) def read_configuration(self, node): assert node in self._started with self.config_file_lock: with node.account.open(RedpandaService.CONFIG_FILE) as f: return yaml.full_load(f.read()) def shards(self): """ Fetch the max shard id for each node. """ shards_per_node = {} for node in self._started: num_shards = 0 metrics = self.metrics(node) for family in metrics: for sample in family.samples: if sample.name == "vectorized_reactor_utilization": num_shards = max(num_shards, int(sample.labels["shard"])) assert num_shards > 0 shards_per_node[self.idx(node)] = num_shards return shards_per_node def healthy(self): """ A primitive health check on all the nodes which returns True when all nodes report that no under replicated partitions exist. This should later be replaced by a proper / official start-up probe type check on the health of a node after a restart. """ counts = {self.idx(node): None for node in self.nodes} for node in self.nodes: metrics = self.metrics(node) idx = self.idx(node) for family in metrics: for sample in family.samples: if sample.name == "vectorized_cluster_partition_under_replicated_replicas": if counts[idx] is None: counts[idx] = 0 counts[idx] += int(sample.value) return all(map(lambda count: count == 0, counts.values())) def describe_topics(self, topics=None): """ Describe topics. Pass topics=None to describe all topics, or a pass a list of topic names to restrict the call to a set of specific topics. Sample return value: [ {'error_code': 0, 'topic': 'topic-kabn', 'is_internal': False, 'partitions': [ {'error_code': 0, 'partition': 0, 'leader': 1, 'replicas': [1], 'isr': [1], 'offline_replicas': []} } ] """ return self._client.describe_topics(topics) def partitions(self, topic): """ Return partition metadata for the topic. """ kc = KafkaCat(self) md = kc.metadata() topic = next(filter(lambda t: t["topic"] == topic, md["topics"])) def make_partition(p): index = p["partition"] leader_id = p["leader"] leader = None if leader_id == -1 else self.get_node(leader_id) replicas = [self.get_node(r["id"]) for r in p["replicas"]] return Partition(index, leader, replicas) return [make_partition(p) for p in topic["partitions"]] def create_topic(self, specs): if isinstance(specs, TopicSpec): specs = [specs] client = self._client_type(self) for spec in specs: self.logger.debug(f"Creating topic {spec}") client.create_topic(spec) def delete_topic(self, name): client = self._client_type(self) self.logger.debug(f"Deleting topic {name}") client.delete_topic(name)
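# A sketch tying the topic helpers above together: declare an initial topic,
# start the service, then inspect partition placement. TopicSpec's
# constructor arguments and its `name` attribute are assumptions, and the
# Partition field names simply follow the Partition(index, leader, replicas)
# constructor shown above; the dump only shows specs being passed straight
# to client.create_topic().
spec = TopicSpec(partition_count=3, replication_factor=3)
redpanda = RedpandaService(test_context,
                           num_brokers=3,
                           client_type=client_type,
                           topics=[spec])
redpanda.start()

for p in redpanda.partitions(spec.name):
    leader = p.leader.name if p.leader else "none"
    redpanda.logger.info(
        f"partition {p.index}: leader={leader}, "
        f"replicas={[n.name for n in p.replicas]}")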