def get_container_id(self, service: str, running: bool = False) -> str:
    """Given a service name, tries to find a unique matching container id.

    :param service: service name to look for in ``docker ps`` output
    :param running: if True, only consider running containers
    :return: the unique matching container id
    :raises Failed: when zero or several containers match, or docker fails
    """
    try:
        cmd = ["docker", "ps"] if running else ["docker", "ps", "-a"]
        list_containers = spawn.capture(cmd, unicode=True)
        # re.escape keeps regex metacharacters in service names (e.g. '.')
        # from accidentally matching unrelated containers.
        pattern = re.compile(f"^(?P<c_id>[^ ]+).*{re.escape(service)}")
        matches = [
            m.group("c_id")
            for m in (pattern.search(line) for line in list_containers.splitlines())
            if m
        ]
        if len(matches) != 1:
            raise Failed(
                f"failed to get a unique container id for service {service}, found: {matches}"
            )
        return matches[0]
    except subprocess.CalledProcessError as e:
        raise Failed(f"failed to get container id for {service}: {e}")
def stop_and_start(self) -> None:
    """Stop the configured container, wait, then restart it and wait again."""
    for action, pause in (
        (self._stop_cmd, self._stopped_time),
        (self._run_cmd, self._running_time),
    ):
        try:
            spawn.runv(["docker", action, self._container])
        except subprocess.CalledProcessError as e:
            raise Failed(f"Unable to {action} container {self._container}: {e}")
        time.sleep(pause)
def stop_and_start(self, container_id: str) -> None:
    """Stop ``container_id``, sleep, restart it, then sleep again."""

    def docker(subcommand: str) -> None:
        # Run one docker subcommand against the target container,
        # converting failures into Failed.
        try:
            spawn.runv(["docker", subcommand, container_id])
        except subprocess.CalledProcessError as e:
            raise Failed(f"Unable to {subcommand} container {container_id}: {e}")

    docker(self._stop_cmd)
    time.sleep(self._stop_time)
    docker(self._run_cmd)
    time.sleep(self._run_time)
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Confirm the expected state (running, or exited with a given code)
    of the container backing self._service."""
    container_id = comp.get_container_id(self._service)
    is_running = comp.docker_container_is_running(container_id)

    if self._running:
        # Expected to be up: nothing else to verify.
        if not is_running:
            raise Failed(f"chaos-confirm: container {container_id} is not running")
        return

    # Expected to have exited; verify both the state and the exit code.
    if is_running:
        raise Failed(
            f"chaos-confirm: expected {container_id} to have exited, is running"
        )
    actual_exit_code = comp.docker_inspect("{{.State.ExitCode}}", container_id)
    # docker_inspect returns the value wrapped in single quotes.
    if actual_exit_code != f"'{self._exit_code}'":
        raise Failed(
            f"chaos-confirm: expected exit code '{self._exit_code}' for {container_id}, found {actual_exit_code}"
        )
def run(self, services: List[str]) -> None:
    """run mzcompose run in this directory

    :param services: service names passed through to ``mzcompose run``
    :raises Failed: if mzcompose exits non-zero
    """
    with cd(self._path):
        try:
            mzcompose_run(services)
        except subprocess.CalledProcessError as e:
            # Chain the cause so the mzcompose exit status stays visible
            # (consistent with the variant that uses `raise ... from e`).
            raise Failed("error when bringing up all services") from e
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Apply chaos repeatedly: forever, or while `other_service` runs."""
    # Fall back to defaults when chaos types / targets were not configured.
    if not self._chaos:
        self._chaos = self.default_chaos
    if not self._services:
        self._services = self.get_container_ids(running=True)
    say(
        f"will run these chaos types: {self._chaos} on these containers: {self._services}"
    )

    if not self._other_service:
        # No watchdog service: loop until externally interrupted.
        say(f"no 'other_service' provided, running chaos forever")
        while True:
            self.add_chaos()

    # Tie chaos lifetime to the watchdog service's container.
    container_ids = self.get_container_ids(services=[self._other_service])
    if len(container_ids) != 1:
        raise Failed(
            f"wrong number of container ids found for service {self._other_service}. expected 1, found: {len(container_ids)}"
        )
    container_id = container_ids[0]
    say(
        f"running chaos as long as {self._other_service} (container {container_id}) is running"
    )
    while comp.docker_container_is_running(container_id):
        self.add_chaos()
def get_container_ids(
    self, services: Union[List[str], None] = None, running: bool = False
) -> List[str]:
    """
    Parse Docker processes for container ids.

    :param services: If provided, only return container ids for these services.
    :param running: If True, only return container ids of running processes.
    :return: Docker container id strs
    :raises Failed: if listing Docker processes fails

    Note: the default was changed from a mutable ``[]`` to ``None``; both
    are falsy, so caller-visible behavior is unchanged.
    """
    try:
        docker_processes = self.get_docker_processes(running=running)
        if services:
            # re.escape keeps regex metacharacters in service names literal.
            patterns = [
                f"^(?P<c_id>[^ ]+).*{re.escape(service)}" for service in services
            ]
        else:
            patterns = ["^(?P<c_id>[^ ]+).*"]
        matches = []
        for pattern in patterns:
            compiled_pattern = re.compile(pattern)
            for process in docker_processes.splitlines():
                m = compiled_pattern.search(process)
                # Skip the `docker ps` header row, whose first column is
                # the literal word "CONTAINER".
                if m and m.group("c_id") != "CONTAINER":
                    matches.append(m.group("c_id"))
        return matches
    except subprocess.CalledProcessError as e:
        raise Failed(f"failed to get Docker container ids: {e}")
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Block until a TCP connection to self._host:self._port succeeds.

    Probes from a throwaway Ubuntu container attached to the
    composition's default network, retrying until self._timeout_secs
    elapses; failures are logged for automation.

    :raises Failed: if no connection could be made before the timeout
    """
    ui.progress(
        f"waiting for {self._host}:{self._port}",
        "C",
    )
    for remaining in ui.timeout_loop(self._timeout_secs):
        # Probe from inside the compose network so we test the address the
        # services themselves see, not a host-mapped port.
        cmd = f"docker run --rm -t --network {comp.name}_default ubuntu:bionic-20200403".split(
        )
        cmd.extend([
            "timeout",
            str(self._timeout_secs),
            "bash",
            "-c",
            # bash's /dev/tcp pseudo-device: the redirect succeeds iff the
            # TCP handshake succeeds, with no extra tooling in the image.
            f"cat < /dev/null > /dev/tcp/{self._host}/{self._port}",
        ])
        try:
            spawn.capture(cmd, unicode=True, stderr_too=True)
        except subprocess.CalledProcessError as e:
            # Record the failed attempt (and its output) for CI logs,
            # then show the remaining seconds and retry.
            ui.log_in_automation(
                "wait-for-tcp ({}:{}): error running {}: {}, stdout:\n{}\nstderr:\n{}"
                .format(
                    self._host,
                    self._port,
                    ui.shell_quote(cmd),
                    e,
                    e.stdout,
                    e.stderr,
                ))
            ui.progress(" {}".format(int(remaining)))
        else:
            ui.progress(" success!", finish=True)
            return
    raise Failed(f"Unable to connect to {self._host}:{self._port}")
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Forcibly kill the container backing self._service."""
    target = comp.get_container_id(self._service)
    say(f"Killing container: {target}")
    try:
        spawn.runv(["docker", "kill", target])
    except subprocess.CalledProcessError as e:
        raise Failed(f"Unable to kill container {target}: {e}")
def wait_for_mysql(timeout_secs: int, user: str, passwd: str, host: str,
                   port: int) -> None:
    """Wait until a MySQL server answers ``SELECT 1``.

    :param timeout_secs: how long to keep retrying
    :raises Failed: if no correct answer arrives within timeout_secs
    """
    args = f"mysql user={user} host={host} port={port}"
    # Fixed typo in the progress message: "waitng" -> "waiting".
    ui.progress(f"waiting for {args}", "C")
    error = None
    for _ in ui.timeout_loop(timeout_secs):
        try:
            conn = pymysql.connect(user=user, passwd=passwd, host=host, port=port)
            try:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1")
                    result = cur.fetchone()
                if result == (1, ):
                    print("success!")
                    return
                else:
                    print(f"weird, {args} did not return 1: {result}")
            finally:
                # Close each attempt's connection so the retry loop does
                # not leak one socket per attempt while the server starts.
                conn.close()
        except Exception as e:
            ui.progress(".")
            error = e
    ui.progress(finish=True)
    raise Failed(f"Never got correct result for {args}: {error}")
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Add a fixed netem delay to the service container's eth0 interface.

    :raises Failed: if the `tc qdisc` command fails
    """
    # Resolve the container id outside the try block: in the original, a
    # CalledProcessError raised before `container_id` was bound would make
    # the except handler fail with UnboundLocalError.
    container_id = comp.get_container_id(self._service)
    cmd = (
        f"docker exec {container_id} tc qdisc add dev eth0 root netem "
        f"delay {self._delay}ms"
    ).split()
    try:
        spawn.runv(cmd)
    except subprocess.CalledProcessError as e:
        raise Failed(f"Unable to delay container {container_id}: {e}")
def confirm_exit_code(self, container_id: str, expected_exit_code: int) -> None:
    """Fail unless the container exited with the expected code."""
    # docker_inspect returns the value wrapped in single quotes.
    expected = f"'{expected_exit_code}'"
    actual_exit_code = self.docker_inspect("{{.State.ExitCode}}", container_id)
    if actual_exit_code != expected:
        raise Failed(
            f"chaos-confirm: expected exit code '{expected_exit_code}' for {container_id}, found {actual_exit_code}"
        )
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Poll `docker ps` once a second, failing if the container ever stops.

    :raises Failed: if `docker ps` itself errors, or the container
        disappears before self._uptime_secs have elapsed
    """
    # Compose prefixes container names with the composition name.
    pattern = f"{comp.name}_{self._container}"
    ui.progress(f"Ensuring {self._container} stays up ", "C")
    for i in range(self._uptime_secs, 0, -1):
        time.sleep(1)
        try:
            # List only names; we just need presence, not full details.
            stdout = spawn.capture(["docker", "ps", "--format={{.Names}}"],
                                   unicode=True)
        except subprocess.CalledProcessError as e:
            raise Failed(f"{e.stdout}")
        found = False
        for line in stdout.splitlines():
            if line.startswith(pattern):
                found = True
                break
        if not found:
            # Dump recent logs before failing to aid debugging.
            print(f"failed! {pattern} logs follow:")
            print_docker_logs(pattern, 10)
            raise Failed(f"container {self._container} stopped running!")
        # Show the countdown of remaining seconds.
        ui.progress(f" {i}")
    print()
def add_chaos(self) -> None:
    """Apply one randomly chosen chaos action to one random container.

    Picks a container from self._services and a chaos type from
    self._chaos, then delegates to the matching add/remove helper.

    :raises Failed: if the chosen chaos type is unknown
    """
    random_container = random.choice(self._services)
    random_chaos = random.choice(self._chaos)
    # Simple docker-level chaos: paired add/remove commands.
    if random_chaos == "pause":
        self.add_and_remove_chaos(
            add_cmd=f"docker pause {random_container}",
            remove_cmd=f"docker unpause {random_container}",
        )
    elif random_chaos == "stop":
        self.add_and_remove_chaos(
            add_cmd=f"docker stop {random_container}",
            remove_cmd=f"docker start {random_container}",
        )
    elif random_chaos == "kill":
        self.add_and_remove_chaos(
            add_cmd=f"docker kill {random_container}",
            remove_cmd=f"docker start {random_container}",
        )
    # Network-level chaos via tc/netem inside the container.
    # NOTE(review): the backslash continuations embed the following
    # line's indentation inside the command string; presumably the
    # netem helper tokenizes with .split(), which collapses the extra
    # whitespace — confirm before reformatting these strings.
    elif random_chaos == "delay":
        self.add_and_remove_netem_chaos(
            container_id=random_container,
            add_cmd=
            f"docker exec -t {random_container} tc qdisc add dev eth0 root netem \
                delay 100ms 100ms distribution normal",
        )
    elif random_chaos == "rate":
        self.add_and_remove_netem_chaos(
            container_id=random_container,
            add_cmd=
            f"docker exec -t {random_container} tc qdisc add dev eth0 root netem \
                rate 5kbit 20 100 5",
        )
    elif random_chaos == "loss":
        self.add_and_remove_netem_chaos(
            container_id=random_container,
            add_cmd=
            f"docker exec -t {random_container} tc qdisc add dev eth0 root netem loss 10",
        )
    elif random_chaos == "duplicate":
        self.add_and_remove_netem_chaos(
            container_id=random_container,
            add_cmd=
            f"docker exec -t {random_container} tc qdisc add dev eth0 root netem duplicate 10",
        )
    elif random_chaos == "corrupt":
        self.add_and_remove_netem_chaos(
            container_id=random_container,
            add_cmd=
            f"docker exec -t {random_container} tc qdisc add dev eth0 root netem corrupt 10",
        )
    else:
        raise Failed(f"unexpected type of chaos: {random_chaos}")
def docker_inspect(self, format: str, container_id: str) -> str:
    """Return the first output line of `docker inspect -f '<format>'`."""
    # Building the command cannot raise CalledProcessError, so it can
    # live outside the try block.
    cmd = f"docker inspect -f '{format}' {container_id}".split()
    try:
        output = spawn.capture(cmd, unicode=True, stderr_too=True).splitlines()[0]
    except subprocess.CalledProcessError as e:
        # Record the failure for CI logs before converting it to Failed.
        ui.log_in_automation(
            "docker inspect ({}): error running {}: {}, stdout:\n{}\nstderr:\n{}".format(
                container_id,
                ui.shell_quote(cmd),
                e,
                e.stdout,
                e.stderr,
            )
        )
        raise Failed(f"failed to inspect Docker container: {e}")
    return output
def get_docker_processes(running: bool = False) -> str:
    """
    Use 'docker ps' to return all Docker process information.

    :param running: If True, only return running processes.
    :return: str of processes
    :raises Failed: if `docker ps` exits non-zero
    """
    cmd = ["docker", "ps"] if running else ["docker", "ps", "-a"]
    try:
        return spawn.capture(cmd, unicode=True)
    except subprocess.CalledProcessError as e:
        # Error message now names what actually failed here (listing
        # processes); the old text claimed "container ids", which this
        # function does not produce.
        raise Failed(f"failed to get Docker processes: {e}")
def wait_for_pg(
    timeout_secs: int,
    query: str,
    dbname: str,
    port: int,
    host: str,
    user: str,
    password: str,
    print_result: bool,
    expected: Union[Iterable[Any], Literal["any"]],
) -> None:
    """Wait for a pg-compatible database (includes materialized).

    Retries `query` until its result matches `expected` (or until any
    result arrives, when expected == "any").

    :raises Failed: if no matching result arrives within timeout_secs
    """
    args = f"dbname={dbname} host={host} port={port} user={user} password={password}"
    ui.progress(f"waiting for {args} to handle {query!r}", "C")
    error = None
    for remaining in ui.timeout_loop(timeout_secs):
        try:
            conn = pg8000.connect(
                database=dbname,
                host=host,
                port=port,
                user=user,
                password=password,
                timeout=1,
            )
            try:
                cur = conn.cursor()
                cur.execute(query)
                # rowcount == -1 means the statement returned no row set;
                # with expected == "any" that already counts as success.
                if expected == "any" and cur.rowcount == -1:
                    ui.progress("success!", finish=True)
                    return
                result = list(cur.fetchall())
                if expected == "any" or result == expected:
                    if print_result:
                        say(f"query result: {result}")
                    else:
                        ui.progress("success!", finish=True)
                    return
                else:
                    say(
                        f"host={host} port={port} did not return rows matching {expected} got: {result}"
                    )
            finally:
                # Close every attempt's connection: the original leaked
                # one connection per retry while the DB was coming up.
                conn.close()
        except Exception as e:
            ui.progress(" " + str(int(remaining)))
            error = e
    ui.progress(finish=True)
    raise Failed(f"never got correct result for {args}: {error}")
def wait_for_pg(
    timeout_secs: int,
    query: str,
    dbname: str,
    port: int,
    host: str,
    print_result: bool,
    expected: Union[Iterable[Any], Literal["any"]],
) -> None:
    """Wait for a pg-compatible database (includes materialized)

    Repeatedly runs `query` until some returned row matches `expected`
    (or until any row arrives, when expected == "any").

    :raises Failed: if no matching row arrives within timeout_secs
    """
    args = f"dbname={dbname} host={host} port={port} user=ignored"
    ui.progress(f"waiting for {args} to handle {query!r}", "C")
    error = None
    # Rows are compared as lists below, so normalize a tuple `expected`.
    if isinstance(expected, tuple):
        expected = list(expected)
    for remaining in ui.timeout_loop(timeout_secs):
        try:
            conn = pg8000.connect(database=dbname,
                                  host=host,
                                  port=port,
                                  user="******",
                                  timeout=1)
            cur = conn.cursor()
            cur.execute(query)
            result = cur.fetchall()
            found_result = False
            for row in result:
                if expected == "any" or list(row) == expected:
                    # Announce success only once, on the first match.
                    if not found_result:
                        found_result = True
                        ui.progress(" up and responding!", finish=True)
                        if print_result:
                            say("query result:")
                    # Print every matching row when requested.
                    if print_result:
                        print(" ".join([str(r) for r in row]))
            if found_result:
                return
            else:
                say(f"host={host} port={port} did not return any row matching {expected} got: {result}"
                    )
        except Exception as e:
            ui.progress(" " + str(int(remaining)))
            error = e
    ui.progress(finish=True)
    raise Failed(f"never got correct result for {args}: {error}")
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Resolve the MySQL host port (unless configured) and wait for it."""
    if self._port is not None:
        port = self._port
    else:
        # No explicit port: it must be unambiguous from the composition.
        ports = comp.find_host_ports(self._service)
        if len(ports) != 1:
            raise Failed(
                f"Could not unambiguously determine port for {self._service} "
                f"found: {','.join(ports)}")
        port = int(ports[0])
    wait_for_mysql(
        user=self._user,
        passwd=self._password,
        host=self._host,
        port=port,
        timeout_secs=self._timeout_secs,
    )
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Resolve the pg host port (unless configured) and wait for the query."""
    if self._port is not None:
        port = self._port
    else:
        # No explicit port: it must be unambiguous from the composition.
        ports = comp.find_host_ports(self._service)
        if len(ports) != 1:
            raise Failed(
                f"Unable to unambiguously determine port for {self._service}, "
                f"found ports: {','.join(ports)}")
        port = int(ports[0])
    wait_for_pg(
        dbname=self._dbname,
        host=self._host,
        port=port,
        timeout_secs=self._timeout_secs,
        query=self._query,
        expected=self._expected,
        print_result=self._print_result,
    )
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Execute self._query (possibly multi-statement) against MySQL.

    :raises Failed: if the host port for the service is ambiguous
    """
    if self._port is None:
        ports = comp.find_host_ports(self._service)
        if len(ports) != 1:
            raise Failed(
                f"Could not unambiguously determine port for {self._service} "
                f"found: {','.join(ports)}")
        port = int(ports[0])
    else:
        port = self._port
    conn = pymysql.connect(
        user=self._user,
        passwd=self._password,
        host=self._host,
        port=port,
        # MULTI_STATEMENTS lets one query string carry several
        # semicolon-separated statements.
        client_flag=pymysql.constants.CLIENT.MULTI_STATEMENTS,
        autocommit=True,
    )
    try:
        with conn.cursor() as cur:
            cur.execute(self._query)
    finally:
        # Close the connection; the original leaked it.
        conn.close()
def run(self, comp: Composition) -> None:
    """Wait for a TCP connection to self._host:self._port, probing from
    inside the composition's default network.

    :raises Failed: if the port never accepts a connection in time
    """
    ui.progress(
        f"waiting for {self._host}:{self._port}",
        "C",
    )
    for remaining in ui.timeout_loop(self._timeout_secs):
        # Use -t (not -it): `docker run -it` errors with "the input
        # device is not a TTY" when stdin isn't a terminal (e.g. CI);
        # the sibling wait-for-tcp step already uses plain -t.
        cmd = f"docker run --rm -t --network {comp.name}_default ubuntu:bionic-20200403".split()
        cmd.extend([
            "timeout",
            str(self._timeout_secs),
            "bash",
            "-c",
            # bash /dev/tcp redirect: succeeds iff the TCP handshake does.
            f"cat < /dev/null > /dev/tcp/{self._host}/{self._port}",
        ])
        try:
            spawn.capture(cmd, unicode=True, stderr_too=True)
        except subprocess.CalledProcessError:
            ui.progress(" {}".format(int(remaining)))
        else:
            ui.progress(" success!", finish=True)
            return
    raise Failed(f"Unable to connect to {self._host}:{self._port}")
def run(self, services: List[str]) -> None:
    """run mzcompose run in this directory"""
    try:
        mzcompose_run(services, args=self._compose_args())
    except subprocess.CalledProcessError as err:
        raise Failed("error when bringing up all services") from err
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Run self._command through the workflow's mzcompose wrapper.

    :raises Failed: if the command exits non-zero
    """
    try:
        workflow.mzcompose_run(self._command)
    except subprocess.CalledProcessError as e:
        # Chain the cause so the underlying exit status isn't lost
        # (consistent with the siblings that use `raise ... from e`).
        raise Failed("giving up: {}".format(ui.shell_quote(self._command))) from e
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Bring up self._services via the workflow's mzcompose wrapper."""
    try:
        workflow.mzcompose_up(self._services)
    except subprocess.CalledProcessError:
        joined = ", ".join(self._services)
        raise Failed(f"ERROR: services didn't come up cleanly: {joined}")
def up(self, services: List[str]) -> None:
    """Run `mzcompose up` for `services` from within self._path.

    :raises Failed: if mzcompose exits non-zero
    """
    with cd(self._path):
        try:
            mzcompose_up(services)
        except subprocess.CalledProcessError as e:
            # Chain the cause so the mzcompose exit status stays visible
            # (consistent with the variant that uses `raise ... from e`).
            raise Failed("error when bringing up all services") from e
def run(self, comp: Composition, workflow: Workflow) -> None:
    """Run the netem chaos command produced by self.get_cmd().

    :raises Failed: if the command exits non-zero
    """
    try:
        cmd = self.get_cmd().split()
        spawn.runv(cmd)
    except subprocess.CalledProcessError as e:
        # Format the exception itself rather than e.stderr: stderr is
        # presumably not captured by spawn.runv (the threaded variant
        # formats `e`), so e.stderr rendered as "None" — TODO confirm.
        raise Failed(f"Unable to run netem chaos command: {e}")
def up(self, services: List[str]) -> None:
    """Run `mzcompose up` with this composition's extra arguments.

    :raises Failed: if mzcompose exits non-zero
    """
    try:
        mzcompose_up(services, args=self._compose_args())
    except subprocess.CalledProcessError as e:
        # Chain the cause so the mzcompose exit status stays visible
        # (consistent with the run() variant that uses `from e`).
        raise Failed("error when bringing up all services") from e
def confirm_is_running(self, container_id: str) -> None:
    """Fail unless Docker reports the container as running."""
    state = self.docker_inspect("{{.State.Running}}", container_id)
    # docker_inspect returns the value wrapped in single quotes.
    if state != "'true'":
        raise Failed(
            f"chaos-confirm: container {container_id} is not running")
def threaded_netem(self, cmd: List[str]) -> None:
    """Run a netem chaos command, wrapping failures in Failed."""
    try:
        spawn.runv(cmd)
    except subprocess.CalledProcessError as err:
        raise Failed(f"Unable to run netem chaos command: {err}")