def run_to_destruction(args):
    """Grow write sizes until the service fails, then check that the node dies.

    Starts and opens a network, then calls ``test`` in a loop with an
    increasing ``write_size_multiplier``. Any exception escaping that loop is
    treated as the expected failure; the first node's process is then polled
    for up to 10 seconds and must terminate with a non-zero exit code.

    Args:
        args: Parsed test arguments; ``nodes``, ``binary_dir``,
            ``debug_nodes``, ``perf_nodes`` and ``pdb`` are read here and the
            whole object is forwarded to the network and to ``test``.

    Raises:
        TimeoutError: The first node did not terminate within the polling
            window (chained from the exception that triggered polling).
        AssertionError: The node terminated with exit code 0.
    """
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_open(args)

        LOG.warning("About to issue transactions until destruction")
        try:
            wsm = 5000
            while True:
                LOG.info(f"Trying with writes scaled by {wsm}")
                network = test(network, args, batch_size=10, write_size_multiplier=wsm)
                if wsm > 1000000:
                    LOG.error(
                        f"Run to destruction still hasn't caused exception with write sizes multiplied by {wsm}. Infinite loop, or not actually submitting?"
                    )
                    # This is raised inside the try, so it is handled by the
                    # `except Exception` below like any other failure.
                    raise ValueError(wsm)
                # Grow very quickly, expect to fail on the second iteration
                wsm += 50000
        except Exception as e:
            timeout = 10

            LOG.info("Large write set caused an exception, as expected")
            LOG.info(f"Exception was: {e}")
            LOG.info(f"Polling for {timeout}s for node to terminate")

            deadline = time.time() + timeout
            while time.time() < deadline:
                time.sleep(0.1)
                exit_code = network.nodes[0].remote.remote.proc.poll()
                if exit_code is not None:
                    LOG.info(f"Node terminated with exit code {exit_code}")
                    assert exit_code != 0
                    break
            else:
                # Reached only when the loop ran out of time without seeing an
                # exit code. Deciding this from loop exhaustion, rather than by
                # re-reading the clock afterwards, avoids reporting a timeout
                # for a node that exited just before the deadline.
                raise TimeoutError(
                    f"Node took longer than {timeout}s to terminate"
                ) from e

            network.ignore_errors_on_shutdown()
def run(args):
    """Exercise session caps and file-descriptor exhaustion on the primary.

    Configures low session caps (plus additional RPC interfaces with their own
    caps taken from ``interface_caps``), starts a network, and then:

    1. With a generous fd limit, opens clients until connections are refused,
       twice, and once per additional interface.
    2. Repeats while treating HTTP 503 responses as acceptable, then checks the
       session metrics reported by the primary against the configured caps.
    3. Lowers the primary's fd limit to half the session cap and tries again,
       expecting this to fail; a run that does not fail is only logged.

    Args:
        args: Parsed test arguments. ``max_open_sessions``,
            ``max_open_sessions_hard``, ``ledger_chunk_bytes`` and each node
            spec's ``additional_raw_node_args`` are overwritten here.
    """
    # Set a relatively low cap on max open sessions, so we can saturate it in a reasonable amount of time
    args.max_open_sessions = 40
    args.max_open_sessions_hard = args.max_open_sessions + 5

    # Listen on additional RPC interfaces with even lower session caps
    for i, node_spec in enumerate(args.nodes):
        node_spec.additional_raw_node_args = [
            f"--rpc-interface={address},,{cap}"
            for address, cap in interface_caps(i).items()
        ]

    # Chunk often, so that new fds are regularly requested
    args.ledger_chunk_bytes = "500B"

    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        check = infra.checker.Checker()
        network.start_and_join(args)
        primary, _ = network.find_nodes()

        caps = interface_caps(primary.local_node_id)

        primary_pid = primary.remote.remote.proc.pid

        initial_fds = psutil.Process(primary_pid).num_fds()
        assert (
            initial_fds < args.max_open_sessions
        ), f"Initial number of file descriptors has already reached session limit: {initial_fds} >= {args.max_open_sessions}"

        num_fds = initial_fds
        LOG.success(f"{primary_pid} has {num_fds} open file descriptors")

        initial_metrics = get_session_metrics(primary)
        assert initial_metrics["active"] <= initial_metrics["peak"], initial_metrics
        main_session_metrics = initial_metrics["interfaces"][
            f"{primary.rpc_host}:{primary.rpc_port}"
        ]
        assert (
            main_session_metrics["soft_cap"] == args.max_open_sessions
        ), initial_metrics
        assert (
            main_session_metrics["hard_cap"] == args.max_open_sessions_hard
        ), initial_metrics

        # Read by the nested helper for logging; rebound further down before
        # the low-limit phase.
        max_fds = args.max_open_sessions + (initial_fds * 2)

        def create_connections_until_exhaustion(
            target, continue_to_hard_cap=False, client_fn=primary.client
        ):
            """Open clients until new connections start failing.

            Args:
                target: Number of accepted attempts after which the loop gives
                    up and reports that exhaustion was never reached.
                continue_to_hard_cap: When true, an HTTP 503 response counts
                    as an accepted attempt instead of a failure.
                client_fn: Factory returning a client context manager.

            Returns:
                The primary's open fd count after all clients were released.

            Raises:
                AllConnectionsCreatedException: ``target`` attempts were
                    accepted without the loop giving up.
                ValueError: A response had a status other than 200 or 503.
            """
            with contextlib.ExitStack() as es:
                clients = []
                LOG.success(f"Creating {target} clients")
                consecutive_failures = 0
                i = 1
                healthy_clients = []
                while i <= target:
                    logs = []
                    # Remember the list length so a failed attempt discards
                    # only what it added itself.
                    known_clients = len(clients)
                    try:
                        client = es.enter_context(
                            client_fn(
                                identity="user0",
                                connection_timeout=1,
                                limits=httpx.Limits(
                                    max_connections=1,
                                    max_keepalive_connections=1,
                                    keepalive_expiry=30,
                                ),
                            )
                        )
                        clients.append(client)
                        r = client.post(
                            "/app/log/private",
                            {"id": 42, "msg": "foo"},
                            log_capture=logs,
                        )
                        if r.status_code == http.HTTPStatus.OK:
                            check(
                                r,
                                result=True,
                            )
                            consecutive_failures = 0
                            i += 1
                            healthy_clients.append(client)
                        elif r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE:
                            if continue_to_hard_cap:
                                consecutive_failures = 0
                                i += 1
                                continue
                            raise RuntimeError(r.body.text())
                        else:
                            flush_info(logs)
                            raise ValueError(
                                f"Unexpected response status code: {r.status_code}"
                            )
                    except (CCFConnectionException, RuntimeError) as e:
                        flush_info(logs)
                        LOG.warning(f"Hit exception at client {i}/{target}: {e}")
                        # If the failure happened before the new client was
                        # appended there is nothing to remove; an unconditional
                        # pop would drop an earlier client or fail on an empty
                        # list.
                        del clients[known_clients:]
                        if consecutive_failures < 5:
                            # Maybe got unlucky and tried to create a session while many files were open - keep trying
                            consecutive_failures += 1
                            continue
                        # Ok you've really hit a wall, stop trying to create clients
                        break
                else:
                    raise AllConnectionsCreatedException(
                        f"Successfully created {target} clients without exception - expected this to exhaust available connections"
                    )

                num_fds = psutil.Process(primary_pid).num_fds()
                LOG.success(
                    f"{primary_pid} has {num_fds}/{max_fds} open file descriptors"
                )

                r = clients[0].get("/node/metrics")
                assert r.status_code == http.HTTPStatus.OK, r.status_code
                peak_metrics = r.body.json()["sessions"]
                assert peak_metrics["active"] <= peak_metrics["peak"], peak_metrics
                assert peak_metrics["active"] == len(healthy_clients), (
                    peak_metrics,
                    len(healthy_clients),
                )

                # Submit many requests, and at least enough to trigger additional snapshots
                more_requests = max(len(clients) * 3, args.snapshot_tx_interval * 2)
                LOG.info(
                    f"Submitting an additional {more_requests} requests from existing clients"
                )
                for _ in range(more_requests):
                    client = random.choice(healthy_clients)
                    logs = []
                    try:
                        client.post(
                            "/app/log/private",
                            {"id": 42, "msg": "foo"},
                            timeout=1,
                            log_capture=logs,
                        )
                    except Exception as e:
                        flush_info(logs)
                        LOG.error(e)
                        raise

                time.sleep(1)
                num_fds = psutil.Process(primary_pid).num_fds()
                LOG.success(
                    f"{primary_pid} has {num_fds}/{max_fds} open file descriptors"
                )

                LOG.info("Disconnecting clients")
                # Rebinding only drops the list; the entered client contexts
                # are exited when the ExitStack block ends just below.
                clients = []

            time.sleep(1)
            num_fds = psutil.Process(primary_pid).num_fds()
            LOG.success(f"{primary_pid} has {num_fds}/{max_fds} open file descriptors")
            return num_fds

        # For initial safe tests, we have many more fds than the maximum sessions, so file operations should still succeed even when network is saturated
        resource.prlimit(primary_pid, resource.RLIMIT_NOFILE, (max_fds, max_fds))
        LOG.success(f"Setting max fds to safe initial value {max_fds} on {primary_pid}")

        nb_conn = (max_fds - num_fds) * 2
        num_fds = create_connections_until_exhaustion(nb_conn)

        to_create = max_fds - num_fds + 1
        num_fds = create_connections_until_exhaustion(to_create)

        # Check that lower caps are enforced on each interface
        # (interface index 0 is left to the default client_fn, so the
        # additional interfaces are addressed starting from 1)
        for interface_idx, (_address, cap) in enumerate(caps.items(), start=1):
            create_connections_until_exhaustion(
                cap + 1,
                client_fn=functools.partial(
                    primary.client, interface_idx=interface_idx
                ),
            )

        try:
            create_connections_until_exhaustion(to_create, True)
        except AllConnectionsCreatedException:
            # This is fine! The soft cap means this test no longer reaches the hard cap.
            # It gets HTTP errors but then _closes_ sockets, fast enough that we never hit the hard cap
            pass

        final_metrics = get_session_metrics(primary)
        assert final_metrics["active"] <= final_metrics["peak"], final_metrics
        assert final_metrics["peak"] > initial_metrics["peak"], (
            initial_metrics,
            final_metrics,
        )
        assert final_metrics["peak"] >= args.max_open_sessions, final_metrics
        assert final_metrics["peak"] < args.max_open_sessions_hard, final_metrics

        # Now set a low fd limit, so network sessions completely exhaust them - expect this to cause failures
        max_fds = args.max_open_sessions // 2
        resource.prlimit(primary_pid, resource.RLIMIT_NOFILE, (max_fds, max_fds))
        LOG.success(f"Setting max fds to dangerously low {max_fds} on {primary_pid}")

        try:
            num_fds = create_connections_until_exhaustion(to_create)
        except Exception as e:
            LOG.warning(
                f"Node with only {max_fds} fds crashed when allowed to created {args.max_open_sessions} sessions, as expected"
            )
            LOG.warning(e)
            network.ignore_errors_on_shutdown()
        else:
            # A run without a crash is only reported, not treated as an error.
            LOG.warning("Expected a fatal crash and saw none!")
def run(args):
    """Exercise the session cap and file-descriptor exhaustion on the primary.

    Configures a low session cap, starts a network, and then:

    1. With a generous fd limit, opens clients until connections are refused,
       twice, then once more while treating HTTP 503 responses as acceptable.
    2. Lowers the primary's fd limit to half the session cap and tries again,
       expecting this to fail.

    Args:
        args: Parsed test arguments. ``max_open_sessions``,
            ``max_open_sessions_hard`` and ``ledger_chunk_bytes`` are
            overwritten here.

    Raises:
        RuntimeError: The low-fd-limit phase completed without any exception.
    """
    # Set a relatively low cap on max open sessions, so we can saturate it in a reasonable amount of time
    args.max_open_sessions = 100
    args.max_open_sessions_hard = args.max_open_sessions + 20

    # Chunk often, so that new fds are regularly requested
    args.ledger_chunk_bytes = "500B"

    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        check = infra.checker.Checker()
        network.start_and_join(args)
        primary, _ = network.find_nodes()

        primary_pid = primary.remote.remote.proc.pid

        initial_fds = psutil.Process(primary_pid).num_fds()
        assert (
            initial_fds < args.max_open_sessions
        ), f"Initial number of file descriptors has already reached session limit: {initial_fds} >= {args.max_open_sessions}"

        num_fds = initial_fds
        LOG.success(f"{primary_pid} has {num_fds} open file descriptors")

        def create_connections_until_exhaustion(target, continue_to_hard_cap=False):
            """Open clients until new connections start failing.

            A failed attempt still advances the loop counter, so fewer than
            ``target`` clients may exist when the loop runs to completion.

            Args:
                target: Number of attempts to make before reporting that
                    exhaustion was never reached.
                continue_to_hard_cap: When true, an HTTP 503 response is
                    tolerated instead of being counted as a failure.

            Returns:
                The primary's open fd count after all clients were released.

            Raises:
                AllConnectionsCreatedException: All ``target`` attempts were
                    made without the loop giving up.
                ValueError: A response had a status other than 200 or 503.
            """
            with contextlib.ExitStack() as es:
                clients = []
                LOG.success(f"Creating {target} clients")
                consecutive_failures = 0
                for i in range(target):
                    logs = []
                    # Remember the list length so a failed attempt discards
                    # only what it added itself.
                    known_clients = len(clients)
                    try:
                        client = es.enter_context(
                            primary.client("user0", connection_timeout=1)
                        )
                        clients.append(client)
                        r = client.post(
                            "/app/log/private",
                            {"id": 42, "msg": "foo"},
                            log_capture=logs,
                        )
                        if r.status_code == http.HTTPStatus.OK:
                            check(
                                r,
                                result=True,
                            )
                            consecutive_failures = 0
                        elif r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE:
                            if continue_to_hard_cap:
                                consecutive_failures = 0
                                continue
                            raise RuntimeError(r.body.text())
                        else:
                            flush_info(logs)
                            raise ValueError(
                                f"Unexpected response status code: {r.status_code}"
                            )
                    except (CCFConnectionException, RuntimeError) as e:
                        flush_info(logs)
                        LOG.warning(f"Hit exception at client {i}: {e}")
                        # If the failure happened before the new client was
                        # appended there is nothing to remove; an unconditional
                        # pop would drop an earlier client or fail on an empty
                        # list.
                        del clients[known_clients:]
                        if consecutive_failures < 5:
                            # Maybe got unlucky and tried to create a session while many files were open - keep trying
                            consecutive_failures += 1
                            continue
                        # Ok you've really hit a wall, stop trying to create clients
                        break
                else:
                    raise AllConnectionsCreatedException(
                        f"Successfully created {target} clients without exception - expected this to exhaust available connections"
                    )

                # `max_fds` is looked up in the enclosing function when this
                # helper runs; it is assigned there before the first call.
                num_fds = psutil.Process(primary_pid).num_fds()
                LOG.success(
                    f"{primary_pid} has {num_fds}/{max_fds} open file descriptors"
                )

                # Submit many requests, and at least enough to trigger additional snapshots
                more_requests = max(len(clients) * 3, args.snapshot_tx_interval * 2)
                LOG.info(
                    f"Submitting an additional {more_requests} requests from existing clients"
                )
                for _ in range(more_requests):
                    client = random.choice(clients)
                    logs = []
                    try:
                        client.post(
                            "/app/log/private",
                            {"id": 42, "msg": "foo"},
                            timeout=1,
                            log_capture=logs,
                        )
                    except Exception as e:
                        flush_info(logs)
                        LOG.error(e)
                        raise

                time.sleep(1)
                num_fds = psutil.Process(primary_pid).num_fds()
                LOG.success(
                    f"{primary_pid} has {num_fds}/{max_fds} open file descriptors"
                )

                LOG.info("Disconnecting clients")
                # Rebinding only drops the list; the entered client contexts
                # are exited when the ExitStack block ends just below.
                clients = []

            time.sleep(1)
            num_fds = psutil.Process(primary_pid).num_fds()
            LOG.success(f"{primary_pid} has {num_fds}/{max_fds} open file descriptors")
            return num_fds

        # For initial safe tests, we have many more fds than the maximum sessions, so file operations should still succeed even when network is saturated
        max_fds = args.max_open_sessions + (initial_fds * 2)
        resource.prlimit(primary_pid, resource.RLIMIT_NOFILE, (max_fds, max_fds))
        LOG.success(f"Setting max fds to safe initial value {max_fds} on {primary_pid}")

        nb_conn = (max_fds - num_fds) * 2
        num_fds = create_connections_until_exhaustion(nb_conn)

        to_create = max_fds - num_fds + 1
        num_fds = create_connections_until_exhaustion(to_create)

        try:
            create_connections_until_exhaustion(to_create, True)
        except AllConnectionsCreatedException:
            # This is fine! The soft cap means this test no longer reaches the hard cap.
            # It gets HTTP errors but then _closes_ sockets, fast enough that we never hit the hard cap
            pass

        # Now set a low fd limit, so network sessions completely exhaust them - expect this to cause failures
        max_fds = args.max_open_sessions // 2
        resource.prlimit(primary_pid, resource.RLIMIT_NOFILE, (max_fds, max_fds))
        LOG.success(f"Setting max fds to dangerously low {max_fds} on {primary_pid}")

        try:
            num_fds = create_connections_until_exhaustion(to_create)
        except Exception as e:
            LOG.warning(
                f"Node with only {max_fds} fds crashed when allowed to created {args.max_open_sessions} sessions, as expected"
            )
            LOG.warning(e)
            network.ignore_errors_on_shutdown()
        else:
            raise RuntimeError("Expected a fatal crash and saw none!")