import atexit
import subprocess

from cluster import load_config


def start_restaked(node_dir, rpc_port, config):
    if not config:
        config = load_config()
    near_root = config['near_root']
    command = [
        near_root + 'restaked',
        '--home=%s' % node_dir,
        '--rpc-url=127.0.0.1:%d' % rpc_port,
        '--wait-period=1'
    ]
    pid = subprocess.Popen(command).pid
    print("Starting restaked for %s, rpc = 127.0.0.1:%d" % (node_dir, rpc_port))
    atexit.register(atexit_stop_restaked, pid)
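
# A minimal sketch of `atexit_stop_restaked`, the cleanup hook registered
# above. The real helper lives elsewhere in this test library; this assumed
# version simply kills the restaked child process on interpreter exit.
import os
import signal


def atexit_stop_restaked(pid):
    try:
        os.kill(pid, signal.SIGKILL)  # force-stop the restaked process
    except OSError:
        pass  # the process has already exited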
def __init__(self):
    node_config = {
        'archive': True,
        'tracked_shards': [0],
    }
    self._config = cluster.load_config()
    self._near_root, self._node_dirs = cluster.init_cluster(
        num_nodes=1,
        num_observers=2,
        num_shards=1,
        config=self._config,
        genesis_config_changes=[['epoch_length', EPOCH_LENGTH],
                                ['block_producer_kickout_threshold', 80]],
        client_config_changes={
            0: node_config,
            1: node_config,
            2: node_config,
            3: node_config
        })
    self._nodes = [None] * len(self._node_dirs)
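
# `self._nodes` starts as a list of `None`s, suggesting nodes are spun up
# lazily. A sketch of what such a helper could look like, reusing the
# `cluster.spin_up_node` call pattern from the stress tests below; the method
# name `_start_node` is hypothetical, not part of the original class.
def _start_node(self, ordinal):
    if self._nodes[ordinal] is None:
        boot = self._nodes[0]  # node 0 boots with no boot node of its own
        self._nodes[ordinal] = cluster.spin_up_node(
            self._config, self._near_root, self._node_dirs[ordinal], ordinal,
            boot.node_key.pk if boot else None,
            boot.addr() if boot else None)
    return self._nodes[ordinal]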
# Generates three epochs' worth of blocks.
# Requests the next light client block until it reaches the last final block.
# Verifies that the returned blocks are what we expect, and runs the
# validation on them.
import sys, time

sys.path.append('lib')

from cluster import start_cluster, load_config
from lightclient import compute_block_hash, validate_light_client_block

TIMEOUT = 150

config = load_config()
client_config_changes = {}
if not config['local']:
    client_config_changes = {
        "consensus": {
            "min_block_production_delay": {
                "secs": 4,
                "nanos": 0,
            },
            "max_block_production_delay": {
                "secs": 8,
                "nanos": 0,
            },
            "max_block_wait_delay": {
                "secs": 24,
                "nanos": 0,
            },
        }
    }
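
# With the overrides above, the test would bring up its cluster roughly like
# this. `start_cluster` is the helper imported above; the node/shard counts
# and genesis overrides here are illustrative, not taken from the original
# script.
nodes = start_cluster(
    4, 0, 1, config,
    [["epoch_length", 6], ["block_producer_kickout_threshold", 80]],
    client_config_changes)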
def doit(s, n, N, k, monkeys, timeout):
    global block_timeout, balances_timeout, tx_tolerance, epoch_length, wait_if_restart, wipe_data, restart_sync_timeout

    assert 2 <= n <= N

    config = load_config()
    local_config_changes = {}

    monkey_names = [x.__name__ for x in monkeys]
    proxy = None
    logging.info(monkey_names)

    for i in range(N + k + 1):
        local_config_changes[i] = {
            "consensus": {
                "block_header_fetch_horizon": BLOCK_HEADER_FETCH_HORIZON,
                "state_sync_timeout": {
                    "secs": 5,
                    "nanos": 0
                }
            },
            "view_client_throttle_period": {
                "secs": 0,
                "nanos": 0
            }
        }

    for i in range(N, N + k + 1):
        # make all the observers track all the shards
        local_config_changes[i]["tracked_shards"] = list(range(s))

    if 'monkey_wipe_data' in monkey_names:
        # When data can be deleted, the other nodes can run sufficiently far
        # ahead (given the short epoch length) to GC the old data while the
        # node with the wiped data folder is still syncing. Have one archival
        # node to address this. It is also needed because the balances timeout
        # is longer, so the txs can get GCed on the observer node by the time
        # we get to checking their status.
        local_config_changes[N + k]['archive'] = True

    if 'monkey_local_network' in monkey_names or 'monkey_packets_drop' in monkey_names or 'monkey_node_restart' in monkey_names:
        expect_network_issues()
        block_timeout += 40

    if 'monkey_local_network' in monkey_names or 'monkey_packets_drop' in monkey_names:
        assert config['local'], 'Network stress operations only work on local nodes'
        drop_probability = 0.05 if 'monkey_packets_drop' in monkey_names else 0

        reject_list = RejectListProxy.create_reject_list(1)
        proxy = RejectListProxy(reject_list, drop_probability)
        tx_tolerance += 0.3

    if 'monkey_local_network' in monkey_names or 'monkey_packets_drop' in monkey_names:
        # add 15 seconds, plus 10 seconds for each unique network-related monkey
        balances_timeout += 15

        if 'monkey_local_network' in monkey_names:
            balances_timeout += 10

        if 'monkey_packets_drop' in monkey_names:
            wait_if_restart = True
            balances_timeout += 10

    if 'monkey_node_restart' in monkey_names or 'monkey_node_set' in monkey_names:
        balances_timeout += 10
        tx_tolerance += 0.5

    if 'monkey_wipe_data' in monkey_names:
        assert 'monkey_node_restart' in monkey_names or 'monkey_node_set' in monkey_names
        wipe_data = True
        balances_timeout += 25

        # If nodes can restart, we should give them way more time to sync.
        # If packets can also be dropped, each state-sync-related request or
        # response lost adds 10 seconds to the sync process.
        restart_sync_timeout = 45 if 'monkey_packets_drop' not in monkey_names else 90
        block_timeout += (10 if 'monkey_packets_drop' not in monkey_names else 40)

    # We need to make sure that the blocks that include txs are not garbage
    # collected. From the first tx sent until we check the balances, a time
    # equal to `balances_timeout * 2` passes, and block production is capped
    # at 1.7 blocks/s. The GC keeps five epochs of blocks.
    min_epoch_length = (int((balances_timeout * 2) * 1.7) + 4) // 5
    epoch_length = max(epoch_length, min_epoch_length)

    near_root, node_dirs = init_cluster(
        N, k + 1, s, config,
        [["min_gas_price", 0], ["max_inflation_rate", [0, 1]],
         ["epoch_length", epoch_length],
         ["block_producer_kickout_threshold", 10],
         ["chunk_producer_kickout_threshold", 10]], local_config_changes)

    started = time.time()

    boot_node = spin_up_node(config,
                             near_root,
                             node_dirs[0],
                             0,
                             None,
                             None,
                             proxy=proxy)
    boot_node.stop_checking_store()
    boot_node.mess_with = False
    nodes = [boot_node]

    for i in range(1, N + k + 1):
        node = spin_up_node(config,
                            near_root,
                            node_dirs[i],
                            i,
                            boot_node.node_key.pk,
                            boot_node.addr(),
                            proxy=proxy)
        node.stop_checking_store()
        nodes.append(node)
        if i >= n and i < N:
            node.kill()
            node.mess_with = True
        else:
            node.mess_with = False

    stopped = Value('i', 0)
    error = Value('i', 0)
    ps = []
    nonces = [(Value('i', 1), Lock()) for _ in range(N + k + 1)]

    def launch_process(func):
        nonlocal stopped, error, ps

        p = Process(target=func, args=(stopped, error, nodes, nonces))
        p.start()
        ps.append((p, func.__name__))

    def check_errors():
        nonlocal error, ps
        if error.value != 0:
            for (p, _) in ps:
                p.terminate()
            assert False, "At least one process failed, check error messages above"

    for monkey in monkeys:
        launch_process(monkey)

    launch_process(blocks_tracker)

    started = time.time()
    while time.time() - started < timeout:
        check_errors()
        time.sleep(1)

    logging.info("")
    logging.info("==========================================")
    logging.info("# TIMEOUT IS HIT, SHUTTING DOWN THE TEST #")
    logging.info("==========================================")

    stopped.value = 1
    started_shutdown = time.time()
    proxies_stopped = False
    while True:
        check_errors()
        still_running = [name for (p, name) in ps if p.is_alive()]
        if len(still_running) == 0:
            break

        # If the test is running with proxies, `node_restart` and `node_set`
        # can get stuck because the proxies are now their child processes. We
        # can't kill the proxies right away, because that would interfere with
        # block production and might prevent other workers (e.g.
        # blocks_tracker) from completing in a timely manner. Thus, kill the
        # proxies some time into the shutdown process.
        if time.time() - started_shutdown > TIMEOUT_SHUTDOWN / 2 and not proxies_stopped:
            logging.info("Shutdown is %s seconds in, shutting down proxies if any" %
                         (TIMEOUT_SHUTDOWN / 2))
            if boot_node.proxy is not None:
                boot_node.proxy.global_stopped.value = 1
                for p in boot_node.proxy.ps:
                    p.terminate()
            proxies_stopped = True

        if time.time() - started_shutdown > TIMEOUT_SHUTDOWN:
            for (p, _) in ps:
                p.terminate()
            assert False, "The test didn't gracefully shut down in time\nStill running: %s" % (
                still_running)

    check_errors()

    logging.info("Shut down complete, executing store validity checks")
    for node in nodes:
        node.is_check_store = True
        node.check_store()
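
# A worked instance of the epoch-length bound above, assuming an illustrative
# balances_timeout of 100 seconds: up to 100 * 2 = 200 seconds pass between
# the first tx and the balance check, during which at most
# int(200 * 1.7) = 340 blocks are produced. With the GC keeping five epochs,
# the epoch length must be at least (340 + 4) // 5 = 68 blocks, i.e.
# ceil(340 / 5).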
def doit(s, n, N, k, monkeys, timeout):
    global block_timeout, balances_timeout, tx_tolerance

    assert 2 <= n <= N

    config = load_config()
    local_config_changes = {}

    for i in range(N, N + k + 1):
        # make all the observers track all the shards
        local_config_changes[i] = {"tracked_shards": list(range(s))}

    near_root, node_dirs = init_cluster(
        N, k + 1, s, config,
        [["min_gas_price", 0], ["max_inflation_rate", [0, 1]],
         ["epoch_length", EPOCH_LENGTH],
         ["block_producer_kickout_threshold", 10],
         ["chunk_producer_kickout_threshold", 10]], local_config_changes)

    monkey_names = [x.__name__ for x in monkeys]
    proxy = None
    logging.info(monkey_names)

    if 'monkey_local_network' in monkey_names or 'monkey_global_network' in monkey_names:
        assert config['local'], 'Network stress operations only work on local nodes'
        reject_list = RejectListProxy.create_reject_list(1)
        proxy = RejectListProxy(reject_list)
        expect_network_issues()
        block_timeout += 40
        balances_timeout += 20
        tx_tolerance += 0.3

    if 'monkey_node_restart' in monkey_names:
        expect_network_issues()

    if 'monkey_node_restart' in monkey_names or 'monkey_node_set' in monkey_names:
        block_timeout += 40
        balances_timeout += 10
        tx_tolerance += 0.5

    started = time.time()

    boot_node = spin_up_node(config,
                             near_root,
                             node_dirs[0],
                             0,
                             None,
                             None,
                             proxy=proxy)
    boot_node.stop_checking_store()
    boot_node.mess_with = False
    nodes = [boot_node]

    for i in range(1, N + k + 1):
        node = spin_up_node(config,
                            near_root,
                            node_dirs[i],
                            i,
                            boot_node.node_key.pk,
                            boot_node.addr(),
                            proxy=proxy)
        node.stop_checking_store()
        nodes.append(node)
        if i >= n and i < N:
            node.kill()
            node.mess_with = True
        else:
            node.mess_with = False

    stopped = Value('i', 0)
    error = Value('i', 0)
    ps = []
    nonces = [(Value('i', 1), Lock()) for _ in range(N + k + 1)]

    def launch_process(func):
        nonlocal stopped, error, ps

        p = Process(target=func, args=(stopped, error, nodes, nonces))
        p.start()
        ps.append((p, func.__name__))

    def check_errors():
        nonlocal error, ps
        if error.value != 0:
            for (p, _) in ps:
                p.terminate()
            assert False, "At least one process failed, check error messages above"

    for monkey in monkeys:
        launch_process(monkey)

    launch_process(blocks_tracker)

    started = time.time()
    while time.time() - started < timeout:
        check_errors()
        time.sleep(1)

    logging.info("")
    logging.info("==========================================")
    logging.info("# TIMEOUT IS HIT, SHUTTING DOWN THE TEST #")
    logging.info("==========================================")

    stopped.value = 1
    started_shutdown = time.time()
    while True:
        check_errors()
        still_running = [name for (p, name) in ps if p.is_alive()]
        if len(still_running) == 0:
            break

        if time.time() - started_shutdown > TIMEOUT_SHUTDOWN:
            for (p, _) in ps:
                p.terminate()
            assert False, "The test didn't gracefully shut down in time\nStill running: %s" % (
                still_running)

    check_errors()

    logging.info("Shut down complete, executing store validity checks")
    for node in nodes:
        node.is_check_store = True
        node.check_store()
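
# A sketch of how `doit` might be driven from the command line. The argument
# order matches the signature above, but the sys.argv layout, the
# `monkey_`-prefixed name lookup, and the 600-second timeout are assumptions,
# not taken from the original script.
if __name__ == "__main__":
    # e.g. `python stress.py 1 2 3 3 node_restart`: s=1 shard, n=2 of N=3
    # validators initially alive, k=3, plus the node_restart monkey
    s, n, N, k = (int(arg) for arg in sys.argv[1:5])
    monkeys = [globals()['monkey_%s' % name] for name in sys.argv[5:]]
    doit(s, n, N, k, monkeys, timeout=600)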