def test_expired_kes(
    self,
    cluster_kes: clusterlib.ClusterLib,
):
    """Test expired KES.

    Sleep long enough for every KES evolution to be used up, then verify that
    no new slots appear once the keys have expired, while the expected KES
    errors show up in the node logs.
    """
    cluster = cluster_kes

    # seconds covered by a single KES period
    kes_period_sec = cluster.slots_per_kes_period * cluster.slot_length
    # time until all allowed key evolutions are exhausted (+1 sec margin)
    expire_timeout = int(kes_period_sec * cluster.max_kes_evolutions + 1)

    # errors expected in any node's stdout once KES keys are poisoned
    err_regexes = (
        "TraceNoLedgerView",
        "KESKeyAlreadyPoisoned",
        "KESCouldNotEvolve",
        r"ExceededTimeLimit \(ChainSync",
    )
    expected_errors = [("*.stdout", regex) for regex in err_regexes]

    with logfiles.expect_errors(expected_errors):
        LOGGER.info(f"Waiting for {expire_timeout} sec for KES expiration.")
        time.sleep(expire_timeout)

        init_slot = cluster.get_slot_no()

        # wait through one more full KES period (+1 sec margin)
        kes_period_timeout = int(kes_period_sec + 1)
        LOGGER.info(f"Waiting for {kes_period_timeout} sec for next KES period.")
        time.sleep(kes_period_timeout)

        # with expired KES keys no pool can mint blocks, so the tip must not move
        assert cluster.get_slot_no() == init_slot, "Unexpected new slots"
def test_opcert_past_kes_period(
    self,
    cluster_lock_pool2: clusterlib.ClusterLib,
    cluster_manager: cluster_management.ClusterManager,
):
    """Start a stake pool with an operational certificate created with expired `--kes-period`.

    * generate new operational certificate with `--kes-period` in the past
    * restart the node with the new operational certificate
    * check that the pool is not producing any blocks
    * generate new operational certificate with valid `--kes-period` and restart the node
    * check that the pool is producing blocks again
    """
    pool_name = "node-pool2"
    node_name = "pool2"
    cluster = cluster_lock_pool2

    temp_template = helpers.get_func_name()
    pool_rec = cluster_manager.cache.addrs_data[pool_name]

    # pool id (bech32 and decoded) is used below to look the pool up in ledger state
    node_cold = pool_rec["cold_key_pair"]
    stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
    stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

    # the opcert file the running node reads; it gets overwritten below
    opcert_file: Path = pool_rec["pool_operational_cert"]

    def _wait_epoch_chores(this_epoch: int):
        """Advance to a new epoch (if needed), wait near its end and snapshot ledger state."""
        # wait for next epoch
        if cluster.get_epoch() == this_epoch:
            cluster.wait_for_new_epoch()

        # wait for the end of the epoch
        # NOTE(review): the -19..-9 sec window presumably leaves time for the
        # ledger-state query before the epoch boundary — confirm against helper docs
        clusterlib_utils.wait_for_epoch_interval(cluster_obj=cluster, start=-19, stop=-9)

        # save ledger state
        clusterlib_utils.save_ledger_state(
            cluster_obj=cluster,
            state_name=f"{temp_template}_{cluster.get_epoch()}",
        )

    # restart_on_failure: recover the shared cluster if this test leaves it broken
    with cluster_manager.restart_on_failure():
        # generate new operational certificate with `--kes-period` in the past
        invalid_opcert_file = cluster.gen_node_operational_cert(
            node_name=node_name,
            kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            cold_counter_file=pool_rec["cold_key_pair"].counter_file,
            kes_period=cluster.get_kes_period() - 5,
        )

        # the node is expected to log this error while running with the bad opcert
        expected_errors = [
            (f"{node_name}.stdout", "TPraosCannotForgeKeyNotUsableYet"),
        ]
        with logfiles.expect_errors(expected_errors):
            # restart the node with the new operational certificate
            logfiles.add_ignore_rule("*.stdout", "MuxBearerClosed")
            shutil.copy(invalid_opcert_file, opcert_file)
            cluster_nodes.restart_node(node_name)
            cluster.wait_for_new_epoch()

            LOGGER.info("Checking blocks production for 5 epochs.")
            this_epoch = -1
            for __ in range(5):
                _wait_epoch_chores(this_epoch)
                this_epoch = cluster.get_epoch()

                # check that the pool is not producing any blocks
                blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                    "blocksCurrent"
                ]
                if blocks_made:
                    assert (
                        stake_pool_id_dec not in blocks_made
                    ), f"The pool '{pool_name}' has produced blocks in epoch {this_epoch}"

        # generate new operational certificate with valid `--kes-period`
        os.remove(opcert_file)
        valid_opcert_file = cluster.gen_node_operational_cert(
            node_name=node_name,
            kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            cold_counter_file=pool_rec["cold_key_pair"].counter_file,
            kes_period=cluster.get_kes_period(),
        )
        # copy the new certificate and restart the node
        shutil.move(str(valid_opcert_file), str(opcert_file))
        cluster_nodes.restart_node(node_name)
        cluster.wait_for_new_epoch()

        LOGGER.info("Checking blocks production for another 5 epochs.")
        blocks_made_db = []
        this_epoch = cluster.get_epoch()
        active_again_epoch = this_epoch
        for __ in range(5):
            _wait_epoch_chores(this_epoch)
            this_epoch = cluster.get_epoch()

            # check that the pool is producing blocks
            # results are collected per epoch; the pool is not required to mint
            # in every single epoch, only in at least one of them
            blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                "blocksCurrent"
            ]
            blocks_made_db.append(stake_pool_id_dec in blocks_made)

        assert any(blocks_made_db), (
            f"The pool '{pool_name}' has not produced any blocks "
            f"since epoch {active_again_epoch}"
        )
def test_expired_kes(
    self,
    cluster_kes: clusterlib.ClusterLib,
    cluster_manager: cluster_management.ClusterManager,
    worker_id: str,
):
    """Test expired KES.

    * start local cluster instance configured with short KES period and low number of key
      evolutions, so KES expires soon on all pools
    * refresh opcert on 2 of the 3 pools, so KES doesn't expire on those 2 pools and
      the pools keep minting blocks
    * wait for KES expiration on the selected pool
    * check that the pool with expired KES didn't mint blocks in an epoch that followed after
      KES expiration
    * check KES period info command with an operational certificate with an expired KES
    * check KES period info command with operational certificates with a valid KES
    """
    cluster = cluster_kes
    temp_template = common.get_test_id(cluster)

    # how long to sleep so KES expires on the non-refreshed pool (sec)
    expire_timeout = 200
    expire_node_name = "pool1"
    expire_pool_name = f"node-{expire_node_name}"
    expire_pool_rec = cluster_manager.cache.addrs_data[expire_pool_name]
    expire_pool_id = cluster.get_stake_pool_id(expire_pool_rec["cold_key_pair"].vkey_file)
    expire_pool_id_dec = helpers.decode_bech32(expire_pool_id)

    # refresh opcert on 2 of the 3 pools, so KES doesn't expire on those 2 pools and
    # the pools keep minting blocks
    refreshed_nodes = ["pool2", "pool3"]

    def _refresh_opcerts():
        """Issue fresh opcerts for `refreshed_nodes` and restart those nodes."""
        for n in refreshed_nodes:
            refreshed_pool_rec = cluster_manager.cache.addrs_data[f"node-{n}"]
            refreshed_opcert_file = cluster.gen_node_operational_cert(
                node_name=f"{n}_refreshed_opcert",
                kes_vkey_file=refreshed_pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=refreshed_pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=refreshed_pool_rec["cold_key_pair"].counter_file,
                kes_period=cluster.get_kes_period(),
            )
            shutil.copy(refreshed_opcert_file, refreshed_pool_rec["pool_operational_cert"])
        cluster_nodes.restart_nodes(refreshed_nodes)

    _refresh_opcerts()

    expected_err_regexes = ["KESKeyAlreadyPoisoned", "KESCouldNotEvolve"]
    # ignore expected errors in bft1 node log file, as bft1 opcert will not get refreshed
    logfiles.add_ignore_rule(
        files_glob="bft1.stdout",
        regex="|".join(expected_err_regexes),
        ignore_file_id=worker_id,
    )
    # search for expected errors only in log file corresponding to pool with expired KES
    expected_errors = [(f"{expire_node_name}.stdout", err) for err in expected_err_regexes]

    this_epoch = -1
    with logfiles.expect_errors(expected_errors, ignore_file_id=worker_id):
        LOGGER.info(
            f"{datetime.datetime.now()}: Waiting for {expire_timeout} sec for KES expiration."
        )
        time.sleep(expire_timeout)

        # advance to a fresh epoch and save ledger state (module-level helper)
        _wait_epoch_chores(
            cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
        )
        this_epoch = cluster.get_epoch()

        # check that the pool is not producing any blocks
        blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)["blocksCurrent"]
        if blocks_made:
            assert (
                expire_pool_id_dec not in blocks_made
            ), f"The pool '{expire_pool_name}' has minted blocks in epoch {this_epoch}"

        # refresh opcerts one more time
        _refresh_opcerts()

        LOGGER.info(
            f"{datetime.datetime.now()}: Waiting 120 secs to make sure the expected errors "
            "make it to log files."
        )
        time.sleep(120)

    # check kes-period-info with an operational certificate with KES expired
    # TODO: the query is currently broken
    kes_query_currently_broken = False
    try:
        kes_info_expired = cluster.get_kes_period_info(
            opcert_file=expire_pool_rec["pool_operational_cert"]
        )
    except clusterlib.CLIError as err:
        # only swallow the known-broken-query error; anything else is a real failure
        if "currentlyBroken" not in str(err):
            raise
        kes_query_currently_broken = True

    if kes_query_currently_broken:
        pytest.xfail("`query kes-period-info` is currently broken")
    else:
        kes.check_kes_period_info_result(
            kes_output=kes_info_expired, expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD
        )

    # check kes-period-info with valid operational certificates
    for n in refreshed_nodes:
        refreshed_pool_rec = cluster_manager.cache.addrs_data[f"node-{n}"]
        kes_info_valid = cluster.get_kes_period_info(
            opcert_file=refreshed_pool_rec["pool_operational_cert"]
        )
        kes.check_kes_period_info_result(
            kes_output=kes_info_valid, expected_scenario=kes.KesScenarios.ALL_VALID
        )
def test_opcert_future_kes_period(  # noqa: C901
    self,
    cluster_lock_pool2: clusterlib.ClusterLib,
    cluster_manager: cluster_management.ClusterManager,
):
    """Start a stake pool with an operational certificate created with invalid `--kes-period`.

    * generate new operational certificate with `--kes-period` in the future
    * restart the node with the new operational certificate
    * check that the pool is not producing any blocks
    * if network era > Alonzo

       - generate new operational certificate with valid `--kes-period`, but counter value +2
         from last used operational certificate
       - restart the node
       - check that the pool is not producing any blocks

    * generate new operational certificate with valid `--kes-period` and restart the node
    * check that the pool is producing blocks again
    """
    # pylint: disable=too-many-statements,too-many-branches
    pool_name = cluster_management.Resources.POOL2
    node_name = "pool2"
    cluster = cluster_lock_pool2

    temp_template = common.get_test_id(cluster)
    pool_rec = cluster_manager.cache.addrs_data[pool_name]

    # decoded pool id is used to look the pool up in ledger-state block counts
    node_cold = pool_rec["cold_key_pair"]
    stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
    stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

    opcert_file: Path = pool_rec["pool_operational_cert"]
    cold_counter_file: Path = pool_rec["cold_key_pair"].counter_file

    expected_errors = [
        (f"{node_name}.stdout", "PraosCannotForgeKeyNotUsableYet"),
    ]

    if VERSIONS.cluster_era > VERSIONS.ALONZO:
        expected_errors.append((f"{node_name}.stdout", "CounterOverIncrementedOCERT"))
        # In Babbage we get `CounterOverIncrementedOCERT` error if counter for new opcert
        # is not exactly +1 from last used opcert. We'll backup the original counter
        # file so we can use it for issuing next valid opcert.
        # NOTE(review): the backup path is resolved relative to the current working
        # directory, not next to the original counter file — confirm this is intended
        cold_counter_file_orig = Path(
            f"{cold_counter_file.stem}_orig{cold_counter_file.suffix}"
        ).resolve()
        shutil.copy(cold_counter_file, cold_counter_file_orig)

    # these errors are expected side effects of restarting with bad opcerts
    logfiles.add_ignore_rule(
        files_glob="*.stdout",
        regex="MuxBearerClosed|CounterOverIncrementedOCERT",
        ignore_file_id=cluster_manager.worker_id,
    )

    # generate new operational certificate with `--kes-period` in the future
    invalid_opcert_file = cluster.gen_node_operational_cert(
        node_name=f"{node_name}_invalid_opcert_file",
        kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
        cold_skey_file=pool_rec["cold_key_pair"].skey_file,
        cold_counter_file=cold_counter_file,
        kes_period=cluster.get_kes_period() + 100,
    )

    kes_query_currently_broken = False

    # restart_on_failure: recover the shared cluster if this test leaves it broken
    with cluster_manager.restart_on_failure():
        with logfiles.expect_errors(expected_errors, ignore_file_id=cluster_manager.worker_id):
            # restart the node with the new operational certificate
            shutil.copy(invalid_opcert_file, opcert_file)
            cluster_nodes.restart_nodes([node_name])
            cluster.wait_for_new_epoch()

            LOGGER.info("Checking blocks production for 4 epochs.")
            this_epoch = -1
            for invalid_opcert_epoch in range(4):
                # advance to a fresh epoch and save ledger state (module-level helper)
                _wait_epoch_chores(
                    cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
                )
                this_epoch = cluster.get_epoch()

                # check that the pool is not producing any blocks
                blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                    "blocksCurrent"
                ]
                if blocks_made:
                    assert (
                        stake_pool_id_dec not in blocks_made
                    ), f"The pool '{pool_name}' has produced blocks in epoch {this_epoch}"

                if invalid_opcert_epoch == 1:
                    # check kes-period-info with operational certificate with
                    # invalid `--kes-period`
                    # TODO: the query is currently broken
                    try:
                        kes_period_info = cluster.get_kes_period_info(invalid_opcert_file)
                    except clusterlib.CLIError as err:
                        # only swallow the known-broken-query error
                        if "currentlyBroken" not in str(err):
                            raise
                        kes_query_currently_broken = True

                    if not kes_query_currently_broken:
                        kes.check_kes_period_info_result(
                            kes_output=kes_period_info,
                            expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD,
                        )

                # test the `CounterOverIncrementedOCERT` error - the counter will now be +2 from
                # last used opcert counter value
                if invalid_opcert_epoch == 2 and VERSIONS.cluster_era > VERSIONS.ALONZO:
                    overincrement_opcert_file = cluster.gen_node_operational_cert(
                        node_name=f"{node_name}_overincrement_opcert_file",
                        kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                        cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                        cold_counter_file=cold_counter_file,
                        kes_period=cluster.get_kes_period(),
                    )
                    # copy the new certificate and restart the node
                    shutil.copy(overincrement_opcert_file, opcert_file)
                    cluster_nodes.restart_nodes([node_name])

                if invalid_opcert_epoch == 3:
                    # check kes-period-info with operational certificate with
                    # invalid counter
                    # TODO: the query is currently broken, implement once it is fixed
                    pass

        # in Babbage we'll use the original counter for issuing new valid opcert so the counter
        # value of new valid opcert equals to counter value of the original opcert +1
        if VERSIONS.cluster_era > VERSIONS.ALONZO:
            shutil.copy(cold_counter_file_orig, cold_counter_file)

        # generate new operational certificate with valid `--kes-period`
        valid_opcert_file = cluster.gen_node_operational_cert(
            node_name=f"{node_name}_valid_opcert_file",
            kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            cold_counter_file=cold_counter_file,
            kes_period=cluster.get_kes_period(),
        )
        # copy the new certificate and restart the node
        shutil.copy(valid_opcert_file, opcert_file)
        cluster_nodes.restart_nodes([node_name])
        this_epoch = cluster.wait_for_new_epoch()

        LOGGER.info("Checking blocks production for another 2 epochs.")
        blocks_made_db = []
        active_again_epoch = this_epoch
        for __ in range(2):
            _wait_epoch_chores(
                cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
            )
            this_epoch = cluster.get_epoch()

            # check that the pool is producing blocks
            # collected per epoch; at least one epoch must show minted blocks
            blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                "blocksCurrent"
            ]
            blocks_made_db.append(stake_pool_id_dec in blocks_made)

        assert any(blocks_made_db), (
            f"The pool '{pool_name}' has not produced any blocks "
            f"since epoch {active_again_epoch}"
        )

    if kes_query_currently_broken:
        pytest.xfail("`query kes-period-info` is currently broken")
    else:
        # check kes-period-info with valid operational certificate
        kes_period_info = cluster.get_kes_period_info(valid_opcert_file)
        kes.check_kes_period_info_result(
            kes_output=kes_period_info, expected_scenario=kes.KesScenarios.ALL_VALID
        )

        # check kes-period-info with invalid operational certificate, wrong counter and period
        kes_period_info = cluster.get_kes_period_info(invalid_opcert_file)
        kes.check_kes_period_info_result(
            kes_output=kes_period_info,
            expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD
            if VERSIONS.cluster_era > VERSIONS.ALONZO
            else kes.KesScenarios.ALL_INVALID,
        )
def test_opcert_past_kes_period(
    self,
    cluster_lock_pool2: clusterlib.ClusterLib,
    cluster_manager: parallel_run.ClusterManager,
):
    """Start a stake pool with an operational certificate created with expired `--kes-period`.

    * generate new operational certificate with `--kes-period` in the past
    * restart the node with the new operational certificate
    * check that the pool is not producing any blocks
    * generate new operational certificate with valid `--kes-period` and restart the node
    """
    pool_name = "node-pool2"
    node_name = "pool2"
    cluster = cluster_lock_pool2

    temp_template = helpers.get_func_name()
    pool_rec = cluster_manager.cache.addrs_data[pool_name]

    # decoded pool id is used to look the pool up in ledger-state block counts
    node_cold = pool_rec["cold_key_pair"]
    stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
    stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

    # the opcert file the running node reads; it gets overwritten below
    opcert_file: Path = pool_rec["pool_operational_cert"]

    def _wait_epoch_chores(this_epoch: int):
        """Advance to a new epoch (if needed), wait near its end and snapshot ledger state."""
        # wait for next epoch
        if cluster.get_last_block_epoch() == this_epoch:
            cluster.wait_for_new_epoch()

        # wait for the end of the epoch
        time.sleep(clusterlib_utils.time_to_next_epoch_start(cluster) - 5)

        # save ledger state
        clusterlib_utils.save_ledger_state(
            cluster_obj=cluster,
            name_template=f"{temp_template}_{cluster.get_last_block_epoch()}",
        )

    # restart_on_failure: recover the shared cluster if this test leaves it broken
    with cluster_manager.restart_on_failure():
        # generate new operational certificate with `--kes-period` in the past
        invalid_opcert_file = cluster.gen_node_operational_cert(
            node_name=node_name,
            node_kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            node_cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            node_cold_counter_file=pool_rec["cold_key_pair"].counter_file,
            kes_period=cluster.get_last_block_kes_period() - 1,
        )

        # the node is expected to log this error while running with the bad opcert
        expected_errors = [
            (f"{node_name}.stdout", "TPraosCannotForgeKeyNotUsableYet"),
        ]
        with logfiles.expect_errors(expected_errors):
            # restart the node with the new operational certificate
            shutil.copy(invalid_opcert_file, opcert_file)
            devops_cluster.restart_node(node_name)

            LOGGER.info("Checking blocks production for 5 epochs.")
            this_epoch = -1
            for __ in range(5):
                _wait_epoch_chores(this_epoch)
                this_epoch = cluster.get_last_block_epoch()

                # check that the pool is not producing any blocks
                blocks_made = cluster.get_ledger_state()["nesBcur"]["unBlocksMade"]
                if blocks_made:
                    assert (
                        stake_pool_id_dec not in blocks_made
                    ), f"The pool '{pool_name}' has produced blocks in epoch {this_epoch}"

        # generate new operational certificate with valid `--kes-period`
        os.remove(opcert_file)
        valid_opcert_file = cluster.gen_node_operational_cert(
            node_name=node_name,
            node_kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            node_cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            node_cold_counter_file=pool_rec["cold_key_pair"].counter_file,
            kes_period=cluster.get_last_block_kes_period(),
        )
        # copy the new certificate and restart the node
        shutil.move(str(valid_opcert_file), str(opcert_file))
        devops_cluster.restart_node(node_name)

        # fix: the message previously said "3 epochs" while the loop runs for 5
        LOGGER.info("Checking blocks production for another 5 epochs.")
        blocks_made_db = []
        active_again_epoch = this_epoch
        for __ in range(5):
            _wait_epoch_chores(this_epoch)
            this_epoch = cluster.get_last_block_epoch()

            # check that the pool is producing blocks again
            # fix: collect results for every checked epoch and require at least one
            # epoch with minted blocks, instead of flakily testing only the last
            # epoch (matches the newer variant of this test)
            blocks_made = cluster.get_ledger_state()["nesBcur"]["unBlocksMade"]
            blocks_made_db.append(stake_pool_id_dec in blocks_made)

        assert any(blocks_made_db), (
            f"The pool '{pool_name}' has not produced any blocks "
            f"since epoch {active_again_epoch}"
        )