def testenv_setup_teardown(
    tmp_path_factory: TempdirFactory, worker_id: str, request: FixtureRequest
) -> Generator[None, None, None]:
    pytest_root_tmp = temptools.get_pytest_root_tmp(tmp_path_factory)

    with locking.FileLockIfXdist(f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save environment info for Allure
        if not list(pytest_root_tmp.glob(".started_session_*")):
            _save_env_for_allure(request.config)

        helpers.touch(pytest_root_tmp / f".started_session_{worker_id}")

    yield

    with locking.FileLockIfXdist(f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save CLI coverage to dir specified by `--cli-coverage-dir`
        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        # perform cleanup if this is the last running pytest worker
        (pytest_root_tmp / f".started_session_{worker_id}").unlink()
        if not list(pytest_root_tmp.glob(".started_session_*")):
            # perform testnet cleanup
            _testnet_cleanup(pytest_root_tmp=pytest_root_tmp)

            if configuration.DEV_CLUSTER_RUNNING:
                # save cluster artifacts
                artifacts_base_dir = request.config.getoption("--artifacts-base-dir")
                if artifacts_base_dir:
                    state_dir = cluster_nodes.get_cluster_env().state_dir
                    artifacts.save_cluster_artifacts(save_dir=pytest_root_tmp, state_dir=state_dir)
            else:
                # stop all cluster instances, save artifacts
                _stop_all_cluster_instances(
                    tmp_path_factory=tmp_path_factory,
                    worker_id=worker_id,
                    pytest_config=request.config,
                )

            # copy collected artifacts to dir specified by `--artifacts-base-dir`
            artifacts.copy_artifacts(pytest_tmp_dir=pytest_root_tmp, pytest_config=request.config)

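# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test suite). The fixture above uses a
# "last worker wins" pattern for pytest-xdist: every worker touches a marker
# file on session start and removes it on teardown, and whichever worker
# deletes the last marker performs the shared cleanup. A minimal standalone
# version of that pattern; `FileLock` from the `filelock` package is an
# assumed stand-in for `locking.FileLockIfXdist`.
import pathlib

from filelock import FileLock


def mark_session_started(root: pathlib.Path, worker_id: str) -> None:
    with FileLock(str(root / "session.lock")):
        (root / f".started_session_{worker_id}").touch()


def mark_session_finished(root: pathlib.Path, worker_id: str) -> bool:
    """Remove this worker's marker; return True if it was the last one running."""
    with FileLock(str(root / "session.lock")):
        (root / f".started_session_{worker_id}").unlink()
        return not list(root.glob(".started_session_*"))
# ---------------------------------------------------------------------------
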
def set_needs_restart(self) -> None:
    """Indicate that the cluster instance needs restart."""
    with locking.FileLockIfXdist(self.cluster_lock):
        self._log(f"c{self.cluster_instance_num}: called `set_needs_restart`")
        helpers.touch(self.instance_dir / f"{RESTART_NEEDED_GLOB}_{self.worker_id}")

def slot_length_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *slotLength* to 0.3."""
    shared_tmp = temptools.get_pytest_shared_tmp(tmp_path_factory)

    # need to lock because this same fixture can run on several workers in parallel
    with locking.FileLockIfXdist(f"{shared_tmp}/startup_files_slot_03.lock"):
        destdir = shared_tmp / "startup_files_slot_03"
        destdir.mkdir(exist_ok=True)

        # return the existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )

        with open(startup_files.genesis_spec, encoding="utf-8") as fp_in:
            genesis_spec = json.load(fp_in)

        genesis_spec["slotLength"] = 0.3

        with open(startup_files.genesis_spec, "w", encoding="utf-8") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script

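# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test suite). The fixture above follows
# a "generate once, reuse everywhere" pattern: the first worker to take the
# lock generates the customized startup files, and every later worker finds
# them already present and returns them without regenerating. The same idea in
# isolation; `FileLock` and the `generate` callable are assumptions made for
# this sketch.
import pathlib
from typing import Callable

from filelock import FileLock


def get_or_generate(
    shared_tmp: pathlib.Path, name: str, generate: Callable[[pathlib.Path], pathlib.Path]
) -> pathlib.Path:
    with FileLock(str(shared_tmp / f"{name}.lock")):
        destdir = shared_tmp / name
        destdir.mkdir(exist_ok=True)
        # reuse the artifact if another worker already generated it
        existing = list(destdir.glob("start-cluster*"))
        if existing:
            return existing[0]
        return generate(destdir)
# ---------------------------------------------------------------------------
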
def _log(self, msg: str) -> None:
    """Log message."""
    if not configuration.SCHEDULING_LOG:
        return

    with locking.FileLockIfXdist(self.log_lock), open(
        configuration.SCHEDULING_LOG, "a", encoding="utf-8"
    ) as logfile:
        logfile.write(f"{datetime.datetime.now()} on {self.worker_id}: {msg}\n")

def add_ignore_rule(files_glob: str, regex: str, ignore_file_id: str) -> None:
    """Add ignore rule for expected errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (
        temptools.get_basetemp() / f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock"
    )

    with locking.FileLockIfXdist(lock_file), open(rules_file, "a", encoding="utf-8") as outfile:
        outfile.write(f"{files_glob};;{regex}\n")

def clean_ignore_rules(ignore_file_id: str) -> None:
    """Cleanup relevant ignore rules file.

    Delete ignore file identified by `ignore_file_id` when it is no longer valid.
    """
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (
        temptools.get_basetemp() / f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock"
    )

    with locking.FileLockIfXdist(lock_file):
        rules_file.unlink(missing_ok=True)

def search_cluster_artifacts() -> List[Tuple[Path, str]]:
    """Search cluster artifacts for errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    lock_file = temptools.get_basetemp() / f"search_artifacts_{cluster_env.instance_num}.lock"

    with locking.FileLockIfXdist(lock_file):
        ignore_rules = _get_ignore_rules(cluster_env=cluster_env)

        errors = []
        for logfile in cluster_env.state_dir.glob("*.std*"):
            # skip if the log file is an offset (status) file or a rotated log
            if logfile.name.endswith(".offset") or ROTATED_RE.match(logfile.name):
                continue

            # read seek offset (from where to start searching) and timestamp of last search
            offset_file = logfile.parent / f".{logfile.name}.offset"
            if offset_file.exists():
                seek = _get_seek(offset_file)
                timestamp = os.path.getmtime(offset_file)
            else:
                seek = 0
                timestamp = 0.0

            errors_ignored = _get_ignore_regex(
                ignore_rules=ignore_rules, regexes=ERRORS_IGNORED, logfile=logfile
            )
            errors_ignored_re = re.compile(errors_ignored)

            # record offset for the "live" log file
            with open(offset_file, "w", encoding="utf-8") as outfile:
                outfile.write(str(helpers.get_eof_offset(logfile)))

            for logfile_rec in _get_rotated_logs(logfile=logfile, seek=seek, timestamp=timestamp):
                with open(logfile_rec.logfile, encoding="utf-8") as infile:
                    # seek to the offset recorded for this particular file,
                    # not the offset of the "live" log
                    infile.seek(logfile_rec.seek)
                    for line in infile:
                        if ERRORS_RE.search(line) and not (
                            errors_ignored and errors_ignored_re.search(line)
                        ):
                            errors.append((logfile, line))

    return errors

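# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test suite). The search above is
# incremental: a hidden `.<name>.offset` file stores how far previous scans
# got, so each pass reads only lines appended since. The technique in
# isolation; `ERROR_RE` and `grep_new_lines` are made-up names for this
# sketch.
import pathlib
import re
from typing import List

ERROR_RE = re.compile("error|failed", re.IGNORECASE)


def grep_new_lines(logfile: pathlib.Path) -> List[str]:
    """Return matching lines added to `logfile` since the previous call."""
    offset_file = logfile.parent / f".{logfile.name}.offset"
    seek = int(offset_file.read_text()) if offset_file.exists() else 0

    matches = []
    with open(logfile, encoding="utf-8") as infile:
        infile.seek(seek)
        for line in infile:
            if ERROR_RE.search(line):
                matches.append(line)
        # remember the end-of-file position for the next scan
        offset_file.write_text(str(infile.tell()))
    return matches
# ---------------------------------------------------------------------------
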
def _get_ignore_rules(cluster_env: cluster_nodes.ClusterEnv) -> List[Tuple[str, str]]:
    """Get rules (file glob and regex) for ignored errors."""
    rules: List[Tuple[str, str]] = []
    lock_file = (
        temptools.get_basetemp() / f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock"
    )

    with locking.FileLockIfXdist(lock_file):
        for rules_file in cluster_env.state_dir.glob(f"{ERRORS_IGNORE_FILE_NAME}_*"):
            with open(rules_file, encoding="utf-8") as infile:
                for line in infile:
                    if ";;" not in line:
                        continue
                    # split only on the first separator so a regex containing
                    # ";;" doesn't break the unpacking
                    files_glob, regex = line.split(";;", maxsplit=1)
                    rules.append((files_glob, regex.rstrip("\n")))

    return rules

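# ---------------------------------------------------------------------------
# Illustrative example (values are made up). Rules files written by
# `add_ignore_rule` and parsed by `_get_ignore_rules` use one rule per line in
# the form `files_glob;;regex`:
line = "*.stdout;;MuxError MuxBearerClosed\n"
files_glob, regex = line.split(";;", maxsplit=1)
assert (files_glob, regex.rstrip("\n")) == ("*.stdout", "MuxError MuxBearerClosed")
# ---------------------------------------------------------------------------
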
def on_test_stop(self) -> None:
    """Perform actions after a test is finished."""
    if self._cluster_instance_num == -1:
        return

    self._log(f"c{self._cluster_instance_num}: called `on_test_stop`")

    # search for errors in cluster logfiles
    errors = logfiles.search_cluster_artifacts()

    with locking.FileLockIfXdist(self.cluster_lock):
        # There's only one test running on a worker at a time. Deleting the corresponding rules
        # file right after a test is finished is therefore safe. The effect is that the rules
        # apply only from the time they were added (by `logfiles.add_ignore_rule`) until the end
        # of the test.
        # However, sometimes we don't want to remove the rules file. Imagine a situation where a
        # test failed and the cluster instance needs to be restarted. The failed test already
        # finished, but other tests are still running and need to finish first before the
        # restart can happen. If the ignored error continues to get printed into the log file,
        # tests that are still running on the cluster instance would report that error.
        # Therefore, if the cluster instance is scheduled for restart, don't delete the rules
        # file.
        if not list(self.instance_dir.glob(f"{RESTART_NEEDED_GLOB}_*")):
            logfiles.clean_ignore_rules(ignore_file_id=self.worker_id)

        # remove resource locking files created by the worker
        resource_locking_files = list(
            self.instance_dir.glob(f"{RESOURCE_LOCKED_GLOB}_*_{self.worker_id}")
        )
        for f in resource_locking_files:
            f.unlink()

        # remove "resource in use" files created by the worker
        resource_in_use_files = list(
            self.instance_dir.glob(f"{RESOURCE_IN_USE_GLOB}_*_{self.worker_id}")
        )
        for f in resource_in_use_files:
            f.unlink()

        # remove file that indicates that a test is running on the worker
        (self.instance_dir / f"{TEST_RUNNING_GLOB}_{self.worker_id}").unlink(missing_ok=True)

    if errors:
        logfiles.report_artifacts_errors(errors)

def short_kes_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *slotsPerKESPeriod* and *maxKESEvolutions*."""
    shared_tmp = temptools.get_pytest_shared_tmp(tmp_path_factory)
    max_kes_evolutions = 10

    # need to lock because this same fixture can run on several workers in parallel
    with locking.FileLockIfXdist(f"{shared_tmp}/startup_files_short_kes.lock"):
        destdir = shared_tmp / "startup_files_short_kes"
        destdir.mkdir(exist_ok=True)

        # return the existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )

        with open(startup_files.genesis_spec, encoding="utf-8") as fp_in:
            genesis_spec = json.load(fp_in)

        # KES needs to be valid at least until the local cluster is fully started.
        # We need to calculate how many slots there are from the start of the Shelley epoch
        # until the cluster is fully started.
        # Assume k=10, i.e. k * 10 = 100 slots in the Byron era.
        # Subtract one Byron epoch and the current (last) epoch when calculating slots in
        # Shelley epochs.
        epoch_length = genesis_spec["epochLength"]
        cluster_start_time_slots = int((NUM_OF_EPOCHS - 2) * epoch_length + 100)
        exact_kes_period_slots = int(cluster_start_time_slots / max_kes_evolutions)

        genesis_spec["slotsPerKESPeriod"] = int(exact_kes_period_slots * 1.2)  # add buffer
        genesis_spec["maxKESEvolutions"] = max_kes_evolutions

        with open(startup_files.genesis_spec, "w", encoding="utf-8") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script

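# ---------------------------------------------------------------------------
# Worked example of the arithmetic above, with assumed values (epoch_length
# and num_of_epochs here are illustrative, not the suite's real settings).
epoch_length = 1_000  # assumed value of genesis_spec["epochLength"]
num_of_epochs = 5     # assumed stand-in for NUM_OF_EPOCHS
max_kes_evolutions = 10

cluster_start_time_slots = (num_of_epochs - 2) * epoch_length + 100      # 3100
exact_kes_period_slots = cluster_start_time_slots // max_kes_evolutions  # 310
slots_per_kes_period = int(exact_kes_period_slots * 1.2)                 # 372

# KES keys are then valid for 10 * 372 = 3720 slots, comfortably longer than
# the 3100 slots the cluster needs to get fully started.
assert max_kes_evolutions * slots_per_kes_period >= cluster_start_time_slots
# ---------------------------------------------------------------------------
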
def return_funds_to_faucet(
    *src_addrs: clusterlib.AddressRecord,
    cluster_obj: clusterlib.ClusterLib,
    faucet_addr: str,
    amount: Union[int, List[int]] = -1,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from all `src_addrs` to `faucet_addr`.

    The amount of "-1" means all available funds.
    """
    tx_name = tx_name or helpers.get_timestamped_rand_str()
    tx_name = f"{tx_name}_return_funds"
    if isinstance(amount, int):
        amount = [amount] * len(src_addrs)

    with locking.FileLockIfXdist(f"{temptools.get_basetemp()}/{faucet_addr}.lock"):
        try:
            logging.disable(logging.ERROR)
            for addr, amount_rec in zip(src_addrs, amount):
                fund_dst = [clusterlib.TxOut(address=faucet_addr, amount=amount_rec)]
                fund_tx_files = clusterlib.TxFiles(signing_key_files=[addr.skey_file])
                # try to return funds; don't mind if there aren't enough funds for fees etc.
                with contextlib.suppress(Exception):
                    cluster_obj.send_funds(
                        src_address=addr.address,
                        destinations=fund_dst,
                        tx_name=tx_name,
                        tx_files=fund_tx_files,
                        destination_dir=destination_dir,
                    )
        finally:
            logging.disable(logging.NOTSET)

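# ---------------------------------------------------------------------------
# Hypothetical usage sketch: return leftover funds at the end of a test.
# `cluster` and `payment_addrs` stand in for the usual test fixtures and
# FAUCET_ADDR for a configured faucet address; none of them are defined here.
return_funds_to_faucet(
    *payment_addrs,
    cluster_obj=cluster,
    faucet_addr=FAUCET_ADDR,
    amount=-1,  # -1 means "all available funds"
    tx_name="cleanup",
)
# ---------------------------------------------------------------------------
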
def get(  # noqa: C901
    self,
    mark: str = "",
    lock_resources: Iterable[str] = (),
    use_resources: Iterable[str] = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow the test to start
    right away.
    """
    # pylint: disable=too-many-statements,too-many-branches
    assert not isinstance(lock_resources, str), "`lock_resources` must be sequence of strings"
    assert not isinstance(use_resources, str), "`use_resources` must be sequence of strings"

    if configuration.DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set."
            )
        # check if the development cluster instance is ready by now so we don't need to obtain
        # cluster lock when it is not necessary
        if not self._is_dev_cluster_ready():
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                self._setup_dev_cluster()

    if configuration.FORBID_RESTART and start_cmd:
        raise RuntimeError("Cannot use custom start command when 'FORBID_RESTART' is set.")

    if start_cmd:
        if not (mark or (Resources.CLUSTER in lock_resources)):
            raise RuntimeError(
                "Custom start command can be used only together with singleton or `mark`."
            )
        # always clean after test(s) that started cluster with custom configuration
        cleanup = True

    # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
    # list of `use_resources`.
    use_resources = list(set(use_resources).union({Resources.CLUSTER}) - set(lock_resources))

    cget_status = ClusterGetStatus(
        mark=mark,
        lock_resources=lock_resources,
        use_resources=use_resources,
        cleanup=cleanup,
        start_cmd=start_cmd,
        current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
    )
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    self.cm._log(f"want to run test '{cget_status.current_test}'")

    # iterate until it is possible to start the test
    while True:
        if cget_status.restart_ready:
            self._restart(start_cmd=start_cmd)

        if not cget_status.first_iteration:
            xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

        # nothing time consuming can go under this lock as all other workers will need to wait
        with locking.FileLockIfXdist(self.cm.cluster_lock):
            if self._is_already_running(cget_status):
                if not self.cm.cache.cluster_obj:
                    raise AssertionError("`cluster_obj` not available, that cannot happen")
                return self.cm.cache.cluster_obj

            # needs to be set here, before the first `continue`
            cget_status.first_iteration = False
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                    continue

                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if (
                    cget_status.selected_instance != -1
                    and instance_num != cget_status.selected_instance
                ):
                    continue

                cget_status.instance_num = instance_num
                cget_status.instance_dir = (
                    self.cm.pytest_tmp_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                )
                cget_status.instance_dir.mkdir(exist_ok=True)

                # cleanup cluster instance where attempt to start cluster failed repeatedly
                if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                    self._cleanup_dead_clusters(cget_status)
                    continue

                # cluster restart planned or in progress, so no new tests can start
                if self._restarted_by_other_worker(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # are there tests already running on this cluster instance?
                cget_status.started_tests_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_RUNNING_GLOB}_*")
                )

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                cget_status.marked_starting_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*")
                )
                cget_status.marked_running_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*")
                )

                # if marked tests are already running, update their status
                self._update_marked_tests(
                    marked_tests_cache=marked_tests_cache, cget_status=cget_status
                )

                # test has mark
                if mark:
                    # select this instance for running marked tests if possible
                    if not self._marked_select_instance(cget_status):
                        cget_status.sleep_delay = 2
                        continue

                    # check if we need to wait until unmarked tests are finished
                    if (
                        not cget_status.marked_running_sfiles
                        and cget_status.started_tests_sfiles
                    ):
                        cget_status.sleep_delay = 10
                        continue

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'"
                    )

                # no unmarked test can run while marked tests are starting or running
                elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark"
                    )
                    cget_status.sleep_delay = 2
                    continue

                # check availability of the required resources
                if not self._are_resources_available(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # if restart is needed, indicate that the cluster will be restarted
                # (after all currently running tests are finished)
                if not self._init_restart(cget_status):
                    continue

                # we've found a suitable cluster instance
                cget_status.selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                self.cm._log(f"c{instance_num}: can run test '{cget_status.current_test}'")

                # set environment variables that are needed when restarting the cluster
                # and running tests
                cluster_nodes.set_cluster_env(instance_num)

                # if needed, finish restart related actions
                if not self._finish_restart(cget_status):
                    continue

                # from this point on, all conditions needed to start the test are met
                break
            else:
                # if the test cannot start on any instance, return to the top-level loop
                continue

            self._create_test_status_files(cget_status)

            # Check if it is necessary to reload data. This still needs to happen under
            # global lock.
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            # cluster is ready, we can start the test
            break

    cluster_obj = self.cm.cache.cluster_obj
    if not cluster_obj:
        raise AssertionError("`cluster_obj` not available, that cannot happen")
    cluster_obj.cluster_id = instance_num
    cluster_obj._cluster_manager = self.cm  # type: ignore

    return cluster_obj

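# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the test suite). Stripped of the project
# specifics, the `get` method above is a poll-under-lock loop: take the global
# lock, inspect status files, either claim an instance or record a back-off
# delay, release the lock, sleep with jitter, retry. A minimal skeleton of
# that pattern; `FileLock` and `try_claim_instance` are assumed stand-ins for
# `locking.FileLockIfXdist` and the per-instance checks.
import random
import time
from typing import Callable, Optional

from filelock import FileLock


def acquire_instance(
    lock_path: str,
    try_claim_instance: Callable[[], Optional[int]],  # hypothetical per-instance checks
) -> int:
    """Poll under a global lock until some cluster instance can be claimed."""
    sleep_delay = 1.0
    while True:
        # nothing time consuming may run under the lock; other workers are waiting
        with FileLock(lock_path):
            instance_num = try_claim_instance()
            if instance_num is not None:
                return instance_num
        # back off outside the lock, with jitter so workers don't wake in lockstep
        time.sleep(random.uniform(0.6, 1.2) * sleep_delay)
        sleep_delay = min(sleep_delay * 2, 10.0)
# ---------------------------------------------------------------------------
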
def fund_from_genesis(
    *dst_addrs: str,
    cluster_obj: clusterlib.ClusterLib,
    amount: int = 2_000_000,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from genesis addr to all `dst_addrs`."""
    fund_dst = [
        clusterlib.TxOut(address=d, amount=amount)
        for d in dst_addrs
        if cluster_obj.get_address_balance(d) < amount
    ]
    if not fund_dst:
        return

    with locking.FileLockIfXdist(
        f"{temptools.get_basetemp()}/{cluster_obj.genesis_utxo_addr}.lock"
    ):
        tx_name = tx_name or helpers.get_timestamped_rand_str()
        tx_name = f"{tx_name}_genesis_funding"
        fund_tx_files = clusterlib.TxFiles(
            signing_key_files=[
                *cluster_obj.genesis_keys.delegate_skeys,
                cluster_obj.genesis_keys.genesis_utxo_skey,
            ]
        )

        cluster_obj.send_funds(
            src_address=cluster_obj.genesis_utxo_addr,
            destinations=fund_dst,
            tx_name=tx_name,
            tx_files=fund_tx_files,
            destination_dir=destination_dir,
        )