Example #1
def testenv_setup_teardown(
        tmp_path_factory: TempdirFactory, worker_id: str,
        request: FixtureRequest) -> Generator[None, None, None]:
    pytest_root_tmp = temptools.get_pytest_root_tmp(tmp_path_factory)

    with locking.FileLockIfXdist(
            f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save environment info for Allure
        if not list(pytest_root_tmp.glob(".started_session_*")):
            _save_env_for_allure(request.config)

        helpers.touch(pytest_root_tmp / f".started_session_{worker_id}")

    yield

    with locking.FileLockIfXdist(
            f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save CLI coverage to dir specified by `--cli-coverage-dir`
        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory,
            worker_id=worker_id,
            pytest_config=request.config)
        cluster_manager_obj.save_worker_cli_coverage()

        # perform cleanup if this is the last running pytest worker
        (pytest_root_tmp / f".started_session_{worker_id}").unlink()
        if not list(pytest_root_tmp.glob(".started_session_*")):
            # perform testnet cleanup
            _testnet_cleanup(pytest_root_tmp=pytest_root_tmp)

            if configuration.DEV_CLUSTER_RUNNING:
                # save cluster artifacts
                artifacts_base_dir = request.config.getoption(
                    "--artifacts-base-dir")
                if artifacts_base_dir:
                    state_dir = cluster_nodes.get_cluster_env().state_dir
                    artifacts.save_cluster_artifacts(save_dir=pytest_root_tmp,
                                                     state_dir=state_dir)
            else:
                # stop all cluster instances, save artifacts
                _stop_all_cluster_instances(
                    tmp_path_factory=tmp_path_factory,
                    worker_id=worker_id,
                    pytest_config=request.config,
                )

            # copy collected artifacts to dir specified by `--artifacts-base-dir`
            artifacts.copy_artifacts(pytest_tmp_dir=pytest_root_tmp,
                                     pytest_config=request.config)
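
Every example in this listing guards shared on-disk state with `locking.FileLockIfXdist`.
As a rough mental model (an assumption, not the project's actual code), such a helper
takes a real inter-process lock only when the tests run under pytest-xdist and is a
no-op otherwise:

import contextlib
import os
from pathlib import Path
from typing import ContextManager, Union

import filelock


def file_lock_if_xdist(lock_file: Union[str, Path]) -> ContextManager:
    """Return an inter-process lock under pytest-xdist, a no-op otherwise (sketch)."""
    if os.environ.get("PYTEST_XDIST_WORKER"):
        # several workers may touch the same shared files, so take a real lock
        return filelock.FileLock(str(lock_file))
    # single process, nothing to race with
    return contextlib.nullcontext()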
Example #2
    def set_needs_restart(self) -> None:
        """Indicate that the cluster instance needs restart."""
        with locking.FileLockIfXdist(self.cluster_lock):
            self._log(
                f"c{self.cluster_instance_num}: called `set_needs_restart`")
            helpers.touch(self.instance_dir /
                          f"{RESTART_NEEDED_GLOB}_{self.worker_id}")
Example #3
def slot_length_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *slotLength* to 0.3."""
    shared_tmp = temptools.get_pytest_shared_tmp(tmp_path_factory)

    # need to lock because this same fixture can run on several workers in parallel
    with locking.FileLockIfXdist(f"{shared_tmp}/startup_files_slot_03.lock"):
        destdir = shared_tmp / "startup_files_slot_03"
        destdir.mkdir(exist_ok=True)

        # return the existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )
        with open(startup_files.genesis_spec, encoding="utf-8") as fp_in:
            genesis_spec = json.load(fp_in)

        genesis_spec["slotLength"] = 0.3

        with open(startup_files.genesis_spec, "w", encoding="utf-8") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script
Example #4
    def _log(self, msg: str) -> None:
        """Log message."""
        if not configuration.SCHEDULING_LOG:
            return

        with locking.FileLockIfXdist(
                self.log_lock), open(configuration.SCHEDULING_LOG,
                                     "a",
                                     encoding="utf-8") as logfile:
            logfile.write(
                f"{datetime.datetime.now()} on {self.worker_id}: {msg}\n")
Example #5
def add_ignore_rule(files_glob: str, regex: str, ignore_file_id: str) -> None:
    """Add ignore rule for expected errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (temptools.get_basetemp() /
                 f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file), open(rules_file,
                                                  "a",
                                                  encoding="utf-8") as outfile:
        outfile.write(f"{files_glob};;{regex}\n")
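
For illustration only (the glob, regex and worker id below are made up, not taken from
the project), a call such as

    add_ignore_rule(
        files_glob="*.stdout",
        regex="Connection refused",
        ignore_file_id="gw0",  # typically the pytest-xdist worker id
    )

appends the line `*.stdout;;Connection refused` to that worker's rules file, the same
`<files_glob>;;<regex>` format that `_get_ignore_rules` (Example #8) parses.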
Example #6
def clean_ignore_rules(ignore_file_id: str) -> None:
    """Cleanup relevant ignore rules file.

    Delete ignore file identified by `ignore_file_id` when it is no longer valid.
    """
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (temptools.get_basetemp() /
                 f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file):
        rules_file.unlink(missing_ok=True)
Example #7
def search_cluster_artifacts() -> List[Tuple[Path, str]]:
    """Search cluster artifacts for errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    lock_file = (temptools.get_basetemp() /
                 f"search_artifacts_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file):
        ignore_rules = _get_ignore_rules(cluster_env=cluster_env)

        errors = []
        for logfile in cluster_env.state_dir.glob("*.std*"):
            # skip if the log file is a status file or a rotated log
            if logfile.name.endswith(".offset") or ROTATED_RE.match(
                    logfile.name):
                continue

            # read the seek offset (where to start searching) and the timestamp of the last search
            offset_file = logfile.parent / f".{logfile.name}.offset"
            if offset_file.exists():
                seek = _get_seek(offset_file)
                timestamp = os.path.getmtime(offset_file)
            else:
                seek = 0
                timestamp = 0.0

            errors_ignored = _get_ignore_regex(ignore_rules=ignore_rules,
                                               regexes=ERRORS_IGNORED,
                                               logfile=logfile)
            errors_ignored_re = re.compile(errors_ignored)

            # record offset for the "live" log file
            with open(offset_file, "w", encoding="utf-8") as outfile:
                outfile.write(str(helpers.get_eof_offset(logfile)))

            for logfile_rec in _get_rotated_logs(logfile=logfile,
                                                 seek=seek,
                                                 timestamp=timestamp):
                with open(logfile_rec.logfile, encoding="utf-8") as infile:
                    infile.seek(seek)
                    for line in infile:
                        if ERRORS_RE.search(line) and not (
                                errors_ignored
                                and errors_ignored_re.search(line)):
                            errors.append((logfile, line))

    return errors
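
A plausible sketch of the `_get_seek` helper used above (the project's real
implementation may differ): the offset file simply stores the byte position where the
previous search stopped, as written at the end of each run via `helpers.get_eof_offset`.

def _get_seek(fpath: Path) -> int:
    """Return the stored seek offset, falling back to 0 on a malformed file (sketch)."""
    with open(fpath, encoding="utf-8") as infile:
        try:
            return int(infile.readline().strip())
        except ValueError:
            return 0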
Example #8
def _get_ignore_rules(
        cluster_env: cluster_nodes.ClusterEnv) -> List[Tuple[str, str]]:
    """Get rules (file glob and regex) for ignored errors."""
    rules: List[Tuple[str, str]] = []
    lock_file = (temptools.get_basetemp() /
                 f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file):
        for rules_file in cluster_env.state_dir.glob(
                f"{ERRORS_IGNORE_FILE_NAME}_*"):
            with open(rules_file, encoding="utf-8") as infile:
                for line in infile:
                    if ";;" not in line:
                        continue
                    files_glob, regex = line.split(";;")
                    rules.append((files_glob, regex.rstrip("\n")))

    return rules
Example #9
    def on_test_stop(self) -> None:
        """Perform actions after a test is finished."""
        if self._cluster_instance_num == -1:
            return

        self._log(f"c{self._cluster_instance_num}: called `on_test_stop`")

        # search for errors in cluster logfiles
        errors = logfiles.search_cluster_artifacts()

        with locking.FileLockIfXdist(self.cluster_lock):
            # There's only one test running on a worker at a time. Deleting the corresponding rules
            # file right after a test finishes is therefore safe. The effect is that the rules
            # apply only from the time they were added (by `logfiles.add_ignore_rule`) until the
            # end of the test.
            # However, sometimes we don't want to remove the rules file. Imagine a situation where
            # a test failed and the cluster instance needs to be restarted. The failed test already
            # finished, but other tests are still running and need to finish before the restart can
            # happen. If the ignored error keeps getting printed into a log file, the tests still
            # running on the cluster instance would report that error. Therefore, if the cluster
            # instance is scheduled for restart, don't delete the rules file.
            if not list(self.instance_dir.glob(f"{RESTART_NEEDED_GLOB}_*")):
                logfiles.clean_ignore_rules(ignore_file_id=self.worker_id)

            # remove resource locking files created by the worker
            resource_locking_files = list(
                self.instance_dir.glob(
                    f"{RESOURCE_LOCKED_GLOB}_*_{self.worker_id}"))
            for f in resource_locking_files:
                f.unlink()

            # remove "resource in use" files created by the worker
            resource_in_use_files = list(
                self.instance_dir.glob(
                    f"{RESOURCE_IN_USE_GLOB}_*_{self.worker_id}"))
            for f in resource_in_use_files:
                f.unlink()

            # remove file that indicates that a test is running on the worker
            (self.instance_dir /
             f"{TEST_RUNNING_GLOB}_{self.worker_id}").unlink(missing_ok=True)

        if errors:
            logfiles.report_artifacts_errors(errors)
Example #10
def short_kes_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *slotsPerKESPeriod* and *maxKESEvolutions*."""
    shared_tmp = temptools.get_pytest_shared_tmp(tmp_path_factory)
    max_kes_evolutions = 10

    # need to lock because this same fixture can run on several workers in parallel
    with locking.FileLockIfXdist(f"{shared_tmp}/startup_files_short_kes.lock"):
        destdir = shared_tmp / "startup_files_short_kes"
        destdir.mkdir(exist_ok=True)

        # return the existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )
        with open(startup_files.genesis_spec, encoding="utf-8") as fp_in:
            genesis_spec = json.load(fp_in)

        # KES needs to be valid at least until the local cluster is fully started.
        # We need to calculate how many slots there are from the start of the Shelley epoch
        # until the cluster is fully started.
        # Assume k=10, i.e. k * 10 = 100 slots in the Byron era.
        # Subtract one Byron epoch and the current (last) epoch when calculating slots in
        # Shelley epochs.
        epoch_length = genesis_spec["epochLength"]
        cluster_start_time_slots = int((NUM_OF_EPOCHS - 2) * epoch_length + 100)
        exact_kes_period_slots = int(cluster_start_time_slots / max_kes_evolutions)
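        # Worked example with illustrative numbers (not this repo's actual configuration):
        # with NUM_OF_EPOCHS = 5 and epochLength = 1000,
        # cluster_start_time_slots = (5 - 2) * 1000 + 100 = 3100,
        # exact_kes_period_slots = 3100 // 10 = 310, and slotsPerKESPeriod below becomes 372.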

        genesis_spec["slotsPerKESPeriod"] = int(exact_kes_period_slots * 1.2)  # add buffer
        genesis_spec["maxKESEvolutions"] = max_kes_evolutions

        with open(startup_files.genesis_spec, "w", encoding="utf-8") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script
def return_funds_to_faucet(
    *src_addrs: clusterlib.AddressRecord,
    cluster_obj: clusterlib.ClusterLib,
    faucet_addr: str,
    amount: Union[int, List[int]] = -1,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from all `src_addrs` to `faucet_addr`.

    The amount of "-1" means all available funds.
    """
    tx_name = tx_name or helpers.get_timestamped_rand_str()
    tx_name = f"{tx_name}_return_funds"
    if isinstance(amount, int):
        amount = [amount] * len(src_addrs)

    with locking.FileLockIfXdist(
            f"{temptools.get_basetemp()}/{faucet_addr}.lock"):
        try:
            logging.disable(logging.ERROR)
            for addr, amount_rec in zip(src_addrs, amount):
                fund_dst = [
                    clusterlib.TxOut(address=faucet_addr, amount=amount_rec)
                ]
                fund_tx_files = clusterlib.TxFiles(
                    signing_key_files=[addr.skey_file])
                # try to return funds; don't mind if there aren't enough funds for fees etc.
                with contextlib.suppress(Exception):
                    cluster_obj.send_funds(
                        src_address=addr.address,
                        destinations=fund_dst,
                        tx_name=tx_name,
                        tx_files=fund_tx_files,
                        destination_dir=destination_dir,
                    )
        finally:
            logging.disable(logging.NOTSET)
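
A hypothetical call (the address records, cluster object and names below are
placeholders): with the default `amount=-1` the function tries to send back everything
left on the given addresses and, thanks to `contextlib.suppress`, silently ignores
addresses that cannot even cover the fee.

    return_funds_to_faucet(
        payment_addr_1,
        payment_addr_2,
        cluster_obj=cluster,
        faucet_addr=faucet_address,
        tx_name="cleanup",
    )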
    def get(  # noqa: C901
        self,
        mark: str = "",
        lock_resources: Iterable[str] = (),
        use_resources: Iterable[str] = (),
        cleanup: bool = False,
        start_cmd: str = "",
    ) -> clusterlib.ClusterLib:
        """Return the `clusterlib.ClusterLib` instance once we can start the test.

        It checks current conditions and waits if they don't allow the test to start
        right away.
        """
        # pylint: disable=too-many-statements,too-many-branches
        assert not isinstance(
            lock_resources,
            str), "`lock_resources` must be sequence of strings"
        assert not isinstance(
            use_resources, str), "`use_resources` must be sequence of strings"

        if configuration.DEV_CLUSTER_RUNNING:
            if start_cmd:
                LOGGER.warning(
                    f"Ignoring the '{start_cmd}' cluster start command as "
                    "'DEV_CLUSTER_RUNNING' is set.")
            # check if the development cluster instance is ready by now, so that we don't
            # obtain the cluster lock when it is not necessary
            if not self._is_dev_cluster_ready():
                with locking.FileLockIfXdist(self.cm.cluster_lock):
                    self._setup_dev_cluster()

        if configuration.FORBID_RESTART and start_cmd:
            raise RuntimeError(
                "Cannot use custom start command when 'FORBID_RESTART' is set."
            )

        if start_cmd:
            if not (mark or (Resources.CLUSTER in lock_resources)):
                raise RuntimeError(
                    "Custom start command can be used only together with singleton or `mark`."
                )
            # always clean after test(s) that started cluster with custom configuration
            cleanup = True

        # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
        # list of `use_resources`.
        use_resources = list(
            set(use_resources).union({Resources.CLUSTER}) -
            set(lock_resources))

        cget_status = ClusterGetStatus(
            mark=mark,
            lock_resources=lock_resources,
            use_resources=use_resources,
            cleanup=cleanup,
            start_cmd=start_cmd,
            current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
        )
        marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

        self.cm._log(f"want to run test '{cget_status.current_test}'")

        # iterate until it is possible to start the test
        while True:
            if cget_status.restart_ready:
                self._restart(start_cmd=start_cmd)

            if not cget_status.first_iteration:
                xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

            # nothing time-consuming can go under this lock, as all other workers will need to wait
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                if self._is_already_running(cget_status):
                    if not self.cm.cache.cluster_obj:
                        raise AssertionError(
                            "`cluster_obj` not available, that cannot happen")
                    return self.cm.cache.cluster_obj

                # needs to be set here, before the first `continue`
                cget_status.first_iteration = False
                self.cm._cluster_instance_num = -1

                # try all existing cluster instances
                for instance_num in range(self.cm.num_of_instances):
                    # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                    if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                        continue

                    # if the instance to run the test on was already decided, skip all other instances
                    # pylint: disable=consider-using-in
                    if (cget_status.selected_instance != -1
                            and instance_num != cget_status.selected_instance):
                        continue

                    cget_status.instance_num = instance_num
                    cget_status.instance_dir = (
                        self.cm.pytest_tmp_dir /
                        f"{CLUSTER_DIR_TEMPLATE}{instance_num}")
                    cget_status.instance_dir.mkdir(exist_ok=True)

                    # cleanup cluster instance where attempt to start cluster failed repeatedly
                    if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                        self._cleanup_dead_clusters(cget_status)
                        continue

                    # cluster restart planned or in progress, so no new tests can start
                    if self._restarted_by_other_worker(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # are there tests already running on this cluster instance?
                    cget_status.started_tests_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_RUNNING_GLOB}_*"))

                    # "marked tests" = group of tests marked with a specific mark.
                    # While these tests are running, no unmarked test can start.
                    cget_status.marked_starting_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_MARK_STARTING_GLOB}_*"))
                    cget_status.marked_running_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_CURR_MARK_GLOB}_*"))

                    # if marked tests are already running, update their status
                    self._update_marked_tests(
                        marked_tests_cache=marked_tests_cache,
                        cget_status=cget_status)

                    # test has mark
                    if mark:
                        # select this instance for running marked tests if possible
                        if not self._marked_select_instance(cget_status):
                            cget_status.sleep_delay = 2
                            continue

                        # check if we need to wait until unmarked tests are finished
                        if (not cget_status.marked_running_sfiles
                                and cget_status.started_tests_sfiles):
                            cget_status.sleep_delay = 10
                            continue

                        self.cm._log(
                            f"c{instance_num}: in marked tests branch, "
                            f"I have required mark '{mark}'")

                    # no unmarked test can run while marked tests are starting or running
                    elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                        self.cm._log(
                            f"c{instance_num}: marked tests starting or running, "
                            f"I don't have mark")
                        cget_status.sleep_delay = 2
                        continue

                    # check availability of the required resources
                    if not self._are_resources_available(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # if restart is needed, indicate that the cluster will be restarted
                    # (after all currently running tests are finished)
                    if not self._init_restart(cget_status):
                        continue

                    # we've found a suitable cluster instance
                    cget_status.selected_instance = instance_num
                    self.cm._cluster_instance_num = instance_num
                    self.cm._log(
                        f"c{instance_num}: can run test '{cget_status.current_test}'"
                    )
                    # set environment variables that are needed when restarting the cluster
                    # and running tests
                    cluster_nodes.set_cluster_env(instance_num)

                    # if needed, finish restart related actions
                    if not self._finish_restart(cget_status):
                        continue

                    # from this point on, all conditions needed to start the test are met
                    break
                else:
                    # if the test cannot start on any instance, return to top-level loop
                    continue

                self._create_test_status_files(cget_status)

                # Check if it is necessary to reload data. This still needs to happen under
                # global lock.
                state_dir = cluster_nodes.get_cluster_env().state_dir
                self._reload_cluster_obj(state_dir=state_dir)

                # cluster is ready, we can start the test
                break

        cluster_obj = self.cm.cache.cluster_obj
        if not cluster_obj:
            raise AssertionError(
                "`cluster_obj` not available, that cannot happen")
        cluster_obj.cluster_id = instance_num
        cluster_obj._cluster_manager = self.cm  # type: ignore

        return cluster_obj
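
A hypothetical illustration of how a test might obtain a cluster through the `get`
method above; the fixture name and the `ClusterManager` wiring are assumptions, only
the `get(...)` keyword arguments mirror the signature shown above.

@pytest.fixture
def cluster_singleton(
    cluster_manager: cluster_management.ClusterManager,
) -> clusterlib.ClusterLib:
    # lock the whole cluster so the test runs alone on its instance
    return cluster_manager.get(
        lock_resources=[cluster_management.Resources.CLUSTER],
    )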
def fund_from_genesis(  # NOTE: function name is an assumption; the original signature line is missing
    *dst_addrs: str,
    cluster_obj: clusterlib.ClusterLib,
    amount: int = 2_000_000,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from genesis addr to all `dst_addrs`."""
    fund_dst = [
        clusterlib.TxOut(address=d, amount=amount) for d in dst_addrs
        if cluster_obj.get_address_balance(d) < amount
    ]
    if not fund_dst:
        return

    with locking.FileLockIfXdist(
            f"{temptools.get_basetemp()}/{cluster_obj.genesis_utxo_addr}.lock"
    ):
        tx_name = tx_name or helpers.get_timestamped_rand_str()
        tx_name = f"{tx_name}_genesis_funding"
        fund_tx_files = clusterlib.TxFiles(signing_key_files=[
            *cluster_obj.genesis_keys.delegate_skeys,
            cluster_obj.genesis_keys.genesis_utxo_skey,
        ])

        cluster_obj.send_funds(
            src_address=cluster_obj.genesis_utxo_addr,
            destinations=fund_dst,
            tx_name=tx_name,
            tx_files=fund_tx_files,
            destination_dir=destination_dir,
        )