def conn(cls) -> psycopg2.extensions.connection:
    instance_num = cluster_nodes.get_cluster_env().instance_num
    conn = cls.conn_cache.get(instance_num)
    if conn is None or conn.closed == 1:
        conn = psycopg2.connect(f"dbname={DBSYNC_DB}{instance_num}")
        cls.conn_cache[instance_num] = conn
    return conn
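The snippet above is presumably a classmethod on a class that owns `conn_cache`. Below is a minimal, self-contained sketch of the same per-instance connection cache; it takes the instance number as a parameter to stay standalone, and it assumes psycopg2 is installed and databases named dbsync0, dbsync1, ... are reachable locally (the class and database names are illustrative, not from the original).

from typing import Dict

import psycopg2

DBSYNC_DB = "dbsync"  # illustrative database name prefix


class DBSyncConn:
    """Cache one psycopg2 connection per cluster instance."""

    conn_cache: Dict[int, psycopg2.extensions.connection] = {}

    @classmethod
    def conn(cls, instance_num: int) -> psycopg2.extensions.connection:
        cached = cls.conn_cache.get(instance_num)
        # `closed` is non-zero once the connection was closed, so reconnect in that case
        if cached is None or cached.closed != 0:
            cached = psycopg2.connect(f"dbname={DBSYNC_DB}{instance_num}")
            cls.conn_cache[instance_num] = cached
        return cached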
Example #2
    def _reuse_dev_cluster(self) -> clusterlib.ClusterLib:
        """Reuse cluster that was already started outside of test framework."""
        instance_num = 0
        self.cm._cluster_instance_num = instance_num
        cluster_nodes.set_cluster_env(instance_num)
        state_dir = cluster_nodes.get_cluster_env().state_dir

        # make sure instance dir exists
        instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
        instance_dir.mkdir(exist_ok=True, parents=True)

        cluster_obj = self.cm.cache.cluster_obj
        if not cluster_obj:
            cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()

        # setup faucet addresses
        if not (state_dir / cluster_nodes.ADDRS_DATA).exists():
            tmp_path = state_dir / "addrs_data"
            tmp_path.mkdir(exist_ok=True, parents=True)
            cluster_nodes.setup_test_addrs(cluster_obj, tmp_path)

        # check if it is necessary to reload data
        self._reload_cluster_obj(state_dir=state_dir)

        return cluster_obj
Example #3
def add_ignore_rule(files_glob: str, regex: str) -> None:
    """Add ignore rule for expected errors."""
    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/ignore_rules.lock"):
        state_dir = cluster_nodes.get_cluster_env().state_dir
        rules_file = state_dir / ERRORS_RULES_FILE_NAME
        with open(rules_file, "a") as outfile:
            outfile.write(f"{files_glob};;{regex}\n")
    def _is_dev_cluster_ready(self) -> bool:
        """Check if development cluster instance is ready to be used."""
        work_dir = cluster_nodes.get_cluster_env().work_dir
        state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}0"
        if (state_dir / cluster_nodes.ADDRS_DATA).exists():
            return True
        return False
Example #5
    def stop_all_clusters(self) -> None:
        """Stop all cluster instances."""
        self._log("called `stop_all_clusters`")
        for instance_num in range(self.num_of_instances):
            instance_dir = self.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
            if (not (instance_dir / CLUSTER_RUNNING_FILE).exists()
                    or (instance_dir / CLUSTER_STOPPED_FILE).exists()):
                self._log(f"cluster instance {instance_num} not running")
                continue

            startup_files = cluster_nodes.get_cluster_type(
            ).cluster_scripts.prepare_scripts_files(
                destdir=self._create_startup_files_dir(instance_num),
                instance_num=instance_num,
            )
            cluster_nodes.set_cluster_env(instance_num)
            self._log(
                f"stopping cluster instance {instance_num} with `{startup_files.stop_script}`"
            )

            state_dir = cluster_nodes.get_cluster_env().state_dir

            try:
                cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
            except Exception as exc:
                LOGGER.error(f"While stopping cluster: {exc}")

            cli_coverage.save_start_script_coverage(
                log_file=state_dir / CLUSTER_START_CMDS_LOG,
                pytest_config=self.pytest_config,
            )
            cluster_nodes.save_cluster_artifacts(
                artifacts_dir=self.pytest_tmp_dir, clean=True)
            open(instance_dir / CLUSTER_STOPPED_FILE, "a").close()
            self._log(f"stopped cluster instance {instance_num}")
Example #6
def cleanup(
    cluster_obj: clusterlib.ClusterLib,
    location: FileType,
) -> None:
    """Cleanup a testnet with the help of testing artifacts."""
    cluster_env = cluster_nodes.get_cluster_env()
    faucet_addr_file = cluster_env.state_dir / "shelley" / "faucet.addr"
    faucet_payment = create_addr_record(faucet_addr_file)
    files_found = group_files(find_files(location))

    def _run(files: List[Path]) -> None:
        for fpath in files:
            # add random sleep for < 1s to prevent
            # "Network.Socket.connect: <socket: 11>: resource exhausted"
            time.sleep(random.random())

            f_name = fpath.name
            if f_name == "faucet.addr":
                continue
            if f_name.endswith("_stake.addr"):
                payment_addr = fpath.parent / f_name.replace(
                    "_stake.addr", ".addr")
                try:
                    payment = create_addr_record(payment_addr)
                    stake = create_addr_record(fpath)
                except ValueError as exc:
                    LOGGER.warning(f"Skipping '{fpath}':\n'{exc}'")
                    continue

                pool_user = clusterlib.PoolUser(payment=payment, stake=stake)

                deregister_stake_addr(cluster_obj=cluster_obj,
                                      pool_user=pool_user,
                                      name_template=f_name)

                withdraw_reward(
                    cluster_obj=cluster_obj,
                    stake_addr_record=stake,
                    dst_addr_record=payment,
                    name_template=f_name,
                )
            else:
                try:
                    payment = create_addr_record(fpath)
                except ValueError as exc:
                    LOGGER.warning(f"Skipping '{fpath}':\n'{exc}'")
                    continue
                return_funds_to_faucet(
                    cluster_obj=cluster_obj,
                    src_addr=payment,
                    faucet_addr=faucet_payment.address,
                    tx_name=f_name,
                )

    # run cleanup in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(_run, f) for f in files_found]
        concurrent.futures.wait(futures)
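The parallel section at the end of `cleanup` follows a common pattern: group the files, submit one worker per group and stagger the work slightly. A standalone sketch of that pattern using only the standard library (the per-file action is a placeholder):

import concurrent.futures
import random
import time
from pathlib import Path
from typing import List


def process_file(fpath: Path) -> None:
    # stand-in for the real per-address cleanup work
    print(f"processing {fpath}")


def run_group(files: List[Path]) -> None:
    for fpath in files:
        # random sleep < 1s to stagger the requests, as in the example above
        time.sleep(random.random())
        process_file(fpath)


def run_parallel(file_groups: List[List[Path]]) -> None:
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(run_group, group) for group in file_groups]
        # unlike a bare `concurrent.futures.wait`, calling `result()` re-raises worker exceptions
        for future in concurrent.futures.as_completed(futures):
            future.result()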
Example #7
def add_ignore_rule(files_glob: str, regex: str, ignore_file_id: str) -> None:
    """Add ignore rule for expected errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (temptools.get_basetemp() /
                 f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file), open(rules_file,
                                                  "a",
                                                  encoding="utf-8") as outfile:
        outfile.write(f"{files_glob};;{regex}\n")
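The project-specific `locking.FileLockIfXdist` helper is not shown in these examples. A rough equivalent of the lock-then-append step, using the third-party `filelock` package as a stand-in (an assumption, not the project's actual helper):

from pathlib import Path

from filelock import FileLock  # pip install filelock


def append_rule(rules_file: Path, lock_file: Path, files_glob: str, regex: str) -> None:
    """Append an ignore rule, serialized across processes by a file lock."""
    with FileLock(str(lock_file)), open(rules_file, "a", encoding="utf-8") as outfile:
        outfile.write(f"{files_glob};;{regex}\n")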
Example #8
def expect_errors(regex_pairs: List[Tuple[str, str]]) -> Iterator[None]:
    """Make sure expected errors are present in logs.

    Args:
        regex_pairs: [(glob, regex)] - list of regexes that need to be present in files
            described by the glob
    """
    state_dir = cluster_nodes.get_cluster_env().state_dir

    glob_list = []
    for files_glob, regex in regex_pairs:
        add_ignore_rule(files_glob,
                        regex)  # don't report errors that are expected
        glob_list.append(files_glob)
    # resolve the globs
    _expanded_paths = [
        list(state_dir.glob(glob_item)) for glob_item in glob_list
    ]
    # flatten the list
    expanded_paths = list(itertools.chain.from_iterable(_expanded_paths))
    # record each end-of-file as a starting offset for searching the log file
    seek_offsets = {str(p): helpers.get_eof_offset(p) for p in expanded_paths}

    timestamp = time.time()

    yield

    for files_glob, regex in regex_pairs:
        regex_comp = re.compile(regex)
        # get list of records (file names and offsets) for given glob
        matching_files = fnmatch.filter(seek_offsets,
                                        f"{state_dir}/{files_glob}")
        for logfile in matching_files:
            # skip if the log file is a rotated log; it will be handled by `get_rotated_logs`
            if ROTATED_RE.match(logfile):
                continue

            # search for the expected error
            seek = seek_offsets.get(logfile) or 0
            line_found = False
            for logfile_rec in get_rotated_logs(logfile=Path(logfile),
                                                seek=seek,
                                                timestamp=timestamp):
                with open(logfile_rec.logfile) as infile:
                    infile.seek(seek)
                    for line in infile:
                        if regex_comp.search(line):
                            line_found = True
                            break
                if line_found:
                    break
            else:
                raise AssertionError(
                    f"No line matching `{regex}` found in '{logfile}'.")
    def test_available_metrics(
        self,
        wait_epochs,
    ):
        """Test that available EKG metrics matches the expected schema."""
        # pylint: disable=unused-argument
        ekg_port = (cluster_nodes.get_cluster_type(
        ).cluster_scripts.get_instance_ports(
            cluster_nodes.get_cluster_env().instance_num).ekg_pool1)

        response = get_ekg_metrics(ekg_port)
        model_ekg.Model.validate(response.json())
Example #10
def clean_ignore_rules(ignore_file_id: str) -> None:
    """Cleanup relevant ignore rules file.

    Delete ignore file identified by `ignore_file_id` when it is no longer valid.
    """
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (temptools.get_basetemp() /
                 f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock")

    with locking.FileLockIfXdist(lock_file):
        rules_file.unlink(missing_ok=True)
Example #11
def testenv_setup_teardown(
        tmp_path_factory: TempdirFactory, worker_id: str,
        request: FixtureRequest) -> Generator[None, None, None]:
    pytest_root_tmp = temptools.get_pytest_root_tmp(tmp_path_factory)

    with locking.FileLockIfXdist(
            f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save environment info for Allure
        if not list(pytest_root_tmp.glob(".started_session_*")):
            _save_env_for_allure(request.config)

        helpers.touch(pytest_root_tmp / f".started_session_{worker_id}")

    yield

    with locking.FileLockIfXdist(
            f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save CLI coverage to dir specified by `--cli-coverage-dir`
        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory,
            worker_id=worker_id,
            pytest_config=request.config)
        cluster_manager_obj.save_worker_cli_coverage()

        # perform cleanup if this is the last running pytest worker
        (pytest_root_tmp / f".started_session_{worker_id}").unlink()
        if not list(pytest_root_tmp.glob(".started_session_*")):
            # perform testnet cleanup
            _testnet_cleanup(pytest_root_tmp=pytest_root_tmp)

            if configuration.DEV_CLUSTER_RUNNING:
                # save cluster artifacts
                artifacts_base_dir = request.config.getoption(
                    "--artifacts-base-dir")
                if artifacts_base_dir:
                    state_dir = cluster_nodes.get_cluster_env().state_dir
                    artifacts.save_cluster_artifacts(save_dir=pytest_root_tmp,
                                                     state_dir=state_dir)
            else:
                # stop all cluster instances, save artifacts
                _stop_all_cluster_instances(
                    tmp_path_factory=tmp_path_factory,
                    worker_id=worker_id,
                    pytest_config=request.config,
                )

            # copy collected artifacts to dir specified by `--artifacts-base-dir`
            artifacts.copy_artifacts(pytest_tmp_dir=pytest_root_tmp,
                                     pytest_config=request.config)
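The fixture above uses per-worker marker files guarded by a file lock so that only the last finishing pytest-xdist worker performs the one-time teardown. A stripped-down sketch of that pattern, assuming pytest-xdist (which provides the `worker_id` fixture) and the `filelock` package are installed; the teardown body is a placeholder:

from typing import Generator

import pytest
from filelock import FileLock


@pytest.fixture(scope="session", autouse=True)
def session_guard(tmp_path_factory, worker_id: str) -> Generator[None, None, None]:
    # with xdist, each worker gets its own basetemp; its parent dir is shared by all workers
    basetemp = tmp_path_factory.getbasetemp()
    root_tmp = basetemp.parent if worker_id != "master" else basetemp
    lock = FileLock(str(root_tmp / "session.lock"))

    with lock:
        (root_tmp / f".started_session_{worker_id}").touch()

    yield

    with lock:
        (root_tmp / f".started_session_{worker_id}").unlink()
        if not list(root_tmp.glob(".started_session_*")):
            # this was the last running worker: do the one-time cleanup here
            pass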
    def test_available_metrics(
        self,
        wait_epochs,
    ):
        """Test that list of available metrics == list of expected metrics."""
        # pylint: disable=unused-argument
        prometheus_port = (cluster_nodes.get_cluster_type(
        ).cluster_scripts.get_instance_ports(
            cluster_nodes.get_cluster_env().instance_num).prometheus_pool1)

        response = get_prometheus_metrics(prometheus_port)

        metrics = response.text.strip().split("\n")
        metrics_keys = sorted(m.split(" ")[0] for m in metrics)
        assert metrics_keys == EXPECTED_METRICS, "Metrics differ"
    def _setup_dev_cluster(self) -> None:
        """Set up cluster instance that was already started outside of test framework."""
        work_dir = cluster_nodes.get_cluster_env().work_dir
        state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}0"
        if (state_dir / cluster_nodes.ADDRS_DATA).exists():
            return

        self.cm._log("c0: setting up dev cluster")

        # Create "addrs_data" directly in the cluster state dir, so it can be reused
        # (in a normal non-`DEV_CLUSTER_RUNNING` setup we want "addrs_data" stored among
        # the test artifacts, so it can be used during cleanup etc.).
        tmp_path = state_dir / "addrs_data"
        tmp_path.mkdir(exist_ok=True, parents=True)
        cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()
        cluster_nodes.setup_test_addrs(cluster_obj=cluster_obj,
                                       destination_dir=tmp_path)
    def stop_all_clusters(self) -> None:
        """Stop all cluster instances."""
        self._log("called `stop_all_clusters`")

        # don't stop cluster if it was started outside of test framework
        if configuration.DEV_CLUSTER_RUNNING:
            LOGGER.warning(
                "Ignoring request to stop clusters as 'DEV_CLUSTER_RUNNING' is set."
            )
            return

        work_dir = cluster_nodes.get_cluster_env().work_dir

        for instance_num in range(self.num_of_instances):
            instance_dir = self.pytest_tmp_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
            if (not (instance_dir / CLUSTER_RUNNING_FILE).exists()
                    or (instance_dir / CLUSTER_STOPPED_FILE).exists()):
                self._log(f"c{instance_num}: cluster instance not running")
                continue

            state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}{instance_num}"

            stop_script = state_dir / cluster_scripts.STOP_SCRIPT
            if not stop_script.exists():
                self._log(f"c{instance_num}: stop script doesn't exist!")
                continue

            self._log(
                f"c{instance_num}: stopping cluster instance with `{stop_script}`"
            )
            try:
                helpers.run_command(str(stop_script))
            except Exception as err:
                self._log(f"c{instance_num}: failed to stop cluster:\n{err}")

            artifacts.save_start_script_coverage(
                log_file=state_dir / CLUSTER_START_CMDS_LOG,
                pytest_config=self.pytest_config,
            )
            artifacts.save_cluster_artifacts(save_dir=self.pytest_tmp_dir,
                                             state_dir=state_dir)

            shutil.rmtree(state_dir, ignore_errors=True)

            helpers.touch(instance_dir / CLUSTER_STOPPED_FILE)
            self._log(f"c{instance_num}: stopped cluster instance")
Example #15
def search_cluster_artifacts() -> List[Tuple[Path, str]]:
    """Search cluster artifacts for errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    lock_file = temptools.get_basetemp(
    ) / f"search_artifacts_{cluster_env.instance_num}.lock"

    with locking.FileLockIfXdist(lock_file):
        ignore_rules = _get_ignore_rules(cluster_env=cluster_env)

        errors = []
        for logfile in cluster_env.state_dir.glob("*.std*"):
            # skip if the log file is an offset status file or a rotated log
            if logfile.name.endswith(".offset") or ROTATED_RE.match(
                    logfile.name):
                continue

            # read seek offset (from where to start searching) and timestamp of last search
            offset_file = logfile.parent / f".{logfile.name}.offset"
            if offset_file.exists():
                seek = _get_seek(offset_file)
                timestamp = os.path.getmtime(offset_file)
            else:
                seek = 0
                timestamp = 0.0

            errors_ignored = _get_ignore_regex(ignore_rules=ignore_rules,
                                               regexes=ERRORS_IGNORED,
                                               logfile=logfile)
            errors_ignored_re = re.compile(errors_ignored)

            # record offset for the "live" log file
            with open(offset_file, "w", encoding="utf-8") as outfile:
                outfile.write(str(helpers.get_eof_offset(logfile)))

            for logfile_rec in _get_rotated_logs(logfile=logfile,
                                                 seek=seek,
                                                 timestamp=timestamp):
                with open(logfile_rec.logfile, encoding="utf-8") as infile:
                    infile.seek(seek)
                    for line in infile:
                        if ERRORS_RE.search(line) and not (
                                errors_ignored
                                and errors_ignored_re.search(line)):
                            errors.append((logfile, line))

    return errors
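The `.offset` files implement incremental scanning: remember how far the previous search got and inspect only the lines appended since then. A minimal standalone sketch of that bookkeeping for a single log file (the error regex is illustrative):

import re
from pathlib import Path
from typing import List, Tuple

ERRORS_RE = re.compile("error|failed", re.IGNORECASE)  # illustrative


def scan_new_lines(logfile: Path) -> List[Tuple[Path, str]]:
    """Search only the lines appended to `logfile` since the previous scan."""
    offset_file = logfile.parent / f".{logfile.name}.offset"
    seek = int(offset_file.read_text()) if offset_file.exists() else 0

    # record the current end-of-file first, so the next scan starts where this one ends
    offset_file.write_text(str(logfile.stat().st_size))

    errors = []
    with open(logfile, encoding="utf-8") as infile:
        infile.seek(seek)
        for line in infile:
            if ERRORS_RE.search(line):
                errors.append((logfile, line))
    return errors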
    def get(  # noqa: C901
        self,
        mark: str = "",
        lock_resources: Iterable[str] = (),
        use_resources: Iterable[str] = (),
        cleanup: bool = False,
        start_cmd: str = "",
    ) -> clusterlib.ClusterLib:
        """Return the `clusterlib.ClusterLib` instance once we can start the test.

        It checks current conditions and waits if they don't allow the test to start right away.
        """
        # pylint: disable=too-many-statements,too-many-branches
        assert not isinstance(
            lock_resources,
            str), "`lock_resources` must be sequence of strings"
        assert not isinstance(
            use_resources, str), "`use_resources` must be sequence of strings"

        if configuration.DEV_CLUSTER_RUNNING:
            if start_cmd:
                LOGGER.warning(
                    f"Ignoring the '{start_cmd}' cluster start command as "
                    "'DEV_CLUSTER_RUNNING' is set.")
            # check if the development cluster instance is ready by now, so we don't need to
            # obtain the cluster lock when it is not necessary
            if not self._is_dev_cluster_ready():
                with locking.FileLockIfXdist(self.cm.cluster_lock):
                    self._setup_dev_cluster()

        if configuration.FORBID_RESTART and start_cmd:
            raise RuntimeError(
                "Cannot use custom start command when 'FORBID_RESTART' is set."
            )

        if start_cmd:
            if not (mark or (Resources.CLUSTER in lock_resources)):
                raise RuntimeError(
                    "Custom start command can be used only together with singleton or `mark`."
                )
            # always clean after test(s) that started cluster with custom configuration
            cleanup = True

        # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
        # list of `use_resources`.
        use_resources = list(
            set(use_resources).union({Resources.CLUSTER}) -
            set(lock_resources))

        cget_status = ClusterGetStatus(
            mark=mark,
            lock_resources=lock_resources,
            use_resources=use_resources,
            cleanup=cleanup,
            start_cmd=start_cmd,
            current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
        )
        marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

        self.cm._log(f"want to run test '{cget_status.current_test}'")

        # iterate until it is possible to start the test
        while True:
            if cget_status.restart_ready:
                self._restart(start_cmd=start_cmd)

            if not cget_status.first_iteration:
                xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

            # nothing time consuming can go under this lock as all other workers will need to wait
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                if self._is_already_running(cget_status):
                    if not self.cm.cache.cluster_obj:
                        raise AssertionError(
                            "`cluster_obj` not available, that cannot happen")
                    return self.cm.cache.cluster_obj

                # needs to be set here, before the first `continue`
                cget_status.first_iteration = False
                self.cm._cluster_instance_num = -1

                # try all existing cluster instances
                for instance_num in range(self.cm.num_of_instances):
                    # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                    if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                        continue

                    # if instance to run the test on was already decided, skip all other instances
                    # pylint: disable=consider-using-in
                    if (cget_status.selected_instance != -1
                            and instance_num != cget_status.selected_instance):
                        continue

                    cget_status.instance_num = instance_num
                    cget_status.instance_dir = (
                        self.cm.pytest_tmp_dir /
                        f"{CLUSTER_DIR_TEMPLATE}{instance_num}")
                    cget_status.instance_dir.mkdir(exist_ok=True)

                    # cleanup cluster instance where attempt to start cluster failed repeatedly
                    if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                        self._cleanup_dead_clusters(cget_status)
                        continue

                    # cluster restart planned or in progress, so no new tests can start
                    if self._restarted_by_other_worker(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # are there tests already running on this cluster instance?
                    cget_status.started_tests_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_RUNNING_GLOB}_*"))

                    # "marked tests" = group of tests marked with a specific mark.
                    # While these tests are running, no unmarked test can start.
                    cget_status.marked_starting_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_MARK_STARTING_GLOB}_*"))
                    cget_status.marked_running_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_CURR_MARK_GLOB}_*"))

                    # if marked tests are already running, update their status
                    self._update_marked_tests(
                        marked_tests_cache=marked_tests_cache,
                        cget_status=cget_status)

                    # test has mark
                    if mark:
                        # select this instance for running marked tests if possible
                        if not self._marked_select_instance(cget_status):
                            cget_status.sleep_delay = 2
                            continue

                        # check if we need to wait until unmarked tests are finished
                        if (not cget_status.marked_running_sfiles
                                and cget_status.started_tests_sfiles):
                            cget_status.sleep_delay = 10
                            continue

                        self.cm._log(
                            f"c{instance_num}: in marked tests branch, "
                            f"I have required mark '{mark}'")

                    # no unmarked test can run while marked tests are starting or running
                    elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                        self.cm._log(
                            f"c{instance_num}: marked tests starting or running, "
                            f"I don't have mark")
                        cget_status.sleep_delay = 2
                        continue

                    # check availability of the required resources
                    if not self._are_resources_available(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # if restart is needed, indicate that the cluster will be restarted
                    # (after all currently running tests are finished)
                    if not self._init_restart(cget_status):
                        continue

                    # we've found suitable cluster instance
                    cget_status.selected_instance = instance_num
                    self.cm._cluster_instance_num = instance_num
                    self.cm._log(
                        f"c{instance_num}: can run test '{cget_status.current_test}'"
                    )
                    # set environment variables that are needed when restarting the cluster
                    # and running tests
                    cluster_nodes.set_cluster_env(instance_num)

                    # if needed, finish restart related actions
                    if not self._finish_restart(cget_status):
                        continue

                    # from this point on, all conditions needed to start the test are met
                    break
                else:
                    # if the test cannot start on any instance, return to top-level loop
                    continue

                self._create_test_status_files(cget_status)

                # Check if it is necessary to reload data. This still needs to happen under
                # global lock.
                state_dir = cluster_nodes.get_cluster_env().state_dir
                self._reload_cluster_obj(state_dir=state_dir)

                # cluster is ready, we can start the test
                break

        cluster_obj = self.cm.cache.cluster_obj
        if not cluster_obj:
            raise AssertionError(
                "`cluster_obj` not available, that cannot happen")
        cluster_obj.cluster_id = instance_num
        cluster_obj._cluster_manager = self.cm  # type: ignore

        return cluster_obj
    def _restart(self,
                 start_cmd: str = "",
                 stop_cmd: str = "") -> bool:  # noqa: C901
        """Restart cluster.

        Not called under global lock!
        """
        # pylint: disable=too-many-branches
        cluster_running_file = self.cm.instance_dir / CLUSTER_RUNNING_FILE

        # don't restart cluster if it was started outside of test framework
        if configuration.DEV_CLUSTER_RUNNING:
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: ignoring restart, dev cluster is running"
            )
            if cluster_running_file.exists():
                LOGGER.warning(
                    "Ignoring requested cluster restart as 'DEV_CLUSTER_RUNNING' is set."
                )
            else:
                helpers.touch(cluster_running_file)
            return True

        # fail if cluster restart is forbidden and it was already started
        if configuration.FORBID_RESTART and cluster_running_file.exists():
            raise RuntimeError(
                "Cannot restart cluster when 'FORBID_RESTART' is set.")

        self.cm._log(
            f"c{self.cm.cluster_instance_num}: called `_restart`, start_cmd='{start_cmd}', "
            f"stop_cmd='{stop_cmd}'")

        startup_files = cluster_nodes.get_cluster_type(
        ).cluster_scripts.prepare_scripts_files(
            destdir=self.cm._create_startup_files_dir(
                self.cm.cluster_instance_num),
            instance_num=self.cm.cluster_instance_num,
            start_script=start_cmd,
            stop_script=stop_cmd,
        )

        state_dir = cluster_nodes.get_cluster_env().state_dir

        self.cm._log(
            f"c{self.cm.cluster_instance_num}: in `_restart`, new files "
            f"start_cmd='{startup_files.start_script}', "
            f"stop_cmd='{startup_files.stop_script}'")

        excp: Optional[Exception] = None
        for i in range(2):
            if i > 0:
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\nretrying"
                )
                time.sleep(0.2)

            try:
                LOGGER.info(
                    f"Stopping cluster with `{startup_files.stop_script}`.")
                helpers.run_command(str(startup_files.stop_script))
            except Exception as err:
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: failed to stop cluster:\n{err}"
                )

            # save artifacts only when produced during this test run
            if cluster_running_file.exists():
                artifacts.save_start_script_coverage(
                    log_file=state_dir / CLUSTER_START_CMDS_LOG,
                    pytest_config=self.cm.pytest_config,
                )
                artifacts.save_cluster_artifacts(
                    save_dir=self.cm.pytest_tmp_dir, state_dir=state_dir)

            shutil.rmtree(state_dir, ignore_errors=True)

            with contextlib.suppress(Exception):
                _kill_supervisor(self.cm.cluster_instance_num)

            try:
                cluster_obj = cluster_nodes.start_cluster(
                    cmd=str(startup_files.start_script),
                    args=startup_files.start_script_args)
            except Exception as err:
                LOGGER.error(f"Failed to start cluster: {err}")
                excp = err
            else:
                break
        else:
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\ncluster dead"
            )
            if not configuration.IS_XDIST:
                pytest.exit(msg=f"Failed to start cluster, exception: {excp}",
                            returncode=1)
            helpers.touch(self.cm.instance_dir / CLUSTER_DEAD_FILE)
            return False

        # Create temp dir for faucet addresses data.
        # Pytest's mktemp adds a number to the end of the dir name, so keep the trailing '_'
        # as separator. Resulting dir name is e.g. 'addrs_data_ci3_0'.
        tmp_path = Path(
            self.cm.tmp_path_factory.mktemp(
                f"addrs_data_ci{self.cm.cluster_instance_num}_"))
        # setup faucet addresses
        cluster_nodes.setup_test_addrs(cluster_obj=cluster_obj,
                                       destination_dir=tmp_path)

        # create file that indicates that the cluster is running
        if not cluster_running_file.exists():
            helpers.touch(cluster_running_file)

        return True
Example #18
    def get(  # noqa: C901
        self,
        singleton: bool = False,
        mark: str = "",
        lock_resources: UnpackableSequence = (),
        use_resources: UnpackableSequence = (),
        cleanup: bool = False,
        start_cmd: str = "",
    ) -> clusterlib.ClusterLib:
        """Return the `clusterlib.ClusterLib` instance once we can start the test.

        It checks current conditions and waits if they don't allow the test to start right away.
        """
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals

        # don't start new cluster if it was already started outside of test framework
        if DEV_CLUSTER_RUNNING:
            if start_cmd:
                LOGGER.warning(
                    f"Ignoring the '{start_cmd}' cluster start command as "
                    "'DEV_CLUSTER_RUNNING' is set.")
            return self._reuse_dev_cluster()

        if FORBID_RESTART and start_cmd:
            raise RuntimeError(
                "Cannot use custom start command when 'FORBID_RESTART' is set."
            )

        selected_instance = -1
        restart_here = False
        restart_ready = False
        first_iteration = True
        sleep_delay = 1
        marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

        if start_cmd:
            if not (singleton or mark):
                raise AssertionError(
                    "Custom start command can be used only together with `singleton` or `mark`"
                )
            # always clean after test(s) that started cluster with custom configuration
            cleanup = True

        # iterate until it is possible to start the test
        while True:
            if restart_ready:
                self._restart(start_cmd=start_cmd)

            if not first_iteration:
                helpers.xdist_sleep(random.random() * sleep_delay)

            # nothing time consuming can go under this lock as it will block all other workers
            with helpers.FileLockIfXdist(self.cm.cluster_lock):
                test_on_worker = list(
                    self.cm.lock_dir.glob(
                        f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
                    ))

                # test is already running, nothing to set up
                if (first_iteration and test_on_worker
                        and self.cm._cluster_instance_num != -1
                        and self.cm.cache.cluster_obj):
                    self.cm._log(f"{test_on_worker[0]} already exists")
                    return self.cm.cache.cluster_obj

                first_iteration = False  # needs to be set here, before the first `continue`
                self.cm._cluster_instance_num = -1

                # try all existing cluster instances
                for instance_num in range(self.cm.num_of_instances):
                    # if instance to run the test on was already decided, skip all other instances
                    # pylint: disable=consider-using-in
                    if selected_instance != -1 and instance_num != selected_instance:
                        continue

                    instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                    instance_dir.mkdir(exist_ok=True)

                    # if the selected instance failed to start, move on to other instance
                    if (instance_dir / CLUSTER_DEAD_FILE).exists():
                        selected_instance = -1
                        restart_here = False
                        restart_ready = False
                        # remove status files that are checked by other workers
                        for sf in (
                                *instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"),
                                *instance_dir.glob(
                                    f"{TEST_MARK_STARTING_GLOB}_*"),
                        ):
                            os.remove(sf)

                        dead_clusters = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{CLUSTER_DEAD_FILE}")
                        )
                        if len(dead_clusters) == self.cm.num_of_instances:
                            raise RuntimeError(
                                "All clusters are dead, cannot run.")
                        continue

                    # singleton test is running, so no other test can be started
                    if (instance_dir / TEST_SINGLETON_FILE).exists():
                        self.cm._log(
                            f"c{instance_num}: singleton test in progress, cannot run"
                        )
                        sleep_delay = 5
                        continue

                    restart_in_progress = list(
                        instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                    # cluster restart planned, no new tests can start
                    if not restart_here and restart_in_progress:
                        # no log message here, there would be too many of them
                        sleep_delay = 5
                        continue

                    started_tests = list(
                        instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                    # "marked tests" = group of tests marked with a specific mark.
                    # While these tests are running, no unmarked test can start.
                    marked_starting = list(
                        instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"))
                    marked_running = list(
                        instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))

                    if mark:
                        marked_running_my = (
                            instance_dir /
                            f"{TEST_CURR_MARK_GLOB}_{mark}").exists()
                        marked_starting_my = list(
                            instance_dir.glob(
                                f"{TEST_MARK_STARTING_GLOB}_{mark}_*"))

                        marked_running_my_anywhere = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{TEST_CURR_MARK_GLOB}_{mark}"
                            ))
                        # check if tests with my mark are running on some other cluster instance
                        if not marked_running_my and marked_running_my_anywhere:
                            self.cm._log(
                                f"c{instance_num}: tests marked with my mark '{mark}' "
                                "already running on other cluster instance, cannot run"
                            )
                            continue

                        marked_starting_my_anywhere = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{TEST_MARK_STARTING_GLOB}_{mark}_*"
                            ))
                        # check if tests with my mark are starting on some other cluster instance
                        if not marked_starting_my and marked_starting_my_anywhere:
                            self.cm._log(
                                f"c{instance_num}: tests marked with my mark '{mark}' starting "
                                "on other cluster instance, cannot run")
                            continue

                        # check if this test has the same mark as currently running marked tests
                        if marked_running_my or marked_starting_my:
                            # lock to this cluster instance
                            selected_instance = instance_num
                        elif marked_running or marked_starting:
                            self.cm._log(
                                f"c{instance_num}: tests marked with other mark starting "
                                f"or running, I have different mark '{mark}'")
                            continue

                        # check if it needs to wait until marked tests can run
                        if marked_starting_my and started_tests:
                            self.cm._log(
                                f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                            )
                            sleep_delay = 2
                            continue

                    # no unmarked test can run while marked tests are starting or running
                    elif marked_running or marked_starting:
                        self.cm._log(
                            f"c{instance_num}: marked tests starting or running, "
                            f"I don't have mark")
                        sleep_delay = 5
                        continue

                    # is this the first marked test that wants to run?
                    initial_marked_test = bool(mark and not marked_running)

                    # indicate that it is planned to start marked tests as soon as
                    # all currently running tests are finished or the cluster is restarted
                    if initial_marked_test:
                        # lock to this cluster instance
                        selected_instance = instance_num
                        mark_starting_file = (
                            instance_dir /
                            f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.cm.worker_id}"
                        )
                        if not mark_starting_file.exists():
                            open(
                                mark_starting_file,
                                "a",
                            ).close()
                        if started_tests:
                            self.cm._log(
                                f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                            )
                            sleep_delay = 3
                            continue

                    # get marked tests status
                    marked_tests_status = self._get_marked_tests_status(
                        cache=marked_tests_cache, instance_num=instance_num)

                    # marked tests are already running
                    if marked_running:
                        active_mark_file = marked_running[0].name

                        # update marked tests status
                        self._update_marked_tests(
                            marked_tests_status=marked_tests_status,
                            active_mark_name=active_mark_file,
                            started_tests=started_tests,
                            instance_num=instance_num,
                        )

                        self.cm._log(
                            f"c{instance_num}: in marked tests branch, "
                            f"I have required mark '{mark}'")

                    # reset counter of cycles with no marked test running
                    marked_tests_status.no_marked_tests_iter = 0

                    # this test is a singleton - no other test can run while this one is running
                    if singleton and started_tests:
                        self.cm._log(
                            f"c{instance_num}: tests are running, cannot start singleton"
                        )
                        sleep_delay = 5
                        continue

                    # this test wants to lock some resources, check if these are not
                    # locked or in use
                    if lock_resources:
                        res_usable = self._are_resources_usable(
                            resources=lock_resources,
                            instance_dir=instance_dir,
                            instance_num=instance_num,
                        )
                        if not res_usable:
                            sleep_delay = 5
                            continue

                    # filter out `lock_resources` from the list of `use_resources`
                    if use_resources and lock_resources:
                        use_resources = list(
                            set(use_resources) - set(lock_resources))

                    # this test wants to use some resources, check if these are not locked
                    if use_resources:
                        res_locked = self._are_resources_locked(
                            resources=use_resources,
                            instance_dir=instance_dir,
                            instance_num=instance_num,
                        )
                        if res_locked:
                            sleep_delay = 5
                            continue

                    # indicate that the cluster will be restarted
                    new_cmd_restart = bool(start_cmd and
                                           (initial_marked_test or singleton))
                    if not restart_here and (
                            new_cmd_restart
                            or self._is_restart_needed(instance_num)):
                        if started_tests:
                            self.cm._log(
                                f"c{instance_num}: tests are running, cannot restart"
                            )
                            continue

                        # Cluster restart will be performed by this worker.
                        # By setting `restart_here`, we make sure this worker continues on
                        # this cluster instance after restart. This is important because
                        # the `start_cmd` used for starting the cluster might be specific
                        # to the test.
                        restart_here = True
                        self.cm._log(
                            f"c{instance_num}: setting to restart cluster")
                        selected_instance = instance_num
                        restart_in_progress_file = (
                            instance_dir /
                            f"{RESTART_IN_PROGRESS_GLOB}_{self.cm.worker_id}")
                        if not restart_in_progress_file.exists():
                            open(restart_in_progress_file, "a").close()

                    # we've found suitable cluster instance
                    selected_instance = instance_num
                    self.cm._cluster_instance_num = instance_num
                    cluster_nodes.set_cluster_env(instance_num)

                    if restart_here:
                        if restart_ready:
                            # The cluster was already restarted if we are here and
                            # `restart_ready` is still True.
                            restart_ready = False

                            # Remove status files that are no longer valid after restart.
                            for f in instance_dir.glob(
                                    f"{RESTART_IN_PROGRESS_GLOB}_*"):
                                os.remove(f)
                            for f in instance_dir.glob(
                                    f"{RESTART_NEEDED_GLOB}_*"):
                                os.remove(f)
                        else:
                            self.cm._log(f"c{instance_num}: calling restart")
                            # the actual `_restart` function will be called outside
                            # of global lock
                            restart_ready = True
                            continue

                    # from this point on, all conditions needed to start the test are met

                    # this test is a singleton
                    if singleton:
                        self.cm._log(f"c{instance_num}: starting singleton")
                        open(self.cm.instance_dir / TEST_SINGLETON_FILE,
                             "a").close()

                    # this test is a first marked test
                    if initial_marked_test:
                        self.cm._log(
                            f"c{instance_num}: starting '{mark}' tests")
                        open(
                            self.cm.instance_dir /
                            f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()
                        for sf in marked_starting:
                            os.remove(sf)

                    # create status file for each in-use resource
                    _ = [
                        open(
                            self.cm.instance_dir /
                            f"{RESOURCE_IN_USE_GLOB}_{r}_{self.cm.worker_id}",
                            "a",
                        ).close() for r in use_resources
                    ]

                    # create status file for each locked resource
                    _ = [
                        open(
                            self.cm.instance_dir /
                            f"{RESOURCE_LOCKED_GLOB}_{r}_{self.cm.worker_id}",
                            "a",
                        ).close() for r in lock_resources
                    ]

                    # cleanup = cluster restart after test (group of tests) is finished
                    if cleanup:
                        # cleanup after a group of tests that are marked with a marker
                        if mark:
                            self.cm._log(f"c{instance_num}: cleanup and mark")
                            open(
                                self.cm.instance_dir /
                                f"{RESTART_AFTER_MARK_GLOB}_{self.cm.worker_id}",
                                "a",
                            ).close()
                        # cleanup after single test (e.g. singleton)
                        else:
                            self.cm._log(
                                f"c{instance_num}: cleanup and not mark")
                            open(
                                self.cm.instance_dir /
                                f"{RESTART_NEEDED_GLOB}_{self.cm.worker_id}",
                                "a",
                            ).close()

                    break
                else:
                    # if the test cannot start on any instance, return to top-level loop
                    continue

                test_running_file = (
                    self.cm.instance_dir /
                    f"{TEST_RUNNING_GLOB}_{self.cm.worker_id}")
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: creating {test_running_file}"
                )
                open(test_running_file, "a").close()

                # check if it is necessary to reload data
                state_dir = cluster_nodes.get_cluster_env().state_dir
                self._reload_cluster_obj(state_dir=state_dir)

                cluster_obj = self.cm.cache.cluster_obj
                if not cluster_obj:
                    cluster_obj = cluster_nodes.get_cluster_type(
                    ).get_cluster_obj()

                # `cluster_obj` is ready, we can start the test
                break

        return cluster_obj
Example #19
    def _restart(self,
                 start_cmd: str = "",
                 stop_cmd: str = "") -> bool:  # noqa: C901
        """Restart cluster.

        Not called under global lock!
        """
        # pylint: disable=too-many-branches
        cluster_running_file = self.cm.instance_dir / CLUSTER_RUNNING_FILE

        # don't restart cluster if it was started outside of test framework
        if DEV_CLUSTER_RUNNING:
            if cluster_running_file.exists():
                LOGGER.warning(
                    "Ignoring requested cluster restart as 'DEV_CLUSTER_RUNNING' is set."
                )
            else:
                open(cluster_running_file, "a").close()
            return True

        # fail if cluster restart is forbidden and it was already started
        if FORBID_RESTART and cluster_running_file.exists():
            raise RuntimeError(
                "Cannot restart cluster when 'FORBID_RESTART' is set.")

        self.cm._log(
            f"c{self.cm.cluster_instance_num}: called `_restart`, start_cmd='{start_cmd}', "
            f"stop_cmd='{stop_cmd}'")

        startup_files = cluster_nodes.get_cluster_type(
        ).cluster_scripts.prepare_scripts_files(
            destdir=self.cm._create_startup_files_dir(
                self.cm.cluster_instance_num),
            instance_num=self.cm.cluster_instance_num,
            start_script=start_cmd,
            stop_script=stop_cmd,
        )

        state_dir = cluster_nodes.get_cluster_env().state_dir

        self.cm._log(
            f"c{self.cm.cluster_instance_num}: in `_restart`, new files "
            f"start_cmd='{startup_files.start_script}', "
            f"stop_cmd='{startup_files.stop_script}'")

        excp: Optional[Exception] = None
        for i in range(2):
            if i > 0:
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\nretrying"
                )
                time.sleep(0.2)

            try:
                cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
            except Exception as err:
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: failed to stop cluster:\n{err}"
                )

            # save artifacts only when produced during this test run
            if cluster_running_file.exists():
                cli_coverage.save_start_script_coverage(
                    log_file=state_dir / CLUSTER_START_CMDS_LOG,
                    pytest_config=self.cm.pytest_config,
                )
                self._restart_save_cluster_artifacts(clean=True)

            try:
                _kill_supervisor(self.cm.cluster_instance_num)
            except Exception:
                pass

            try:
                cluster_obj = cluster_nodes.start_cluster(
                    cmd=str(startup_files.start_script),
                    args=startup_files.start_script_args)
            except Exception as err:
                LOGGER.error(f"Failed to start cluster: {err}")
                excp = err
            else:
                break
        else:
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\ncluster dead"
            )
            if not helpers.IS_XDIST:
                pytest.exit(msg=f"Failed to start cluster, exception: {excp}",
                            returncode=1)
            open(self.cm.instance_dir / CLUSTER_DEAD_FILE, "a").close()
            return False

        # setup faucet addresses
        tmp_path = Path(self.cm.tmp_path_factory.mktemp("addrs_data"))
        cluster_nodes.setup_test_addrs(cluster_obj, tmp_path)

        # create file that indicates that the cluster is running
        if not cluster_running_file.exists():
            open(cluster_running_file, "a").close()

        return True