Example #1
    def _reuse_dev_cluster(self) -> clusterlib.ClusterLib:
        """Reuse cluster that was already started outside of test framework."""
        instance_num = 0
        self.cm._cluster_instance_num = instance_num
        cluster_nodes.set_cluster_env(instance_num)
        state_dir = cluster_nodes.get_cluster_env().state_dir

        # make sure instance dir exists
        instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
        instance_dir.mkdir(exist_ok=True, parents=True)

        cluster_obj = self.cm.cache.cluster_obj
        if not cluster_obj:
            cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()

        # set up faucet addresses
        if not (state_dir / cluster_nodes.ADDRS_DATA).exists():
            tmp_path = state_dir / "addrs_data"
            tmp_path.mkdir(exist_ok=True, parents=True)
            cluster_nodes.setup_test_addrs(cluster_obj, tmp_path)

        # check if it is necessary to reload data
        self._reload_cluster_obj(state_dir=state_dir)

        return cluster_obj
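
The faucet-address setup above runs only once per state directory, guarded by the
presence of the `ADDRS_DATA` path. A minimal, framework-free sketch of that
marker-guarded pattern; the helper name `ensure_initialized` and the `init`
callback are illustrative assumptions, not part of the framework:

    from pathlib import Path
    from typing import Callable

    def ensure_initialized(state_dir: Path, marker: str,
                           init: Callable[[Path], None]) -> None:
        """Run `init` only if the `marker` path is not yet present."""
        if (state_dir / marker).exists():
            return  # an earlier run already did the setup
        work_dir = state_dir / "addrs_data"
        work_dir.mkdir(exist_ok=True, parents=True)
        init(work_dir)
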
Example #2
    def stop_all_clusters(self) -> None:
        """Stop all cluster instances."""
        self._log("called `stop_all_clusters`")
        for instance_num in range(self.num_of_instances):
            instance_dir = self.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
            if (not (instance_dir / CLUSTER_RUNNING_FILE).exists()
                    or (instance_dir / CLUSTER_STOPPED_FILE).exists()):
                self._log(f"cluster instance {instance_num} not running")
                continue

            startup_files = cluster_nodes.get_cluster_type(
            ).cluster_scripts.prepare_scripts_files(
                destdir=self._create_startup_files_dir(instance_num),
                instance_num=instance_num,
            )
            cluster_nodes.set_cluster_env(instance_num)
            self._log(
                f"stopping cluster instance {instance_num} with `{startup_files.stop_script}`"
            )

            state_dir = cluster_nodes.get_cluster_env().state_dir

            try:
                cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
            except Exception as exc:
                LOGGER.error(f"While stopping cluster: {exc}")

            cli_coverage.save_start_script_coverage(
                log_file=state_dir / CLUSTER_START_CMDS_LOG,
                pytest_config=self.pytest_config,
            )
            cluster_nodes.save_cluster_artifacts(
                artifacts_dir=self.pytest_tmp_dir, clean=True)
            open(instance_dir / CLUSTER_STOPPED_FILE, "a").close()
            self._log(f"stopped cluster instance {instance_num}")
Example #3
    def get(  # noqa: C901
        self,
        mark: str = "",
        lock_resources: Iterable[str] = (),
        use_resources: Iterable[str] = (),
        cleanup: bool = False,
        start_cmd: str = "",
    ) -> clusterlib.ClusterLib:
        """Return the `clusterlib.ClusterLib` instance once we can start the test.

        It checks the current conditions and waits if they don't allow the test to
        start right away.
        """
        # pylint: disable=too-many-statements,too-many-branches
        assert not isinstance(
            lock_resources,
            str), "`lock_resources` must be a sequence of strings"
        assert not isinstance(
            use_resources, str), "`use_resources` must be a sequence of strings"

        if configuration.DEV_CLUSTER_RUNNING:
            if start_cmd:
                LOGGER.warning(
                    f"Ignoring the '{start_cmd}' cluster start command as "
                    "'DEV_CLUSTER_RUNNING' is set.")
            # check if the development cluster instance is ready by now so we don't
            # need to obtain the cluster lock when it's not necessary
            if not self._is_dev_cluster_ready():
                with locking.FileLockIfXdist(self.cm.cluster_lock):
                    self._setup_dev_cluster()

        if configuration.FORBID_RESTART and start_cmd:
            raise RuntimeError(
                "Cannot use custom start command when 'FORBID_RESTART' is set."
            )

        if start_cmd:
            if not (mark or (Resources.CLUSTER in lock_resources)):
                raise RuntimeError(
                    "Custom start command can be used only together with singleton or `mark`."
                )
            # always clean after test(s) that started cluster with custom configuration
            cleanup = True

        # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
        # list of `use_resources`.
        use_resources = list(
            set(use_resources).union({Resources.CLUSTER}) -
            set(lock_resources))

        cget_status = ClusterGetStatus(
            mark=mark,
            lock_resources=lock_resources,
            use_resources=use_resources,
            cleanup=cleanup,
            start_cmd=start_cmd,
            current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
        )
        marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

        self.cm._log(f"want to run test '{cget_status.current_test}'")

        # iterate until it is possible to start the test
        while True:
            if cget_status.restart_ready:
                self._restart(start_cmd=start_cmd)

            if not cget_status.first_iteration:
                xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

            # nothing time-consuming can go under this lock as all other workers will need to wait
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                if self._is_already_running(cget_status):
                    if not self.cm.cache.cluster_obj:
                        raise AssertionError(
                            "`cluster_obj` is not available, which cannot happen")
                    return self.cm.cache.cluster_obj

                # needs to be set here, before the first `continue`
                cget_status.first_iteration = False
                self.cm._cluster_instance_num = -1

                # try all existing cluster instances
                for instance_num in range(self.cm.num_of_instances):
                    # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                    if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                        continue

                    # if instance to run the test on was already decided, skip all other instances
                    # pylint: disable=consider-using-in
                    if (cget_status.selected_instance != -1
                            and instance_num != cget_status.selected_instance):
                        continue

                    cget_status.instance_num = instance_num
                    cget_status.instance_dir = (
                        self.cm.pytest_tmp_dir /
                        f"{CLUSTER_DIR_TEMPLATE}{instance_num}")
                    cget_status.instance_dir.mkdir(exist_ok=True)

                    # cleanup cluster instance where attempt to start cluster failed repeatedly
                    if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                        self._cleanup_dead_clusters(cget_status)
                        continue

                    # cluster restart planned or in progress, so no new tests can start
                    if self._restarted_by_other_worker(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # are there tests already running on this cluster instance?
                    cget_status.started_tests_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_RUNNING_GLOB}_*"))

                    # "marked tests" = group of tests marked with a specific mark.
                    # While these tests are running, no unmarked test can start.
                    cget_status.marked_starting_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_MARK_STARTING_GLOB}_*"))
                    cget_status.marked_running_sfiles = list(
                        cget_status.instance_dir.glob(
                            f"{TEST_CURR_MARK_GLOB}_*"))

                    # if marked tests are already running, update their status
                    self._update_marked_tests(
                        marked_tests_cache=marked_tests_cache,
                        cget_status=cget_status)

                    # test has mark
                    if mark:
                        # select this instance for running marked tests if possible
                        if not self._marked_select_instance(cget_status):
                            cget_status.sleep_delay = 2
                            continue

                        # check if we need to wait until unmarked tests are finished
                        if (not cget_status.marked_running_sfiles
                                and cget_status.started_tests_sfiles):
                            cget_status.sleep_delay = 10
                            continue

                        self.cm._log(
                            f"c{instance_num}: in marked tests branch, "
                            f"I have required mark '{mark}'")

                    # no unmarked test can run while marked tests are starting or running
                    elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                        self.cm._log(
                            f"c{instance_num}: marked tests starting or running, "
                            "I don't have a mark")
                        cget_status.sleep_delay = 2
                        continue

                    # check availability of the required resources
                    if not self._are_resources_available(cget_status):
                        cget_status.sleep_delay = 5
                        continue

                    # if restart is needed, indicate that the cluster will be restarted
                    # (after all currently running tests are finished)
                    if not self._init_restart(cget_status):
                        continue

                    # we've found suitable cluster instance
                    cget_status.selected_instance = instance_num
                    self.cm._cluster_instance_num = instance_num
                    self.cm._log(
                        f"c{instance_num}: can run test '{cget_status.current_test}'"
                    )
                    # set environment variables that are needed when restarting the cluster
                    # and running tests
                    cluster_nodes.set_cluster_env(instance_num)

                    # if needed, finish restart related actions
                    if not self._finish_restart(cget_status):
                        continue

                    # from this point on, all conditions needed to start the test are met
                    break
                else:
                    # if the test cannot start on any instance, return to top-level loop
                    continue

                self._create_test_status_files(cget_status)

                # Check if it is necessary to reload data. This still needs to happen under
                # global lock.
                state_dir = cluster_nodes.get_cluster_env().state_dir
                self._reload_cluster_obj(state_dir=state_dir)

                # cluster is ready, we can start the test
                break

        cluster_obj = self.cm.cache.cluster_obj
        if not cluster_obj:
            raise AssertionError(
                "`cluster_obj` is not available, which cannot happen")
        cluster_obj.cluster_id = instance_num
        cluster_obj._cluster_manager = self.cm  # type: ignore

        return cluster_obj
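
A hypothetical caller of this `get`, assuming the manager is exposed to tests as a
pytest fixture named `cluster_manager` (the fixture name, the mark, and the resource
and command names are illustrative, not confirmed by the code above):

    import pytest

    @pytest.mark.needs_pool
    def test_with_locked_pool(cluster_manager) -> None:
        # lock "pool1" for exclusive use; `cleanup` is forced on because a
        # custom `start_cmd` is passed together with a mark
        cluster = cluster_manager.get(
            mark="needs_pool",
            lock_resources=["pool1"],
            start_cmd="start-cluster-custom",  # illustrative command
        )
        assert cluster.cluster_id >= 0
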
Example #4
    def get(  # noqa: C901
        self,
        singleton: bool = False,
        mark: str = "",
        lock_resources: UnpackableSequence = (),
        use_resources: UnpackableSequence = (),
        cleanup: bool = False,
        start_cmd: str = "",
    ) -> clusterlib.ClusterLib:
        """Return the `clusterlib.ClusterLib` instance once we can start the test.

        It checks the current conditions and waits if they don't allow the test to
        start right away.
        """
        # pylint: disable=too-many-statements,too-many-branches,too-many-locals

        # don't start new cluster if it was already started outside of test framework
        if DEV_CLUSTER_RUNNING:
            if start_cmd:
                LOGGER.warning(
                    f"Ignoring the '{start_cmd}' cluster start command as "
                    "'DEV_CLUSTER_RUNNING' is set.")
            return self._reuse_dev_cluster()

        if FORBID_RESTART and start_cmd:
            raise RuntimeError(
                "Cannot use custom start command when 'FORBID_RESTART' is set."
            )

        selected_instance = -1
        restart_here = False
        restart_ready = False
        first_iteration = True
        sleep_delay = 1
        marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

        if start_cmd:
            if not (singleton or mark):
                raise AssertionError(
                    "Custom start command can be used only together with `singleton` or `mark`"
                )
            # always clean after test(s) that started cluster with custom configuration
            cleanup = True

        # iterate until it is possible to start the test
        while True:
            if restart_ready:
                self._restart(start_cmd=start_cmd)

            if not first_iteration:
                helpers.xdist_sleep(random.random() * sleep_delay)

            # nothing time-consuming can go under this lock as it will block all other workers
            with helpers.FileLockIfXdist(self.cm.cluster_lock):
                test_on_worker = list(
                    self.cm.lock_dir.glob(
                        f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
                    ))

                # test is already running, nothing to set up
                if (first_iteration and test_on_worker
                        and self.cm._cluster_instance_num != -1
                        and self.cm.cache.cluster_obj):
                    self.cm._log(f"{test_on_worker[0]} already exists")
                    return self.cm.cache.cluster_obj

                first_iteration = False  # needs to be set here, before the first `continue`
                self.cm._cluster_instance_num = -1

                # try all existing cluster instances
                for instance_num in range(self.cm.num_of_instances):
                    # if instance to run the test on was already decided, skip all other instances
                    # pylint: disable=consider-using-in
                    if selected_instance != -1 and instance_num != selected_instance:
                        continue

                    instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                    instance_dir.mkdir(exist_ok=True)

                    # if the selected instance failed to start, move on to other instance
                    if (instance_dir / CLUSTER_DEAD_FILE).exists():
                        selected_instance = -1
                        restart_here = False
                        restart_ready = False
                        # remove status files that are checked by other workers
                        for sf in (
                                *instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"),
                                *instance_dir.glob(
                                    f"{TEST_MARK_STARTING_GLOB}_*"),
                        ):
                            os.remove(sf)

                        dead_clusters = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{CLUSTER_DEAD_FILE}")
                        )
                        if len(dead_clusters) == self.cm.num_of_instances:
                            raise RuntimeError(
                                "All clusters are dead, cannot run.")
                        continue

                    # singleton test is running, so no other test can be started
                    if (instance_dir / TEST_SINGLETON_FILE).exists():
                        self.cm._log(
                            f"c{instance_num}: singleton test in progress, cannot run"
                        )
                        sleep_delay = 5
                        continue

                    restart_in_progress = list(
                        instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                    # cluster restart planned, no new tests can start
                    if not restart_here and restart_in_progress:
                        # no log message here, there would be too many of them
                        sleep_delay = 5
                        continue

                    started_tests = list(
                        instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                    # "marked tests" = group of tests marked with a specific mark.
                    # While these tests are running, no unmarked test can start.
                    marked_starting = list(
                        instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"))
                    marked_running = list(
                        instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))

                    if mark:
                        marked_running_my = (
                            instance_dir /
                            f"{TEST_CURR_MARK_GLOB}_{mark}").exists()
                        marked_starting_my = list(
                            instance_dir.glob(
                                f"{TEST_MARK_STARTING_GLOB}_{mark}_*"))

                        marked_running_my_anywhere = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{TEST_CURR_MARK_GLOB}_{mark}"
                            ))
                        # check if tests with my mark are running on some other cluster instance
                        if not marked_running_my and marked_running_my_anywhere:
                            self.cm._log(
                                f"c{instance_num}: tests marked with my mark '{mark}' "
                                "already running on other cluster instance, cannot run"
                            )
                            continue

                        marked_starting_my_anywhere = list(
                            self.cm.lock_dir.glob(
                                f"{CLUSTER_DIR_TEMPLATE}*/{TEST_MARK_STARTING_GLOB}_{mark}_*"
                            ))
                        # check if tests with my mark are starting on some other cluster instance
                        if not marked_starting_my and marked_starting_my_anywhere:
                            self.cm._log(
                                f"c{instance_num}: tests marked with my mark '{mark}' starting "
                                "on other cluster instance, cannot run")
                            continue

                        # check if this test has the same mark as currently running marked tests
                        if marked_running_my or marked_starting_my:
                            # lock to this cluster instance
                            selected_instance = instance_num
                        elif marked_running or marked_starting:
                            self.cm._log(
                                f"c{instance_num}: tests marked with other mark starting "
                                f"or running, I have different mark '{mark}'")
                            continue

                        # check if we need to wait until the marked tests can run
                        if marked_starting_my and started_tests:
                            self.cm._log(
                                f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                            )
                            sleep_delay = 2
                            continue

                    # no unmarked test can run while marked tests are starting or running
                    elif marked_running or marked_starting:
                        self.cm._log(
                            f"c{instance_num}: marked tests starting or running, "
                            "I don't have a mark")
                        sleep_delay = 5
                        continue

                    # is this the first marked test that wants to run?
                    initial_marked_test = bool(mark and not marked_running)

                    # indicate that it is planned to start marked tests as soon as
                    # all currently running tests are finished or the cluster is restarted
                    if initial_marked_test:
                        # lock to this cluster instance
                        selected_instance = instance_num
                        mark_starting_file = (
                            instance_dir /
                            f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.cm.worker_id}"
                        )
                        if not mark_starting_file.exists():
                            open(
                                mark_starting_file,
                                "a",
                            ).close()
                        if started_tests:
                            self.cm._log(
                                f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                            )
                            sleep_delay = 3
                            continue

                    # get marked tests status
                    marked_tests_status = self._get_marked_tests_status(
                        cache=marked_tests_cache, instance_num=instance_num)

                    # marked tests are already running
                    if marked_running:
                        active_mark_file = marked_running[0].name

                        # update marked tests status
                        self._update_marked_tests(
                            marked_tests_status=marked_tests_status,
                            active_mark_name=active_mark_file,
                            started_tests=started_tests,
                            instance_num=instance_num,
                        )

                        self.cm._log(
                            f"c{instance_num}: in marked tests branch, "
                            f"I have required mark '{mark}'")

                    # reset counter of cycles with no marked test running
                    marked_tests_status.no_marked_tests_iter = 0

                    # this test is a singleton - no other test can run while this one is running
                    if singleton and started_tests:
                        self.cm._log(
                            f"c{instance_num}: tests are running, cannot start singleton"
                        )
                        sleep_delay = 5
                        continue

                    # this test wants to lock some resources, check if these are not
                    # locked or in use
                    if lock_resources:
                        res_usable = self._are_resources_usable(
                            resources=lock_resources,
                            instance_dir=instance_dir,
                            instance_num=instance_num,
                        )
                        if not res_usable:
                            sleep_delay = 5
                            continue

                    # filter out `lock_resources` from the list of `use_resources`
                    if use_resources and lock_resources:
                        use_resources = list(
                            set(use_resources) - set(lock_resources))

                    # this test wants to use some resources, check if these are not locked
                    if use_resources:
                        res_locked = self._are_resources_locked(
                            resources=use_resources,
                            instance_dir=instance_dir,
                            instance_num=instance_num,
                        )
                        if res_locked:
                            sleep_delay = 5
                            continue

                    # indicate that the cluster will be restarted
                    new_cmd_restart = bool(start_cmd and
                                           (initial_marked_test or singleton))
                    if not restart_here and (
                            new_cmd_restart
                            or self._is_restart_needed(instance_num)):
                        if started_tests:
                            self.cm._log(
                                f"c{instance_num}: tests are running, cannot restart"
                            )
                            continue

                        # Cluster restart will be performed by this worker.
                        # By setting `restart_here`, we make sure this worker continues
                        # on this cluster instance after the restart. This is important
                        # because the `start_cmd` used for starting the cluster might
                        # be specific to the test.
                        restart_here = True
                        self.cm._log(
                            f"c{instance_num}: setting to restart cluster")
                        selected_instance = instance_num
                        restart_in_progress_file = (
                            instance_dir /
                            f"{RESTART_IN_PROGRESS_GLOB}_{self.cm.worker_id}")
                        if not restart_in_progress_file.exists():
                            open(restart_in_progress_file, "a").close()

                    # we've found suitable cluster instance
                    selected_instance = instance_num
                    self.cm._cluster_instance_num = instance_num
                    cluster_nodes.set_cluster_env(instance_num)

                    if restart_here:
                        if restart_ready:
                            # The cluster was already restarted if we are here and
                            # `restart_ready` is still True.
                            restart_ready = False

                            # Remove status files that are no longer valid after restart.
                            for f in instance_dir.glob(
                                    f"{RESTART_IN_PROGRESS_GLOB}_*"):
                                os.remove(f)
                            for f in instance_dir.glob(
                                    f"{RESTART_NEEDED_GLOB}_*"):
                                os.remove(f)
                        else:
                            self.cm._log(f"c{instance_num}: calling restart")
                            # the actual `_restart` function will be called outside
                            # of global lock
                            restart_ready = True
                            continue

                    # from this point on, all conditions needed to start the test are met

                    # this test is a singleton
                    if singleton:
                        self.cm._log(f"c{instance_num}: starting singleton")
                        open(self.cm.instance_dir / TEST_SINGLETON_FILE,
                             "a").close()

                    # this test is a first marked test
                    if initial_marked_test:
                        self.cm._log(
                            f"c{instance_num}: starting '{mark}' tests")
                        open(
                            self.cm.instance_dir /
                            f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()
                        for sf in marked_starting:
                            os.remove(sf)

                    # create status file for each in-use resource
                    _ = [
                        open(
                            self.cm.instance_dir /
                            f"{RESOURCE_IN_USE_GLOB}_{r}_{self.cm.worker_id}",
                            "a",
                        ).close() for r in use_resources
                    ]

                    # create status file for each locked resource
                    _ = [
                        open(
                            self.cm.instance_dir /
                            f"{RESOURCE_LOCKED_GLOB}_{r}_{self.cm.worker_id}",
                            "a",
                        ).close() for r in lock_resources
                    ]

                    # cleanup = cluster restart after test (group of tests) is finished
                    if cleanup:
                        # cleanup after a group of tests that are marked with a marker
                        if mark:
                            self.cm._log(f"c{instance_num}: cleanup and mark")
                            open(
                                self.cm.instance_dir /
                                f"{RESTART_AFTER_MARK_GLOB}_{self.cm.worker_id}",
                                "a",
                            ).close()
                        # cleanup after single test (e.g. singleton)
                        else:
                            self.cm._log(
                                f"c{instance_num}: cleanup and not mark")
                            open(
                                self.cm.instance_dir /
                                f"{RESTART_NEEDED_GLOB}_{self.cm.worker_id}",
                                "a",
                            ).close()

                    break
                else:
                    # if the test cannot start on any instance, return to top-level loop
                    continue

                test_running_file = (
                    self.cm.instance_dir /
                    f"{TEST_RUNNING_GLOB}_{self.cm.worker_id}")
                self.cm._log(
                    f"c{self.cm.cluster_instance_num}: creating {test_running_file}"
                )
                open(test_running_file, "a").close()

                # check if it is necessary to reload data
                state_dir = cluster_nodes.get_cluster_env().state_dir
                self._reload_cluster_obj(state_dir=state_dir)

                cluster_obj = self.cm.cache.cluster_obj
                if not cluster_obj:
                    cluster_obj = cluster_nodes.get_cluster_type(
                    ).get_cluster_obj()

                # `cluster_obj` is ready, we can start the test
                break

        return cluster_obj
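
Both versions of `get` share the same coordination shape: each pytest-xdist worker
takes a global file lock, scans the cluster instances, and backs off with a jittered
sleep when the test cannot start yet. A condensed, framework-free sketch of that
loop, where `lock` and `conditions_met` stand in for `FileLockIfXdist` and the
per-instance checks:

    import random
    import time
    from typing import Callable, ContextManager

    def wait_for_slot(lock: ContextManager,
                      conditions_met: Callable[[], bool],
                      base_delay: float = 1.0) -> None:
        """Block until `conditions_met` holds; evaluate it only under `lock`."""
        first_iteration = True
        while True:
            if not first_iteration:
                # jittered sleep so parallel workers don't retry in lockstep
                time.sleep(random.random() * base_delay)
            first_iteration = False
            with lock:
                if conditions_met():
                    return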