Example #1
    def __init__(self,
                 session_context,
                 module=None,
                 cls=None,
                 function=None,
                 injected_args=None):
        """
        :type session_context: ducktape.tests.session.SessionContext
        """
        self.module = module
        self.cls = cls
        self.function = function
        self.injected_args = injected_args

        self.session_context = session_context
        self.services = ServiceRegistry()

        # dict for toggling service log collection on/off
        self.log_collect = {}

        # Individual test results go here
        mkdir_p(self.results_dir)

        self._logger_configured = False
        self.configure_logger()
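
Example #1 relies on a mkdir_p helper to create the per-test results directory. A minimal sketch of such a helper, behaving like "mkdir -p" (an assumption for illustration; the real ducktape utility may differ):

    import errno
    import os

    def mkdir_p(path):
        # Create path and any missing parent directories, tolerating
        # the case where the directory already exists.
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST or not os.path.isdir(path):
                raise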
Example #2
    def __init__(self, **kwargs):
        """
        :param session_context: ducktape.tests.session.SessionContext
        :param module: name of the module containing the test class/method
        :param cls: class object containing the test method
        :param function: the test method
        :param injected_args: a dict containing keyword args which will be passed to the test method
        :param service_registry:
        :param cluster_use_metadata: dict containing information about how this test will use cluster resources;
               to date, this only includes "num_nodes"
        self.session_context = kwargs.get("session_context")
        self.module = kwargs.get("module")
        self.cls = kwargs.get("cls")
        self.function = kwargs.get("function")
        self.injected_args = kwargs.get("injected_args")
        self.ignore = kwargs.get("ignore", False)

        # cluster_use_metadata is a dict containing information about how this test will use cluster resources
        # to date, this only includes "num_nodes"
        self.cluster_use_metadata = copy.copy(kwargs.get("cluster_use_metadata", {}))

        self.services = ServiceRegistry()

        # dict for toggling service log collection on/off
        self.log_collect = {}
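
Example #2 trades Example #1's explicit keyword signature for **kwargs. A standalone toy sketch (not ducktape code) of how dict.get makes every constructor argument optional:

    class Ctx(object):
        def __init__(self, **kwargs):
            # Absent keys fall back to None (or to the supplied default),
            # so callers may pass any subset of the arguments.
            self.module = kwargs.get("module")
            self.ignore = kwargs.get("ignore", False)

    ctx = Ctx(module="my_tests")
    assert ctx.module == "my_tests" and ctx.ignore is False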
Example #3
    def __init__(self, **kwargs):
        """
        :param session_context: ducktape.tests.session.SessionContext
        :param module: name of the module containing the test class/method
        :param cls: class object containing the test method
        :param function: the test method
        :param injected_args: a dict containing keyword args which will be passed to the test method
        """
        self.session_context = kwargs.get("session_context", None)
        self.module = kwargs.get("module", None)
        self.cls = kwargs.get("cls", None)
        self.function = kwargs.get("function", None)
        self.injected_args = kwargs.get("injected_args", None)
        self.ignore = kwargs.get("ignore", False)

        self.services = ServiceRegistry()

        # dict for toggling service log collection on/off
        self.log_collect = {}
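
The only stylistic difference from Example #2's lookups is the explicit None default; the two spellings behave identically:

    kwargs = {}
    assert kwargs.get("cls") is None          # default already is None
    assert kwargs.get("cls", None) is None    # explicit None, same result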
Example #4
    def __init__(self, **kwargs):
        """
        :param session_context: ducktape.tests.session.SessionContext
        :param cluster: the cluster object which will be used by this test
        :param module: name of the module containing the test class/method
        :param cls: class object containing the test method
        :param function: the test method
        :param file: file containing this module
        :param injected_args: a dict containing keyword args which will be passed to the test method
        :param cluster_use_metadata: dict containing information about how this test will use cluster resources,
               to date, this only includes "num_nodes"
        """

        self.session_context = kwargs.get("session_context")
        self.cluster = kwargs.get("cluster")
        self.module = kwargs.get("module")

        if kwargs.get("file") is not None:
            self.file = os.path.abspath(kwargs.get("file"))
        else:
            self.file = None
        self.cls = kwargs.get("cls")
        self.function = kwargs.get("function")
        self.injected_args = kwargs.get("injected_args")
        self.ignore = kwargs.get("ignore", False)

        # cluster_use_metadata is a dict containing information about how this test will use cluster resources
        # to date, this only includes "num_nodes"
        self.cluster_use_metadata = copy.copy(
            kwargs.get("cluster_use_metadata", {}))

        self.services = ServiceRegistry()
        self.test_index = None

        # dict for toggling service log collection on/off
        self.log_collect = {}

        self._logger = None
        self._local_scratch_dir = None
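
Example #4 copies cluster_use_metadata with copy.copy before storing it. A small sketch of why: the shallow copy decouples the stored dict from later top-level mutation by the caller (illustrative values only):

    import copy

    caller_metadata = {"num_nodes": 4}
    stored = copy.copy(caller_metadata)   # shallow copy of the top-level dict
    caller_metadata["num_nodes"] = 99     # caller mutates its own dict later
    assert stored["num_nodes"] == 4       # the stored copy is unaffected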
Example #5
    def __init__(self, session_context, module=None, cls=None, function=None, config=None):
        """
        :type session_context: ducktape.tests.session.SessionContext
        """
        self.module = module
        self.cls = cls
        self.function = function
        self.config = config
        self.session_context = session_context
        self.cluster = session_context.cluster
        self.services = ServiceRegistry()

        # dict for toggling service log collection on/off
        self.log_collect = {}

        self.results_dir = self.session_context.results_dir
        if self.cls is not None:
            self.results_dir = os.path.join(self.results_dir, self.cls.__name__)
        if self.function is not None:
            self.results_dir = os.path.join(self.results_dir, self.function.__name__)
        mkdir_p(self.results_dir)

        self._logger_configured = False
        self.configure_logger()
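
Example #5 nests the results directory first by class name, then by method name. A toy sketch of the resulting layout (hypothetical test names, not ducktape code):

    import os

    class ExampleTest(object):
        def test_something(self):
            pass

    results_dir = os.path.join("results", ExampleTest.__name__,
                               ExampleTest.test_something.__name__)
    assert results_dir == os.path.join("results", "ExampleTest", "test_something")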
Example #6
    def run(self):
        self.log(logging.INFO, "Loading test %s" % str(self.test_metadata))
        self.test_context = self._collect_test_context(**self.test_metadata)
        self.test_context.test_index = self.test_index

        self.send(self.message.running())
        if self.test_context.ignore:
            # Skip running this test, but keep track of the fact that we ignored it
            result = TestResult(self.test_context,
                                self.test_index,
                                self.session_context,
                                test_status=IGNORE,
                                start_time=time.time(),
                                stop_time=time.time())
            result.report()
            # Tell the server we are finished
            self.send(self.message.finished(result=result))
            return

        start_time = -1
        stop_time = -1
        test_status = FAIL
        summary = []
        data = None
        self.all_services = ServiceRegistry()

        num_runs = 0

        try:
            while test_status == FAIL and num_runs < self.deflake_num:
                num_runs += 1
                self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num))
                start_time = time.time()
                test_status, summary, data = self._do_run(num_runs)

                if test_status == PASS and num_runs > 1:
                    test_status = FLAKY

                msg = str(test_status.to_json())
                if summary:
                    msg += ": {}".format(summary)
                if num_runs != self.deflake_num:
                    msg += "\n" + "~" * max(len(line) for line in summary.split('\n'))

                self.log(logging.INFO, msg)

        finally:
            stop_time = time.time()

            test_status, summary = self._check_cluster_utilization(test_status, summary)

            if num_runs > 1:
                # for reporting purposes, report all services
                self.test_context.services = self.all_services
            # for flaky tests, we report the start and end time of the successful run, and not the whole run period
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status,
                summary,
                data,
                start_time,
                stop_time)

            self.log(logging.INFO, "Data: %s" % str(result.data))

            result.report()
            # Tell the server we are finished
            self._do_safely(lambda: self.send(self.message.finished(result=result)),
                            "Problem sending FINISHED message for " + str(self.test_metadata) + ":\n")
            # Release test_context resources only after creating the result and finishing logging activity
            # The Sender object uses the same logger, so we postpone closing until after the finished message is sent
            self.test_context.close()
            self.all_services = None
            self.test_context = None
            self.test = None
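
The control flow of the retry ("deflake") loop above can be isolated in a few lines. A self-contained sketch, assuming the same three status values, showing how a pass that needed more than one attempt is downgraded to FLAKY:

    PASS, FAIL, FLAKY = "pass", "fail", "flaky"

    def run_with_deflake(do_run, deflake_num):
        # Re-run while the test fails, up to deflake_num attempts.
        status, num_runs = FAIL, 0
        while status == FAIL and num_runs < deflake_num:
            num_runs += 1
            status = do_run()
        # A late pass means the test is flaky rather than cleanly passing.
        if status == PASS and num_runs > 1:
            status = FLAKY
        return status, num_runs

    outcomes = iter([FAIL, FAIL, PASS])
    assert run_with_deflake(lambda: next(outcomes), 5) == (FLAKY, 3)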
Example #7
class RunnerClient(object):
    """Run a single test"""

    def __init__(self, server_hostname, server_port, test_id,
                 test_index, logger_name, log_dir, debug, fail_bad_cluster_utilization, deflake_num):
        signal.signal(signal.SIGTERM, self._sigterm_handler)  # register a SIGTERM handler

        self.serde = SerDe()
        self.logger = test_logger(logger_name, log_dir, debug)
        self.runner_port = server_port

        self.fail_bad_cluster_utilization = fail_bad_cluster_utilization
        self.test_id = test_id
        self.test_index = test_index
        self.id = "test-runner-%d-%d" % (os.getpid(), id(self))
        self.message = ClientEventFactory(self.test_id, self.test_index, self.id)
        self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger)

        ready_reply = self.sender.send(self.message.ready())
        self.session_context = ready_reply["session_context"]
        self.test_metadata = ready_reply["test_metadata"]
        self.cluster = ready_reply["cluster"]

        self.deflake_num = deflake_num

        # Wait to instantiate the test object until running the test
        self.test = None
        self.test_context = None
        self.all_services = None

    def send(self, event):
        return self.sender.send(event)

    def _sigterm_handler(self, signum, frame):
        """Translate SIGTERM to SIGINT on this process

        python will treat SIGINT as a Keyboard exception. Exception handling does the rest.
        """
        os.kill(os.getpid(), signal.SIGINT)

    def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args):
        loader = TestLoader(self.session_context, self.logger, injected_args=injected_args, cluster=self.cluster)
        # TODO: deal with this in a more graceful fashion.
        #       In the unlikely event that discover either raises an exception or fails to find exactly one test,
        #       we should probably continue trying other tests rather than killing this process.
        loaded_context_list = loader.discover(directory, file_name, cls_name, method_name)

        assert len(loaded_context_list) == 1
        test_context = loaded_context_list[0]
        test_context.cluster = self.cluster
        return test_context

    def run(self):
        self.log(logging.INFO, "Loading test %s" % str(self.test_metadata))
        self.test_context = self._collect_test_context(**self.test_metadata)
        self.test_context.test_index = self.test_index

        self.send(self.message.running())
        if self.test_context.ignore:
            # Skip running this test, but keep track of the fact that we ignored it
            result = TestResult(self.test_context,
                                self.test_index,
                                self.session_context,
                                test_status=IGNORE,
                                start_time=time.time(),
                                stop_time=time.time())
            result.report()
            # Tell the server we are finished
            self.send(self.message.finished(result=result))
            return

        start_time = -1
        stop_time = -1
        test_status = FAIL
        summary = []
        data = None
        self.all_services = ServiceRegistry()

        num_runs = 0

        try:
            while test_status == FAIL and num_runs < self.deflake_num:
                num_runs += 1
                self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num))
                start_time = time.time()
                test_status, summary, data = self._do_run(num_runs)

                if test_status == PASS and num_runs > 1:
                    test_status = FLAKY

                msg = str(test_status.to_json())
                if summary:
                    msg += ": {}".format(summary)
                if num_runs != self.deflake_num:
                    msg += "\n" + "~" * max(len(line) for line in summary.split('\n'))

                self.log(logging.INFO, msg)

        finally:
            stop_time = time.time()

            test_status, summary = self._check_cluster_utilization(test_status, summary)

            if num_runs > 1:
                # for reporting purposes, report all services
                self.test_context.services = self.all_services
            # for flaky tests, we report the start and end time of the successful run, and not the whole run period
            result = TestResult(
                self.test_context,
                self.test_index,
                self.session_context,
                test_status,
                summary,
                data,
                start_time,
                stop_time)

            self.log(logging.INFO, "Data: %s" % str(result.data))

            result.report()
            # Tell the server we are finished
            self._do_safely(lambda: self.send(self.message.finished(result=result)),
                            "Problem sending FINISHED message for " + str(self.test_metadata) + ":\n")
            # Release test_context resources only after creating the result and finishing logging activity
            # The Sender object uses the same logger, so we postpone closing until after the finished message is sent
            self.test_context.close()
            self.all_services = None
            self.test_context = None
            self.test = None

    def _do_run(self, num_runs):
        test_status = FAIL
        summary = []
        data = None
        sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_num > 1 else service_id_factory
        try:
            # Results from this test, as well as logs will be dumped here
            mkdir_p(TestContext.results_dir(self.test_context, self.test_index))
            # Instantiate test
            self.test = self.test_context.cls(self.test_context)
            # Check if there are enough nodes
            self._check_min_cluster_spec()
            # Run the test unit

            self.setup_test()

            data = self.run_test()

            test_status = PASS

        except BaseException as e:
            # mark the test as failed before doing anything else
            test_status = FAIL
            err_trace = self._exc_msg(e)
            summary.append(err_trace)

        finally:
            for service in self.test_context.services:
                service.service_id_factory = sid_factory
                self.all_services.append(service)

            self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status)

            if hasattr(self.test_context, "services"):
                service_errors = self.test_context.services.errors()
                if service_errors:
                    summary.extend(["\n\n", service_errors])

            # free nodes
            if self.test:
                self.log(logging.DEBUG, "Freeing nodes...")
                self._do_safely(self.test.free_nodes, "Error freeing nodes:")
            return test_status, "".join(summary), data

    def _check_min_cluster_spec(self):
        self.log(logging.DEBUG, "Checking if there are enough nodes...")
        min_cluster_spec = self.test.min_cluster_spec()
        # Count how many nodes of each operating system the test requires
        os_to_num_nodes = {}
        for node_spec in min_cluster_spec:
            os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes.get(node_spec.operating_system, 0) + 1
        for (operating_system, node_count) in iteritems(os_to_num_nodes):
            num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system)))
            if node_count > num_avail:
                raise RuntimeError(
                    "There are not enough nodes available in the cluster to run this test. "
                    "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" %
                    (operating_system, num_avail, node_count, self.test_context.services))

    def _check_cluster_utilization(self, result, summary):
        """Checks if the number of nodes used by a test is less than the number of
        nodes requested by the test. If this is the case and we wish to fail
        on bad cluster utilization, the result value is failed. Will also print
        a warning if the test passes and the node utilization doesn't match.
        """
        max_used = self.cluster.max_used()
        total = len(self.cluster.all())
        if max_used < total:
            message = "Test requested %d nodes, used only %d" % (total, max_used)
            if self.fail_bad_cluster_utilization:
                # only check node utilization on test pass
                if result == PASS or result == FLAKY:
                    self.log(logging.INFO, "FAIL: " + message)
                    result = FAIL
                    summary += message
            else:
                self.log(logging.WARN, message)
        return result, summary

    def setup_test(self):
        """start services etc"""
        self.log(logging.INFO, "Setting up...")
        self.test.setup()

    def run_test(self):
        """Run the test!

        We expect test_context.function to be a function or unbound method which takes an
        instantiated test object as its argument.
        """
        self.log(logging.INFO, "Running...")
        return self.test_context.function(self.test)

    def _exc_msg(self, e):
        return repr(e) + "\n" + traceback.format_exc(limit=16)

    def _do_safely(self, action, err_msg):
        try:
            action()
        except BaseException as e:
            self.log(logging.WARN, err_msg + " " + self._exc_msg(e))

    def teardown_test(self, teardown_services=True, test_status=None):
        """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes.

        Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner
        should stop if a keyboard interrupt is caught.
        """
        self.log(logging.INFO, "Tearing down...")
        if not self.test:
            self.log(logging.WARN, "%s failed to instantiate" % self.test_id)
            self.test_context.close()
            return

        services = self.test_context.services

        if teardown_services:
            self._do_safely(self.test.teardown, "Error running teardown method:")
            # stop services
            self._do_safely(services.stop_all, "Error stopping services:")

        # always collect service logs whether or not we tear down
        # logs are typically removed during "clean" phase, so collect logs before cleaning
        self.log(logging.DEBUG, "Copying logs from services...")
        self._do_safely(lambda: self.test.copy_service_logs(test_status), "Error copying service logs:")

        # clean up stray processes and persistent state
        if teardown_services:
            self.log(logging.DEBUG, "Cleaning up services...")
            self._do_safely(services.clean_all, "Error cleaning services:")

    def log(self, log_level, msg, *args, **kwargs):
        """Log to the service log and the test log of the current test."""

        if self.test_context is None:
            msg = "%s: %s" % (self.__class__.__name__, str(msg))
            self.logger.log(log_level, msg, *args, **kwargs)
        else:
            msg = "%s: %s: %s" % (self.__class__.__name__, self.test_context.test_name, str(msg))
            self.logger.log(log_level, msg, *args, **kwargs)

        self.send(self.message.log(msg, level=log_level))
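
A note on the _sigterm_handler in Example #7: translating SIGTERM into SIGINT lets ordinary KeyboardInterrupt handling cover both signals. A minimal POSIX-only sketch of the same trick:

    import os
    import signal

    def _sigterm_handler(signum, frame):
        # Re-deliver the signal as SIGINT; Python's default SIGINT
        # handling raises KeyboardInterrupt in the main thread.
        os.kill(os.getpid(), signal.SIGINT)

    signal.signal(signal.SIGTERM, _sigterm_handler)

    try:
        os.kill(os.getpid(), signal.SIGTERM)  # simulate an external terminate
        signal.pause()                        # wait for the signal to land
    except KeyboardInterrupt:
        print("SIGTERM surfaced as KeyboardInterrupt")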