Example #1
    def __init__(self, *, client, test_name, cluster_name, specification,
                 configuration):
        # Initialize.
        self.client = client
        self.id = test_name
        self.cluster_name = cluster_name
        self.spec = specification
        self.config = configuration
        self.failed = False

        # Initialize attribute used for memoization of connection string.
        self.__connection_string = None

        # Initialize wrapper class for running workload executor.
        self.workload_runner = DriverWorkloadSubprocessRunner()

        # Validate and store organization and project.
        self.organization = get_one_organization_by_name(
            client=self.client,
            organization_name=self.config.organization_name)
        self.project = ensure_project(
            client=self.client, project_name=self.config.project_name,
            organization_id=self.organization.id)

    def run_test(self, driver_workload):
        self.set_collection_from_workload(driver_workload)

        subprocess = DriverWorkloadSubprocessRunner()
        try:
            subprocess.spawn(workload_executor=self.WORKLOAD_EXECUTOR,
                             connection_string=self.CONNECTION_STRING,
                             driver_workload=driver_workload,
                             startup_time=self.STARTUP_TIME)
        except WorkloadExecutorError:
            outs, errs = subprocess.workload_subprocess.communicate(timeout=2)
            self.fail("The workload executor terminated prematurely before "
                      "receiving the termination signal.\n"
                      "STDOUT: {!r}\nSTDERR: {!r}".format(outs, errs))

        # Run operations for 5 seconds.
        sleep(5)

        try:
            stats = subprocess.stop()
        except WorkloadExecutorError as exc:
            self.fail("WorkloadExecutorError: %s" % exc)

        return stats
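
The statistics dictionary returned by stop() is not inspected in this first example. Judging from how Examples #3 and #4 evaluate it, a minimal pass/fail check might look like the sketch below; the numErrors, numFailures and numSuccesses keys are taken from those later examples, and the helper name is hypothetical.

def workload_succeeded(stats):
    # Hypothetical helper mirroring the pass/fail logic used in the later
    # examples: a run passes only if no errors or failures were reported
    # and at least one operation succeeded.
    return (stats['numErrors'] == 0
            and stats['numFailures'] == 0
            and stats['numSuccesses'] > 0)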
Example #3
class AtlasTestCase:
    def __init__(self, *, client, test_name, cluster_name, specification,
                 configuration):
        # Initialize.
        self.client = client
        self.id = test_name
        self.cluster_name = cluster_name
        self.spec = specification
        self.config = configuration
        self.failed = False

        # Initialize attribute used for memoization of connection string.
        self.__connection_string = None

        # Initialize wrapper class for running workload executor.
        self.workload_runner = DriverWorkloadSubprocessRunner()

        # Validate and store organization and project.
        self.organization = get_one_organization_by_name(
            client=self.client,
            organization_name=self.config.organization_name)
        self.project = ensure_project(client=self.client,
                                      project_name=self.config.project_name,
                                      organization_id=self.organization.id)

    @property
    def cluster_url(self):
        return self.client.groups[self.project.id].clusters[self.cluster_name]

    def get_connection_string(self):
        if self.__connection_string is None:
            cluster = self.cluster_url.get().data
            prefix, suffix = cluster.srvAddress.split("//")
            uri_options = self.spec.maintenancePlan.uriOptions.copy()

            # Boolean options must be converted to lowercase strings.
            for key, value in uri_options.items():
                if isinstance(value, bool):
                    uri_options[key] = str(value).lower()

            connection_string = (prefix + "//" +
                                 self.config.database_username + ":" +
                                 self.config.database_password + "@" + suffix +
                                 "/?")
            connection_string += urlencode(uri_options)
            self.__connection_string = connection_string
        return self.__connection_string

    def __repr__(self):
        return "<AtlasTestCase: {}>".format(self.id)

    def is_cluster_state(self, goal_state):
        cluster_info = self.cluster_url.get().data
        return cluster_info.stateName.lower() == goal_state.lower()

    def verify_cluster_configuration_matches(self, state):
        """Verify that the cluster config is what we expect it to be (based on
        maintenance status). Raises AssertionError."""
        state = state.lower()
        if state not in ("initial", "final"):
            raise AstrolabeTestCaseError(
                "State must be either 'initial' or 'final'.")
        cluster_config = self.cluster_url.get().data
        assert_subset(cluster_config,
                      self.spec.maintenancePlan[state].clusterConfiguration)
        process_args = self.cluster_url.processArgs.get().data
        assert_subset(process_args,
                      self.spec.maintenancePlan[state].processArgs)

    def initialize(self):
        """
        Initialize a cluster with the configuration required by the test
        specification.
        """
        LOGGER.info("Initializing cluster {!r}".format(self.cluster_name))

        cluster_config = self.spec.maintenancePlan.initial.\
            clusterConfiguration.copy()
        cluster_config["name"] = self.cluster_name
        try:
            self.client.groups[self.project.id].clusters.post(**cluster_config)
        except AtlasApiError as exc:
            if exc.error_code == 'DUPLICATE_CLUSTER_NAME':
                # Cluster already exists. Simply re-configure it.
                # Cannot send cluster name when updating existing cluster.
                cluster_config.pop("name")
                self.client.groups[self.project.id].\
                    clusters[self.cluster_name].patch(**cluster_config)
            else:
                raise

        # Apply processArgs if provided.
        process_args = self.spec.maintenancePlan.initial.processArgs
        if process_args:
            self.client.groups[self.project.id].\
                clusters[self.cluster_name].processArgs.patch(**process_args)

    def run(self, persist_cluster=False, startup_time=1):
        LOGGER.info("Running test {!r} on cluster {!r}".format(
            self.id, self.cluster_name))

        # Step-0: sanity-check the cluster configuration.
        self.verify_cluster_configuration_matches("initial")

        # Start the test timer.
        timer = Timer()
        timer.start()

        # Step-1: load test data.
        test_data = self.spec.driverWorkload.get('testData')
        if test_data:
            LOGGER.info("Loading test data on cluster {!r}".format(
                self.cluster_name))
            connection_string = self.get_connection_string()
            load_test_data(connection_string, self.spec.driverWorkload)
            LOGGER.info("Successfully loaded test data on cluster {!r}".format(
                self.cluster_name))

        # Step-2: run driver workload.
        self.workload_runner.spawn(
            workload_executor=self.config.workload_executor,
            connection_string=self.get_connection_string(),
            driver_workload=self.spec.driverWorkload,
            startup_time=startup_time)

        # Step-3: begin maintenance routine.
        final_config = self.spec.maintenancePlan.final
        cluster_config = final_config.clusterConfiguration
        process_args = final_config.processArgs

        if not cluster_config and not process_args:
            raise RuntimeError("invalid maintenance plan")

        if cluster_config:
            LOGGER.info("Pushing cluster configuration update")
            self.cluster_url.patch(**cluster_config)

        if process_args:
            LOGGER.info("Pushing process arguments update")
            self.cluster_url.processArgs.patch(**process_args)

        # Sleep before polling to give Atlas time to update cluster.stateName.
        sleep(3)

        # Step-4: wait until maintenance completes (cluster is IDLE).
        selector = BooleanCallablePoller(
            frequency=self.config.polling_frequency,
            timeout=self.config.polling_timeout)
        LOGGER.info("Waiting for cluster maintenance to complete")
        selector.poll([self],
                      attribute="is_cluster_state",
                      args=("IDLE", ),
                      kwargs={})
        self.verify_cluster_configuration_matches("final")
        LOGGER.info("Cluster maintenance complete")

        # Step-5: interrupt driver workload and capture streams
        stats = self.workload_runner.terminate()

        # Stop the timer
        timer.stop()

        # Step-6: compute xunit entry.
        junit_test = junitparser.TestCase(self.id)
        junit_test.time = timer.elapsed

        if (stats['numErrors'] != 0 or stats['numFailures'] != 0
                or stats['numSuccesses'] == 0):
            LOGGER.info("FAILED: {!r}".format(self.id))
            self.failed = True
            # Write xunit logs for failed tests.
            junit_test.result = junitparser.Failure(str(stats))
        else:
            LOGGER.info("SUCCEEDED: {!r}".format(self.id))
            # Directly log output of successful tests as xunit output
            # is only visible for failed tests.

        LOGGER.info("Workload Statistics: {}".format(stats))

        # Step 7: download logs asynchronously and delete cluster.
        # TODO: https://github.com/mongodb-labs/drivers-atlas-testing/issues/4
        if not persist_cluster:
            self.cluster_url.delete()
            LOGGER.info("Cluster {!r} marked for deletion.".format(
                self.cluster_name))

        return junit_test
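
As a rough usage sketch for the class above: the client, specification and configuration objects are placeholders for values built elsewhere in the project, and the test and cluster names are hypothetical; only the constructor signature and the initialize()/run() calls come from this example.

# Minimal sketch, assuming pre-built client/spec/config objects that expose
# the attributes the class reads (organization_name, project_name,
# workload_executor, maintenancePlan, driverWorkload, polling settings, ...).
test_case = AtlasTestCase(client=client,
                          test_name="retryReads-resizeCluster",   # hypothetical
                          cluster_name="test-cluster-1",          # hypothetical
                          specification=spec,
                          configuration=config)
test_case.initialize()          # create or re-configure the cluster
junit_test = test_case.run(persist_cluster=False, startup_time=1)
if test_case.failed:
    print("FAILED:", test_case.id)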
Example #4
class AtlasTestCase:
    def __init__(self, *, client, admin_client, test_name, cluster_name,
                 specification, configuration):
        # Initialize.
        self.client = client
        self.admin_client = admin_client
        self.id = test_name
        self.cluster_name = cluster_name
        self.spec = specification
        self.config = configuration
        self.failed = False

        # Initialize attribute used for memoization of connection string.
        self.__connection_string = None

        # Initialize wrapper class for running workload executor.
        self.workload_runner = DriverWorkloadSubprocessRunner()

        # Validate and store organization and project.
        self.organization = get_one_organization_by_name(
            client=self.client,
            organization_name=self.config.organization_name)
        self.project = ensure_project(client=self.client,
                                      project_name=self.config.project_name,
                                      organization_id=self.organization.id)

    @property
    def cluster_url(self):
        return self.client.groups[self.project.id].clusters[self.cluster_name]

    def get_connection_string(self):
        if self.__connection_string is None:
            cluster = self.cluster_url.get().data
            uri = re.sub(
                r'://', '://%s:%s@' %
                (self.config.database_username, self.config.database_password),
                cluster.srvAddress)
            self.__connection_string = uri
        return self.__connection_string

    def __repr__(self):
        return "<AtlasTestCase: {}>".format(self.id)

    def is_cluster_state(self, goal_state):
        cluster_info = self.cluster_url.get().data
        return cluster_info.stateName.lower() == goal_state.lower()

    def verify_cluster_configuration_matches(self, expected_configuration):
        """Verify that the cluster config is what we expect it to be (based on
        maintenance status). Raises AssertionError."""
        cluster_config = self.cluster_url.get().data
        assert_subset(cluster_config,
                      expected_configuration.clusterConfiguration)
        process_args = self.cluster_url.processArgs.get().data
        assert_subset(process_args, expected_configuration.processArgs)

    def initialize(self, no_create=False):
        """
        Initialize a cluster with the configuration required by the test
        specification.
        """

        if no_create:
            try:
                # If --no-create was specified and the cluster exists, skip
                # initialization. If the cluster does not exist, continue
                # with normal creation.
                self.cluster_url.get().data
                self.verify_cluster_configuration_matches(
                    self.spec.initialConfiguration)
                return
            except AtlasApiError as exc:
                if exc.error_code != 'CLUSTER_NOT_FOUND':
                    raise
                LOGGER.warning('Cluster was not found, will create one')
            except AssertionError as exc:
                LOGGER.warning(
                    'Configuration did not match: %s. Recreating the cluster' %
                    exc)

        LOGGER.info("Initializing cluster {!r}".format(self.cluster_name))

        cluster_config = self.spec.initialConfiguration.\
            clusterConfiguration.copy()
        cluster_config["name"] = self.cluster_name
        try:
            self.client.groups[self.project.id].clusters.post(**cluster_config)
        except AtlasApiError as exc:
            if exc.error_code == 'DUPLICATE_CLUSTER_NAME':
                # Cluster already exists. Simply re-configure it.
                # Cannot send cluster name when updating existing cluster.
                cluster_config.pop("name")
                self.client.groups[self.project.id].\
                    clusters[self.cluster_name].patch(**cluster_config)
            else:
                raise

        # Apply processArgs if provided.
        process_args = self.spec.initialConfiguration.processArgs
        if process_args:
            self.client.groups[self.project.id].\
                clusters[self.cluster_name].processArgs.patch(**process_args)

    def run(self, persist_cluster=False, startup_time=1):
        LOGGER.info("Running test {!r} on cluster {!r}".format(
            self.id, self.cluster_name))

        # Step-1: sanity-check the cluster configuration.
        self.verify_cluster_configuration_matches(
            self.spec.initialConfiguration)

        # Start the test timer.
        timer = Timer()
        timer.start()

        # Step-2: run driver workload.
        self.workload_runner.spawn(
            workload_executor=self.config.workload_executor,
            connection_string=self.get_connection_string(),
            driver_workload=self.spec.driverWorkload,
            startup_time=startup_time)

        for operation in self.spec.operations:
            if len(operation) != 1:
                raise ValueError("Operation must have exactly one key: %s" %
                                 operation)

            op_name, op_spec = list(operation.items())[0]

            if op_name == 'setClusterConfiguration':
                # Step-3: begin maintenance routine.
                final_config = op_spec
                cluster_config = final_config.clusterConfiguration
                process_args = final_config.processArgs

                if not cluster_config and not process_args:
                    raise RuntimeError("invalid maintenance plan")

                if cluster_config:
                    LOGGER.info("Pushing cluster configuration update")
                    self.cluster_url.patch(**cluster_config)

                if process_args:
                    LOGGER.info("Pushing process arguments update")
                    self.cluster_url.processArgs.patch(**process_args)

                # Step-4: wait until maintenance completes (cluster is IDLE).
                self.wait_for_idle()
                self.verify_cluster_configuration_matches(final_config)
                LOGGER.info("Cluster maintenance complete")

            elif op_name == 'testFailover':
                timer = Timer()
                timer.start()
                timeout = 90

                # DRIVERS-1585: failover may fail because the cluster is not
                # yet ready. Retry the failover until the timeout expires if
                # the CLUSTER_RESTART_INVALID error is returned from the call.
                while True:
                    try:
                        self.cluster_url['restartPrimaries'].post()
                    except AtlasApiError as exc:
                        if exc.error_code != 'CLUSTER_RESTART_INVALID':
                            raise
                    else:
                        break

                    if timer.elapsed > timeout:
                        raise PollingTimeoutError(
                            "Could not test failover as cluster wasn't ready")
                    else:
                        sleep(5)

                self.wait_for_idle()

            elif op_name == 'sleep':
                _time.sleep(op_spec)

            elif op_name == 'waitForIdle':
                self.wait_for_idle()

            elif op_name == 'restartVms':
                self.admin_client.nds.groups[self.project.id].clusters[
                    self.cluster_name].reboot.post(api_version='private')

                self.wait_for_idle()

            elif op_name == 'assertPrimaryRegion':
                region = op_spec['region']

                cluster_config = self.cluster_url.get().data
                timer = Timer()
                timer.start()
                timeout = op_spec.get('timeout', 90)

                with mongo_client(self.get_connection_string()) as mc:
                    while True:
                        rsc = mc.admin.command('replSetGetConfig')
                        member = [
                            m for m in rsc['config']['members']
                            if m['horizons']['PUBLIC'] == '%s:%s' % mc.primary
                        ][0]
                        member_region = member['tags']['region']

                        if region == member_region:
                            break

                        if timer.elapsed > timeout:
                            raise Exception(
                                "Primary in cluster not in expected region '%s' (actual region '%s')"
                                % (region, member_region))
                        else:
                            sleep(5)

            else:
                raise Exception('Unrecognized operation %s' % op_name)

        # Wait 10 seconds to ensure that the driver is not experiencing any
        # errors after the maintenance has concluded.
        sleep(10)

        # Step-5: interrupt driver workload and capture streams
        stats = self.workload_runner.stop()

        # Stop the timer
        timer.stop()

        # Step-6: compute xunit entry.
        junit_test = junitparser.TestCase(self.id)
        junit_test.time = timer.elapsed

        if (stats['numErrors'] != 0 or stats['numFailures'] != 0
                or stats['numSuccesses'] == 0):
            LOGGER.info("FAILED: {!r}".format(self.id))
            self.failed = True
            # Write xunit logs for failed tests.
            junit_test.result = junitparser.Failure(str(stats))
        else:
            LOGGER.info("SUCCEEDED: {!r}".format(self.id))
            # Directly log output of successful tests as xunit output
            # is only visible for failed tests.

        LOGGER.info("Workload Statistics: {}".format(stats))

        get_logs(admin_client=self.admin_client,
                 project=self.project,
                 cluster_name=self.cluster_name)

        # Step 7: download logs asynchronously and delete cluster.
        # TODO: https://github.com/mongodb-labs/drivers-atlas-testing/issues/4
        if not persist_cluster:
            self.cluster_url.delete()
            LOGGER.info("Cluster {!r} marked for deletion.".format(
                self.cluster_name))

        return junit_test

    def wait_for_idle(self):
        # Small delay to account for Atlas potentially not updating the
        # cluster state synchronously after every maintenance operation
        # (https://jira.mongodb.org/browse/PRODTRIAGE-1232).
        # VM restarts in sharded clusters require a much longer wait
        # (30+ seconds in some circumstances); scenarios that perform
        # VM restarts in sharded clusters should use explicit sleep operations
        # after the restarts until this is fixed.
        LOGGER.info("Sleeping before polling cluster %s for idle state" %
                    self.cluster_name)
        sleep(5)
        LOGGER.info("Waiting for cluster %s to become idle" %
                    self.cluster_name)
        timer = Timer()
        timer.start()
        ok = False
        timeout = self.config.polling_timeout
        wanted_state = 'idle'
        while timer.elapsed < timeout:
            cluster_info = self.cluster_url.get().data
            actual_state = cluster_info.stateName.lower()
            if actual_state == wanted_state:
                ok = True
                break
            LOGGER.info(
                "Cluster %s: current state: %s; wanted state: %s; waited for %.1f sec"
                %
                (self.cluster_name, actual_state, wanted_state, timer.elapsed))
            sleep(1.0 / self.config.polling_frequency)
        if not ok:
            raise PollingTimeoutError("Polling timed out after %s seconds" %
                                      timeout)
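
For context, the run() method in this example dispatches on a list of single-key operation mappings (setClusterConfiguration, testFailover, sleep, waitForIdle, restartVms, assertPrimaryRegion). The fragment below only illustrates the shape that loop expects, inferred from how op_spec is accessed; in the project itself the specification appears to be wrapped in an attribute-access object rather than plain dicts, and the concrete values are hypothetical.

# Hypothetical operations list matching the dispatch in run(); each entry
# must contain exactly one key, as enforced at the top of the loop.
operations = [
    {'setClusterConfiguration': {      # read as clusterConfiguration / processArgs
        'clusterConfiguration': {'providerSettings': {'instanceSizeName': 'M20'}},
        'processArgs': {}}},
    {'waitForIdle': {}},               # op_spec is ignored
    {'testFailover': {}},              # retried on CLUSTER_RESTART_INVALID
    {'sleep': 10},                     # op_spec is passed to time.sleep()
    {'assertPrimaryRegion': {'region': 'US_EAST_1', 'timeout': 90}},
]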