def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4',9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
        self.assertEqual(len(checkError), 1)
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, ('127.0.0.4', 9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
        self.assertEqual(len(checkError), 1)
    def replace_active_node_test(self):

        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        debug("Inserting Data...")
        if cluster.version() < "2.1":
            node1.stress(
                ['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
        else:
            node1.stress(
                ['write', 'n=10000', '-schema', 'replication(factor=3)'])
        cursor = self.patient_cql_connection(node1)
        stress_table = ('keyspace1.standard1' if self.cluster.version() >= '2.1'
                        else '"Keyspace1"."Standard1"')
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = cursor.execute(query)

        # replace active node 3 with node 4
        debug("Starting node 4 to replace active node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '0', None,
                     ('127.0.0.4', 9042))
        cluster.add(node4, False)

        with self.assertRaises(NodeError):
            try:
                node4.start(replace_address='127.0.0.3',
                            wait_for_binary_proto=True)
            except (NodeError, TimeoutError):
                raise NodeError("Node could not start.")

        checkError = node4.grep_log(
            "java.lang.UnsupportedOperationException: Cannot replace a live node..."
        )
        self.assertEqual(len(checkError), 1)
class BaseReplaceAddressTest(Tester):

    @pytest.fixture(autouse=True)
    def fixture_add_additional_log_patterns(self, fixture_dtest_setup):
        fixture_dtest_setup.ignore_log_patterns = (
            # This one occurs when trying to send the migration to a
            # node that hasn't started yet, and when it does, it gets
            # replayed and everything is fine.
            r'Can\'t send migration request: node.*is down',
            r'Migration task failed to complete',  # 10978
            # ignore streaming error during bootstrap
            r'Streaming error occurred',
            r'failed stream session',
            r'Failed to properly handshake with peer'
        )
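
    # _do_replace() below checks `self.replacement_node is None` before creating
    # the replacement node, but nothing in this excerpt initialises that attribute.
    # A class-level default (added here as an assumption for readability; the real
    # suite may initialise it elsewhere) keeps the helper usable as written:
    replacement_node = None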

    def _setup(self, n=3, opts=None, enable_byteman=False, mixed_versions=False):
        logger.debug("Starting cluster with {} nodes.".format(n))
        self.cluster.populate(n)
        if opts is not None:
            logger.debug("Setting cluster options: {}".format(opts))
            self.cluster.set_configuration_options(opts)

        self.cluster.set_batch_commitlog(enabled=True)
        self.query_node = self.cluster.nodelist()[0]
        self.replaced_node = self.cluster.nodelist()[-1]

        self.cluster.seeds.remove(self.replaced_node)

        if enable_byteman:
            # set up byteman
            self.query_node.byteman_port = '8100'
            self.query_node.import_config_files()

        if mixed_versions:
            logger.debug("Starting nodes on version 2.2.4")
            self.cluster.set_install_dir(version="2.2.4")

        self.cluster.start()

        if self.cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(self.query_node)
            # Change system_auth keyspace replication factor to 2, otherwise replace will fail
            session.execute("""ALTER KEYSPACE system_auth
                            WITH replication = {'class':'SimpleStrategy',
                            'replication_factor':2};""")

    def _do_replace(self, same_address=False, jvm_option='replace_address',
                    wait_other_notice=False, wait_for_binary_proto=True,
                    replace_address=None, opts=None, data_center=None,
                    extra_jvm_args=None):
        if replace_address is None:
            replace_address = self.replaced_node.address()

        # only create node if it's not yet created
        if self.replacement_node is None:
            replacement_address = '127.0.0.4'
            if same_address:
                replacement_address = self.replaced_node.address()
                self.cluster.remove(self.replaced_node)

            logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option, replace_address))
            self.replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                         thrift_interface=None, storage_interface=(replacement_address, 7000),
                                         jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
            if opts is not None:
                logger.debug("Setting options on replacement node: {}".format(opts))
                self.replacement_node.set_configuration_options(opts)
            self.cluster.add(self.replacement_node, False, data_center=data_center)

        if extra_jvm_args is None:
            extra_jvm_args = []
        extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replace_address),
                               "-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        self.replacement_node.start(jvm_args=extra_jvm_args,
                                    wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        if self.cluster.cassandra_version() >= '2.2.8' and same_address:
            self.replacement_node.watch_log_for("Writes will not be forwarded to this node during replacement",
                                                timeout=60)

    def _stop_node_to_replace(self, gently=False, table='keyspace1.standard1', cl=ConsistencyLevel.THREE):
        if self.replaced_node.is_running():
            logger.debug("Stopping {}".format(self.replaced_node.name))
            self.replaced_node.stop(gently=gently, wait_other_notice=True)

        logger.debug("Testing node stoppage (query should fail).")
        with pytest.raises((Unavailable, ReadTimeout)):
            session = self.patient_cql_connection(self.query_node)
            query = SimpleStatement('select * from {}'.format(table), consistency_level=cl)
            session.execute(query)

    def _insert_data(self, n='1k', rf=3, whitelist=False):
        logger.debug("Inserting {} entries with rf={} with stress...".format(n, rf))
        self.query_node.stress(['write', 'n={}'.format(n), 'no-warmup', '-schema', 'replication(factor={})'.format(rf),
                                '-rate', 'threads=10'],
                               whitelist=whitelist)
        self.cluster.flush()
        time.sleep(20)

    def _fetch_initial_data(self, table='keyspace1.standard1', cl=ConsistencyLevel.THREE, limit=10000):
        logger.debug("Fetching initial data from {} on {} with CL={} and LIMIT={}".format(table, self.query_node.name, cl, limit))
        session = self.patient_cql_connection(self.query_node)
        query = SimpleStatement('select * from {} LIMIT {}'.format(table, limit), consistency_level=cl)
        return rows_to_list(session.execute(query, timeout=20))

    def _verify_data(self, initial_data, table='keyspace1.standard1', cl=ConsistencyLevel.ONE, limit=10000,
                     restart_nodes=False):
        assert len(initial_data) > 0, "Initial data must not be empty"

        # query should work again
        logger.debug("Stopping old nodes")
        for node in self.cluster.nodelist():
            if node.is_running() and node != self.replacement_node:
                logger.debug("Stopping {}".format(node.name))
                node.stop(gently=False, wait_other_notice=True)

        logger.debug("Verifying {} on {} with CL={} and LIMIT={}".format(table, self.replacement_node.address(), cl, limit))
        session = self.patient_exclusive_cql_connection(self.replacement_node)
        assert_all(session, 'select * from {} LIMIT {}'.format(table, limit),
                   expected=initial_data,
                   cl=cl)

    def _verify_replacement(self, node, same_address):
        if not same_address:
            if self.cluster.cassandra_version() >= '2.2.7':
                node.watch_log_for("Node {} is replacing {}"
                                   .format(self.replacement_node.address_for_current_version_slashy(),
                                           self.replaced_node.address_for_current_version_slashy()),
                                   timeout=60, filename='debug.log')
                node.watch_log_for("Node {} will complete replacement of {} for tokens"
                                   .format(self.replacement_node.address_for_current_version_slashy(),
                                           self.replaced_node.address_for_current_version_slashy()), timeout=10)
                node.watch_log_for("removing endpoint {}".format(self.replaced_node.address_for_current_version_slashy()),
                                   timeout=60, filename='debug.log')
            else:
                node.watch_log_for("between /{} and /{}; /{} is the new owner"
                                   .format(self.replaced_node.address(),
                                           self.replacement_node.address(),
                                           self.replacement_node.address()),
                                   timeout=60)

    def _verify_tokens_migrated_successfully(self, previous_log_size=None):
        if not self.dtest_config.use_vnodes:
            num_tokens = 1
        else:
            # a little hacky but grep_log returns the whole line...
            num_tokens = int(self.replacement_node.get_conf_option('num_tokens'))

        logger.debug("Verifying {} tokens migrated successfully".format(num_tokens))
        replmnt_address = self.replacement_node.address_for_current_version_slashy()
        repled_address = self.replaced_node.address_for_current_version_slashy()
        token_ownership_log = r"Token (.*?) changing ownership from {} to {}".format(repled_address,
                                                                                     replmnt_address)
        logs = self.replacement_node.grep_log(token_ownership_log)

        if previous_log_size is not None:
            assert len(logs) == previous_log_size

        moved_tokens = set([l[1].group(1) for l in logs])
        logger.debug("number of moved tokens: {}".format(len(moved_tokens)))
        assert len(moved_tokens) == num_tokens

        return len(logs)
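
    # Illustrative use of the helper above (the calling test code is not part of
    # this excerpt): a test can keep the returned log count and re-check it after
    # restarting the replacement node, since a restart must not trigger a second
    # round of token ownership changes.
    #
    #     log_size = self._verify_tokens_migrated_successfully()
    #     self.replacement_node.stop(gently=False)
    #     self.replacement_node.start(wait_for_binary_proto=True)
    #     self._verify_tokens_migrated_successfully(previous_log_size=log_size)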

    def _test_insert_data_during_replace(self, same_address, mixed_versions=False):
        """
        @jira_ticket CASSANDRA-8523
        """
        default_install_dir = self.cluster.get_install_dir()
        self._setup(opts={'hinted_handoff_enabled': False}, mixed_versions=mixed_versions)

        self._insert_data(n='1k')
        initial_data = self._fetch_initial_data()
        self._stop_node_to_replace()

        if mixed_versions:
            logger.debug("Upgrading all except {} to current version".format(self.query_node.address()))
            self.cluster.set_install_dir(install_dir=default_install_dir)
            for node in self.cluster.nodelist():
                if node.is_running() and node != self.query_node:
                    logger.debug("Upgrading {} to current version".format(node.address()))
                    node.stop(gently=True, wait_other_notice=True)
                    node.start(wait_for_binary_proto=True)

        # start node in current version on write survey mode
        self._do_replace(same_address=same_address, extra_jvm_args=["-Dcassandra.write_survey=true"])

        # Insert additional keys on query node
        self._insert_data(n='2k', whitelist=True)

        # If not same address or mixed versions, query node should forward writes to replacement node
        # so we update initial data to reflect additional keys inserted during replace
        if not same_address and not mixed_versions:
            initial_data = self._fetch_initial_data(cl=ConsistencyLevel.TWO)

        logger.debug("Joining replaced node")
        self.replacement_node.nodetool("join")

        if not same_address:
            for node in self.cluster.nodelist():
                # if mixed version, query node is not upgraded so it will not print replacement log
                if node.is_running() and (not mixed_versions or node != self.query_node):
                    self._verify_replacement(node, same_address)

        self._verify_data(initial_data)
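
    # A concrete test is typically a thin wrapper around the helper above; the
    # method names here are illustrative, not taken from the original suite.
    def test_insert_data_during_replace_different_address(self):
        self._test_insert_data_during_replace(same_address=False)

    def test_insert_data_during_replace_same_address(self):
        self._test_insert_data_during_replace(same_address=True)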
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky but grep_log returns the whole line...
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")

        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)
    def replace_first_boot_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky but grep_log returns the whole line...
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=False)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                session.execute(query, timeout=30)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)

        # restart node4 (if this errors, the num_tokens setting may need to be adjusted)
        node4.stop(gently=False)
        node4.start(wait_for_binary_proto=True, wait_other_notice=False)

        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        # we redo this check because restarting the node should not move tokens again, i.e. the count should be unchanged
        debug("Verifying tokens migrated successfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            num_tokens = 1
        else:
            # a little hacky but grep_log returns the whole line...
            num_tokens = int(node3.get_conf_option('num_tokens'))

        debug("testing with num_tokens: {}".format(num_tokens))

        debug("Inserting Data...")
        node1.stress([
            'write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'
        ])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initial_data = rows_to_list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement(
                    'select * from %s LIMIT 1' % stress_table,
                    consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        debug("Verifying tokens migrated sucessfully")
        moved_tokens = node4.grep_log(
            "Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug("number of moved tokens: {}".format(len(moved_tokens)))
        self.assertEqual(len(moved_tokens), num_tokens)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        collision_log = node1.grep_log(
            "between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(collision_log)
        self.assertEqual(len(collision_log), 1)
        node3.stop(gently=False)

        # query should work again
        debug("Stopping old nodes")
        node1.stop(gently=False, wait_other_notice=True)
        node2.stop(gently=False, wait_other_notice=True)

        debug("Verifying data on new node.")
        session = self.patient_exclusive_cql_connection(node4)
        assert_all(session,
                   'SELECT * from {} LIMIT 1'.format(stress_table),
                   expected=initial_data,
                   cl=ConsistencyLevel.ONE)
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            num_tokens = 1
        else:
            # a little hacky but grep_log returns the whole line...
            num_tokens = int(node3.get_conf_option('num_tokens'))

        debug("testing with num_tokens: {}".format(num_tokens))

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initial_data = rows_to_list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster=cluster, auto_bootstrap=True, thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000), jmx_port='7400', remote_debug_port='0',
                     initial_token=None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        debug("Verifying tokens migrated sucessfully")
        moved_tokens = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug("number of moved tokens: {}".format(len(moved_tokens)))
        self.assertEqual(len(moved_tokens), num_tokens)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        collision_log = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(collision_log)
        self.assertEqual(len(collision_log), 1)
        node3.stop(gently=False)

        # query should work again
        debug("Stopping old nodes")
        node1.stop(gently=False, wait_other_notice=True)
        node2.stop(gently=False, wait_other_notice=True)

        debug("Verifying data on new node.")
        session = self.patient_exclusive_cql_connection(node4)
        assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
                   expected=initial_data,
                   cl=ConsistencyLevel.ONE)
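
    # Hedged sketch of how the helper above is usually driven: one test per
    # shutdown mode, so both a graceful stop and a hard kill of the replaced
    # node are covered. The test names are illustrative.
    def replace_stopped_node_test(self):
        self._replace_node_test(gently=True)

    def replace_shutdown_node_test(self):
        self._replace_node_test(gently=False)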
class BaseReplaceAddressTest(Tester):

    @pytest.fixture(autouse=True)
    def fixture_add_additional_log_patterns(self, fixture_dtest_setup):
        fixture_dtest_setup.ignore_log_patterns = (
            # This one occurs when trying to send the migration to a
            # node that hasn't started yet, and when it does, it gets
            # replayed and everything is fine.
            r'Can\'t send migration request: node.*is down',
            r'Migration task failed to complete',  # 10978
            # ignore streaming error during bootstrap
            r'Streaming error occurred',
            r'failed stream session',
            r'Failed to properly handshake with peer'
        )

    def _setup(self, n=3, opts=None, enable_byteman=False, mixed_versions=False):
        logger.debug("Starting cluster with {} nodes.".format(n))
        self.cluster.populate(n)
        if opts is not None:
            logger.debug("Setting cluster options: {}".format(opts))
            self.cluster.set_configuration_options(opts)

        self.cluster.set_batch_commitlog(enabled=True)
        self.query_node = self.cluster.nodelist()[0]
        self.replaced_node = self.cluster.nodelist()[-1]

        self.cluster.seeds.remove(self.replaced_node)
        NUM_TOKENS = os.environ.get('NUM_TOKENS', '256')
        if not self.dtest_config.use_vnodes:
            self.cluster.set_configuration_options(values={'initial_token': None, 'num_tokens': 1})
        else:
            self.cluster.set_configuration_options(values={'initial_token': None, 'num_tokens': NUM_TOKENS})

        if enable_byteman:
            # set up byteman
            self.query_node.byteman_port = '8100'
            self.query_node.import_config_files()

        if mixed_versions:
            logger.debug("Starting nodes on version 2.2.4")
            self.cluster.set_install_dir(version="2.2.4")

        self.cluster.start()

        if self.cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(self.query_node)
            # Change system_auth keyspace replication factor to 2, otherwise replace will fail
            session.execute("""ALTER KEYSPACE system_auth
                            WITH replication = {'class':'SimpleStrategy',
                            'replication_factor':2};""")

    def _do_replace(self, same_address=False, jvm_option='replace_address',
                    wait_other_notice=False, wait_for_binary_proto=True,
                    replace_address=None, opts=None, data_center=None,
                    extra_jvm_args=None):
        if replace_address is None:
            replace_address = self.replaced_node.address()

        # only create node if it's not yet created
        if self.replacement_node is None:
            replacement_address = '127.0.0.4'
            if same_address:
                replacement_address = self.replaced_node.address()
                self.cluster.remove(self.replaced_node)

            logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option, replace_address))
            self.replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                         thrift_interface=None, storage_interface=(replacement_address, 7000),
                                         jmx_port='7400', remote_debug_port='0', initial_token=None, binary_interface=(replacement_address, 9042))
            if opts is not None:
                logger.debug("Setting options on replacement node: {}".format(opts))
                self.replacement_node.set_configuration_options(opts)
            self.cluster.add(self.replacement_node, False, data_center=data_center)

        if extra_jvm_args is None:
            extra_jvm_args = []
        extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replace_address),
                               "-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        self.replacement_node.start(jvm_args=extra_jvm_args,
                                    wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        if self.cluster.cassandra_version() >= '2.2.8' and same_address:
            self.replacement_node.watch_log_for("Writes will not be forwarded to this node during replacement",
                                                timeout=60)

    def _stop_node_to_replace(self, gently=False, table='keyspace1.standard1', cl=ConsistencyLevel.THREE):
        if self.replaced_node.is_running():
            logger.debug("Stopping {}".format(self.replaced_node.name))
            self.replaced_node.stop(gently=gently, wait_other_notice=True)

        logger.debug("Testing node stoppage (query should fail).")
        with pytest.raises((Unavailable, ReadTimeout)):
            session = self.patient_cql_connection(self.query_node)
            query = SimpleStatement('select * from {}'.format(table), consistency_level=cl)
            session.execute(query)

    def _insert_data(self, n='1k', rf=3, whitelist=False):
        logger.debug("Inserting {} entries with rf={} with stress...".format(n, rf))
        self.query_node.stress(['write', 'n={}'.format(n), 'no-warmup', '-schema', 'replication(factor={})'.format(rf),
                                '-rate', 'threads=10'],
                               whitelist=whitelist)
        self.cluster.flush()
        time.sleep(20)

    def _fetch_initial_data(self, table='keyspace1.standard1', cl=ConsistencyLevel.THREE, limit=10000):
        logger.debug("Fetching initial data from {} on {} with CL={} and LIMIT={}".format(table, self.query_node.name, cl, limit))
        session = self.patient_cql_connection(self.query_node)
        query = SimpleStatement('select * from {} LIMIT {}'.format(table, limit), consistency_level=cl)
        return rows_to_list(session.execute(query, timeout=20))

    def _verify_data(self, initial_data, table='keyspace1.standard1', cl=ConsistencyLevel.ONE, limit=10000,
                     restart_nodes=False):
        assert len(initial_data) > 0, "Initial data must not be empty"

        # query should work again
        logger.debug("Stopping old nodes")
        for node in self.cluster.nodelist():
            if node.is_running() and node != self.replacement_node:
                logger.debug("Stopping {}".format(node.name))
                node.stop(gently=False, wait_other_notice=True)

        logger.debug("Verifying {} on {} with CL={} and LIMIT={}".format(table, self.replacement_node.address(), cl, limit))
        session = self.patient_exclusive_cql_connection(self.replacement_node)
        assert_all(session, 'select * from {} LIMIT {}'.format(table, limit),
                   expected=initial_data,
                   cl=cl)

    def _verify_replacement(self, node, same_address):
        if not same_address:
            if self.cluster.cassandra_version() >= '2.2.7':
                address_prefix = '' if self.cluster.cassandra_version() >= '4.0' else '/'
                node.watch_log_for("Node {}{} is replacing {}{}"
                                   .format(address_prefix, self.replacement_node.address_for_current_version(),
                                           address_prefix, self.replaced_node.address_for_current_version()),
                                   timeout=60, filename='debug.log')
                node.watch_log_for("Node {}{} will complete replacement of {}{} for tokens"
                                   .format(address_prefix, self.replacement_node.address_for_current_version(),
                                           address_prefix, self.replaced_node.address_for_current_version()), timeout=10)
                node.watch_log_for("removing endpoint {}{}".format(address_prefix, self.replaced_node.address_for_current_version()),
                                   timeout=60, filename='debug.log')
            else:
                node.watch_log_for("between /{} and /{}; /{} is the new owner"
                                   .format(self.replaced_node.address(),
                                           self.replacement_node.address(),
                                           self.replacement_node.address()),
                                   timeout=60)

    def _verify_tokens_migrated_successfully(self, previous_log_size=None):
        if not self.dtest_config.use_vnodes:
            num_tokens = 1
        else:
            # a little hacky but grep_log returns the whole line...
            num_tokens = int(self.replacement_node.get_conf_option('num_tokens'))

        logger.debug("Verifying {} tokens migrated sucessfully".format(num_tokens))
        logs = self.replacement_node.grep_log(r"Token (.*?) changing ownership from /{} to /{}"
                                              .format(self.replaced_node.address(),
                                                      self.replacement_node.address()))
        if previous_log_size is not None:
            assert len(logs) == previous_log_size

        moved_tokens = set([l[1].group(1) for l in logs])
        logger.debug("number of moved tokens: {}".format(len(moved_tokens)))
        assert len(moved_tokens) == num_tokens

        return len(logs)

    def _test_insert_data_during_replace(self, same_address, mixed_versions=False):
        """
        @jira_ticket CASSANDRA-8523
        """
        default_install_dir = self.cluster.get_install_dir()
        self._setup(opts={'hinted_handoff_enabled': False}, mixed_versions=mixed_versions)

        self._insert_data(n='1k')
        initial_data = self._fetch_initial_data()
        self._stop_node_to_replace()

        if mixed_versions:
            logger.debug("Upgrading all except {} to current version".format(self.query_node.address()))
            self.cluster.set_install_dir(install_dir=default_install_dir)
            for node in self.cluster.nodelist():
                if node.is_running() and node != self.query_node:
                    logger.debug("Upgrading {} to current version".format(node.address()))
                    node.stop(gently=True, wait_other_notice=True)
                    node.start(wait_other_notice=True, wait_for_binary_proto=True)

        # start node in current version on write survey mode
        self._do_replace(same_address=same_address, extra_jvm_args=["-Dcassandra.write_survey=true"])

        # Insert additional keys on query node
        self._insert_data(n='2k', whitelist=True)

        # If not same address or mixed versions, query node should forward writes to replacement node
        # so we update initial data to reflect additional keys inserted during replace
        if not same_address and not mixed_versions:
            initial_data = self._fetch_initial_data(cl=ConsistencyLevel.TWO)

        logger.debug("Joining replaced node")
        self.replacement_node.nodetool("join")

        if not same_address:
            for node in self.cluster.nodelist():
                # if mixed version, query node is not upgraded so it will not print replacement log
                if node.is_running() and (not mixed_versions or node != self.query_node):
                    self._verify_replacement(node, same_address)

        self._verify_data(initial_data)
    def repair_compaction_fine_test(self):
        """Check that we do not stream data for repairs past the last repair.
        Check cases i) node goes down, data inserted, node comes up run repair
        ii) after case i, do similar with a different node down (since compaction has seperated repaired)
        iii) check that repair works appropriately where a new node is replacing
        """
        cluster = self.cluster
        cluster.set_configuration_options(values={'hinted_handoff_enabled': False,
                                                  'auto_snapshot': False},
                                          batch_commitlog=False)
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        cursor = self.patient_cql_connection(node1)
        self.create_ks(cursor, 'ks', 3)
        self.create_cf(cursor, 'cf', read_repair=0.0,
                       columns={'c1': 'text', 'c2': 'text'})

        debug("insert data into all")

        for x in range(1, 5):
            insert_c1c2(cursor, x, ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        for y in range(5, 10):
            insert_c1c2(cursor, y, ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start()
        node3.repair()

        sstableNode1 = node3.grep_log(
            "reading file from /127.0.0.1, repairedAt = 0")
        sstableNode2 = node3.grep_log(
            "reading file from /127.0.0.2, repairedAt = 0")
        catchBadSSReads = node3.grep_log(
            "reading file from .* repairedAt = ([1-9])")
        self.assertGreaterEqual(len(sstableNode1), 1)
        self.assertGreaterEqual(len(sstableNode2), 1)
        self.assertLess(len(catchBadSSReads), 1)

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        for z in range(10, 15):
            insert_c1c2(cursor, z, ConsistencyLevel.TWO)
        node1.flush()

        debug("start and repair node 2")
        node2.flush()
        node2.start()
        node2.repair()

        fileFromNode1 = node2.grep_log(
            "reading file from /127.0.0.1, repairedAt = 0")
        fileFromNode3 = node2.grep_log(
            "reading file from /127.0.0.3, repairedAt = 0")
        catchBadReads = node2.grep_log(
            "reading file from .* repairedAt = ([1-9])")
        self.assertGreaterEqual(len(fileFromNode1), 1)
        self.assertGreaterEqual(len(fileFromNode3), 1)
        self.assertLess(len(catchBadReads), 1)

        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '0', None,
                     ('127.0.0.4', 9042))
        # register the new node with the cluster before starting it, matching
        # the pattern used for node5 below
        cluster.add(node4, False)
        node4.start()

        debug("replace node and check repair-like process")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160),
                     ('127.0.0.5', 7000), '7500', '0', None,
                     ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        fileRead = node5.grep_log("reading file from .*, repairedAt = 0")
        self.assertGreaterEqual(len(fileRead), 1)

        # additionally we should see 14 distinct keys in the data: 4 from range(1, 5),
        # 5 from range(5, 10) and 5 from range(10, 15) (this prints to the command line)
        debug(node2.run_sstable2json())

        rows = cursor.execute("SELECT COUNT(*) FROM ks.cf LIMIT 100")

        results = rows[0]
        debug(results)

        self.assertEqual(results[0], 14)