def replace_first_boot_test(self):
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky: we later compare this count against the number of matching grep_log lines
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])
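        # cassandra-stress writes 10K rows into keyspace1.standard1 with RF=3,
        # so the CL=THREE read below needs all three replicas to be available.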

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=False)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                session.execute(query, timeout=30)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")
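
        # NodeUnavailable is assumed to be a small helper exception defined at
        # module level elsewhere in this dtest suite; a minimal sketch of what
        # such a definition might look like (hypothetical, not the suite's
        # actual code):
        #
        #     class NodeUnavailable(Exception):
        #         pass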

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)

        # restart node4 (if this errors, num_tokens may need to be changed)
        node4.stop(gently=False)
        node4.start(wait_for_binary_proto=True, wait_other_notice=False)

        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        # repeat the check: restarting the node should not move tokens again, so the count should be unchanged
        debug("Verifying tokens migrated successfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            numNodes = 1
        else:
            # a little hacky: we later compare this count against the number of matching grep_log lines
            numNodes = int(node3.get_conf_option('num_tokens'))

        debug(numNodes)

        debug("Inserting Data...")
        node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")

        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '0', None, binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        # query should work again
        debug("Verifying querying works again.")
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)

        debug("Verifying tokens migrated sucessfully")
        movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug(movedTokensList[0])
        self.assertEqual(len(movedTokensList), numNodes)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(checkCollision)
        self.assertEqual(len(checkCollision), 1)
    def test_upgrade_legacy_table(self):
        """
        Upgrade with bringing up the legacy tables after the newer nodes (without legacy tables)
        were started.

        @jira_ticket CASSANDRA-12813
        """

        cluster = self.cluster

        # Forcing cluster version on purpose
        cluster.set_install_dir(version="2.1.16")
        cluster.populate(3).start()

        node1, node2, node3 = cluster.nodelist()

        # Wait for default user to get created on one of the nodes
        time.sleep(15)

        # Upgrade to current version
        for node in [node1, node2, node3]:
            node.drain()
            node.watch_log_for("DRAINED")
            node.stop(gently=True)
            self.set_node_to_current_version(node)

        cluster.start()

        # Make sure the system_auth table will get replicated to the node that we're going to replace
        session = self.patient_cql_connection(node1, user='******', password='******')
        session.execute("ALTER KEYSPACE system_auth WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 };")
        cluster.repair()

        cluster.stop()

        # Replace the node
        cluster.seeds.remove(node1)
        cluster.remove(node1)

        replacement_address = node1.address()
        replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                thrift_interface=(replacement_address, 9160),
                                storage_interface=(replacement_address, 7000),
                                jmx_port='7400', remote_debug_port='0', initial_token=None,
                                binary_interface=(replacement_address, 9042))
        self.set_node_to_current_version(replacement_node)

        cluster.add(replacement_node, True)
        replacement_node.start(wait_for_binary_proto=True)

        node2.start(wait_for_binary_proto=True)
        node3.start(wait_for_binary_proto=True)

        replacement_node.watch_log_for('Initializing system_auth.credentials')
        replacement_node.watch_log_for('Initializing system_auth.permissions')
        replacement_node.watch_log_for('Initializing system_auth.users')

        cluster.repair()
        replacement_node.watch_log_for('Repair command')

        # Should succeed. Will throw an NPE on pre-12813 code.
        self.patient_cql_connection(replacement_node, user='******', password='******')
    def create_node(self, name, auto_bootstrap, thrift_interface, storage_interface, jmx_port, remote_debug_port, initial_token, save=True, binary_interface=None, byteman_port='0'):
        return Node(name, self, auto_bootstrap, thrift_interface, storage_interface, jmx_port, remote_debug_port, initial_token, save, binary_interface, byteman_port)
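
    # Usage sketch (assuming this factory lives on a ccm-style Cluster subclass):
    # it simply forwards its arguments to the Node constructor, e.g.
    #
    #     node4 = cluster.create_node('node4', True, ('127.0.0.4', 9160),
    #                                 ('127.0.0.4', 7000), '7400', '0', None,
    #                                 binary_interface=('127.0.0.4', 9042))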
    def multiple_repair_test(self):
        """
        * Launch a three node cluster
        * Create a keyspace with RF 3 and a table
        * Insert 49 rows
        * Stop node3
        * Insert 50 more rows
        * Restart node3
        * Issue an incremental repair on node3
        * Stop node2
        * Insert a final 50 rows
        * Restart node2
        * Issue an incremental repair on node2
        * Replace node3 with a new node
        * Verify data integrity
        # TODO: Several more verifications of data need to be interspersed throughout the test. The final assertion is insufficient.
        @jira_ticket CASSANDRA-10644
        """
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 3)
        self.create_cf(session,
                       'cf',
                       read_repair=0.0,
                       columns={
                           'c1': 'text',
                           'c2': 'text'
                       })

        debug("insert data")

        insert_c1c2(session,
                    keys=range(1, 50),
                    consistency=ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        insert_c1c2(session,
                    keys=range(50, 100),
                    consistency=ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node3.repair()
        else:
            node3.nodetool("repair -par -inc")

        # wait stream handlers to be closed on windows
        # after session is finished (See CASSANDRA-10644)
        if is_win:
            time.sleep(2)

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        insert_c1c2(session,
                    keys=range(100, 150),
                    consistency=ConsistencyLevel.TWO)
        node1.flush()
        node3.flush()

        debug("start and repair node 2")
        node2.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node2.repair()
        else:
            node2.nodetool("repair -par -inc")

        debug("replace node and check data integrity")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160),
                     ('127.0.0.5', 7000), '7500', '0', None,
                     ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
    def populate(self,
                 nodes,
                 debug=False,
                 tokens=None,
                 use_vnodes=False,
                 ipprefix='127.0.0.',
                 iplist=None,
                 dclist=None):
        node_count = nodes
        dcs = []

        if dclist is not None:
            self.set_configuration_options(
                values={
                    'endpoint_snitch':
                    'org.apache.cassandra.locator.PropertyFileSnitch'
                })
            dcs = filter(None, dclist.split(","))
        else:
            if isinstance(nodes, list):
                self.set_configuration_options(
                    values={
                        'endpoint_snitch':
                        'org.apache.cassandra.locator.PropertyFileSnitch'
                    })
                node_count = 0
                i = 0
                for c in nodes:
                    i = i + 1
                    node_count = node_count + c
                    for x in xrange(0, c):
                        dcs.append('dc%d' % i)

        if node_count < 1:
            raise common.ArgumentError('invalid node count %s' % nodes)

        for i in xrange(1, node_count + 1):
            if 'node%s' % i in self.nodes:
                raise common.ArgumentError(
                    'Cannot create existing node node%s' % i)

        if tokens is None and not use_vnodes:
            tokens = self.balanced_tokens(node_count)

        for i in xrange(1, node_count + 1):
            tk = None
            if tokens is not None and i - 1 < len(tokens):
                tk = tokens[i - 1]
            dc = dcs[i - 1] if i - 1 < len(dcs) else None

            node_ip = '%s%s' % (ipprefix, i)

            if (iplist is not None):
                node_ip = iplist[i - 1]

            binary = None
            if self.version() >= '1.2':
                binary = (node_ip, 9042)
            node = Node('node%s' % i,
                        self,
                        False, (node_ip, 9160), (node_ip, 7000),
                        str(7000 + i * 100),
                        str(2000 + i * 100) if debug else str(0),
                        tk,
                        binary_interface=binary)
            self.add(node, True, dc)
            self.__update_config()
        return self
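
    # Usage sketch, assuming a ccm-style Cluster instance named `cluster`:
    # an int populates a single datacenter, while a list creates one DC per
    # entry (dc1, dc2, ...) with that many nodes each, e.g.
    #
    #     cluster.populate(3).start()           # 3 nodes in a single DC
    #     cluster.populate([2, 1]).start()      # 2 nodes in dc1, 1 node in dc2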
    def _replace_node_test(self, gently):
        """
        Check that the replace address function correctly replaces a node that has failed in a cluster.
        Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
        Check that tokens are migrated and that data is replicated properly.
        """
        debug("Starting cluster with 3 nodes.")
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        if DISABLE_VNODES:
            num_tokens = 1
        else:
            # a little hacky: we later compare this count against the number of matching grep_log lines
            num_tokens = int(node3.get_conf_option('num_tokens'))

        debug("testing with num_tokens: {}".format(num_tokens))

        debug("Inserting Data...")
        node1.stress([
            'write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'
        ])

        session = self.patient_cql_connection(node1)
        session.default_timeout = 45
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initial_data = rows_to_list(session.execute(query))

        # stop node, query should not work with consistency 3
        debug("Stopping node 3.")
        node3.stop(gently=gently, wait_other_notice=True)

        debug("Testing node stoppage (query should fail).")
        with self.assertRaises(NodeUnavailable):
            try:
                query = SimpleStatement(
                    'select * from %s LIMIT 1' % stress_table,
                    consistency_level=ConsistencyLevel.THREE)
                session.execute(query)
            except (Unavailable, ReadTimeout):
                raise NodeUnavailable("Node could not be queried.")

        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

        debug("Verifying tokens migrated sucessfully")
        moved_tokens = node4.grep_log(
            "Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
        debug("number of moved tokens: {}".format(len(moved_tokens)))
        self.assertEqual(len(moved_tokens), num_tokens)

        # check that restarting node 3 doesn't work
        debug("Try to restart node 3 (should fail)")
        node3.start(wait_other_notice=False)
        collision_log = node1.grep_log(
            "between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
        debug(collision_log)
        self.assertEqual(len(collision_log), 1)
        node3.stop(gently=False)

        # query should work again
        debug("Stopping old nodes")
        node1.stop(gently=False, wait_other_notice=True)
        node2.stop(gently=False, wait_other_notice=True)

        debug("Verifying data on new node.")
        session = self.patient_exclusive_cql_connection(node4)
        assert_all(session,
                   'SELECT * from {} LIMIT 1'.format(stress_table),
                   expected=initial_data,
                   cl=ConsistencyLevel.ONE)
    def test_node_cannot_join_as_hibernating_node_without_replace_address(self):
        """
        @jira_ticket CASSANDRA-14559
        Test that a node cannot bootstrap without replace_address if a hibernating node exists with that address
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(2)
        # Setting seed node to first node to make sure replaced node is not in own seed list
        cluster.set_configuration_options({
            'seed_provider': [{'class_name': 'org.apache.cassandra.locator.SimpleSeedProvider',
                               'parameters': [{'seeds': '127.0.0.1'}]
                               }]
        })

        cluster.start()

        node1 = cluster.nodelist()[0]
        node2 = cluster.nodelist()[1]

        replacement_address = node2.address()
        node2.stop()

        jvm_option = 'replace_address'

        logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option,
                                                                                   replacement_address))
        replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                thrift_interface=None, storage_interface=(replacement_address, 7000),
                                jmx_port='7400', remote_debug_port='0', initial_token=None,
                                binary_interface=(replacement_address, 9042))
        cluster.add(replacement_node, False)

        extra_jvm_args = []
        extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replacement_address),
                               "-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        wait_other_notice = False
        wait_for_binary_proto = False

        # Kill the node early in bootstrap to prevent it from reaching 'normal' status.
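        # KillOnReadyToBootstrap is assumed to be a helper Thread defined
        # elsewhere in this suite that watches the replacement node's log and
        # stops it just before bootstrap completes; a rough sketch (the exact
        # log marker is an assumption):
        #
        #     class KillOnReadyToBootstrap(Thread):
        #         def __init__(self, node):
        #             Thread.__init__(self)
        #             self.node = node
        #
        #         def run(self):
        #             self.node.watch_log_for("JOINING: calculation complete, ready to bootstrap")
        #             self.node.stop(gently=False)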
        t = KillOnReadyToBootstrap(replacement_node)

        t.start()

        replacement_node.start(jvm_args=extra_jvm_args,
                               wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        t.join()

        logger.debug("Asserting that original replacement node is not running")
        assert not replacement_node.is_running()

        # Assert node is actually in hibernate for test to be accurate.
        logger.debug("Asserting that node is actually in hibernate status for test accuracy")
        assert 'hibernate' in node1.nodetool("gossipinfo").stdout

        extra_jvm_args = []
        extra_jvm_args.extend(["-Dcassandra.ring_delay_ms=10000",
                               "-Dcassandra.broadcast_interval_ms=10000"])

        logger.debug("Starting blind replacement node {}".format(replacement_address))
        blind_replacement_node = Node('blind_replacement', cluster=self.cluster, auto_bootstrap=True,
                                      thrift_interface=None, storage_interface=(replacement_address, 7000),
                                      jmx_port='7400', remote_debug_port='0', initial_token=None,
                                      binary_interface=(replacement_address, 9042))
        cluster.add(blind_replacement_node, False)
        wait_other_notice = False
        wait_for_binary_proto = False

        blind_replacement_node.start(wait_for_binary_proto=wait_for_binary_proto, wait_other_notice=wait_other_notice)

        # Asserting that the new node has correct log entry
        self.assert_log_had_msg(blind_replacement_node, "A node with the same IP in hibernate status was detected", timeout=60)
        # Wait to give the node a chance to stop once the above assertion holds.
        # Cassandra may not shut down fast enough otherwise, and the assertion below would fail.
        time.sleep(15)
        # Asserting that the new node is not running.
        # This tests the actual expected state as opposed to just checking for the existence of the above error message.
        assert not blind_replacement_node.is_running()
    def resumable_replace_test(self):
        """
        Test resumable bootstrap while replacing node. Feature introduced in
        2.2 with ticket https://issues.apache.org/jira/browse/CASSANDRA-8838

        @jira_ticket https://issues.apache.org/jira/browse/CASSANDRA-8838
        """

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress([
            'write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'
        ])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initial_data = rows_to_list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
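        # InterruptBootstrap is assumed to be a helper Thread defined elsewhere
        # in this suite that waits for streaming to start on the given node and
        # then kills it, so the replacement's bootstrap is left incomplete; a
        # rough sketch (the log marker is an assumption):
        #
        #     class InterruptBootstrap(Thread):
        #         def __init__(self, node):
        #             Thread.__init__(self)
        #             self.node = node
        #
        #         def run(self):
        #             self.node.watch_log_for("Prepare completed")
        #             self.node.stop(gently=False)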
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))
        # keep timeout low so that test won't hang
        node4.set_configuration_options(
            values={'streaming_socket_timeout_in_ms': 1000})
        cluster.add(node4, False)
        try:
            node4.start(
                jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                wait_other_notice=False)
        except NodeError:
            pass  # node doesn't start as expected
        t.join()

        # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
        node1.start()
        node4.nodetool('bootstrap resume')
        # check if we skipped already retrieved ranges
        node4.watch_log_for("already available. Skipping streaming.")
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...")

        # check if 2nd bootstrap succeeded
        assert_bootstrap_state(self, node4, 'COMPLETED')

        # query should work again
        debug("Stopping old nodes")
        node1.stop(gently=False, wait_other_notice=True)
        node2.stop(gently=False, wait_other_notice=True)

        debug("Verifying data on new node.")
        session = self.patient_exclusive_cql_connection(node4)
        assert_all(session,
                   'SELECT * from {} LIMIT 1'.format(stress_table),
                   expected=initial_data,
                   cl=ConsistencyLevel.ONE)
    def replace_with_reset_resume_state_test(self):
        """Test replace with resetting bootstrap progress"""

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress([
            'write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'
        ])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initial_data = rows_to_list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster=cluster,
                     auto_bootstrap=True,
                     thrift_interface=('127.0.0.4', 9160),
                     storage_interface=('127.0.0.4', 7000),
                     jmx_port='7400',
                     remote_debug_port='0',
                     initial_token=None,
                     binary_interface=('127.0.0.4', 9042))

        # keep timeout low so that test won't hang
        node4.set_configuration_options(
            values={'streaming_socket_timeout_in_ms': 1000})
        cluster.add(node4, False)
        try:
            node4.start(
                jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                wait_other_notice=False)
        except NodeError:
            pass  # node doesn't start as expected
        t.join()
        node1.start()

        # restart node4 bootstrap with resetting bootstrap state
        node4.stop()
        mark = node4.mark_log()
        node4.start(jvm_args=[
            "-Dcassandra.replace_address_first_boot=127.0.0.3",
            "-Dcassandra.reset_bootstrap_progress=true"
        ])
        # check if we reset bootstrap state
        node4.watch_log_for("Resetting bootstrap progress to start fresh",
                            from_mark=mark)
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

        # check if 2nd bootstrap succeeded
        assert_bootstrap_state(self, node4, 'COMPLETED')

        # query should work again
        debug("Stopping old nodes")
        node1.stop(gently=False, wait_other_notice=True)
        node2.stop(gently=False, wait_other_notice=True)

        debug("Verifying data on new node.")
        session = self.patient_exclusive_cql_connection(node4)
        assert_all(session,
                   'SELECT * from {} LIMIT 1'.format(stress_table),
                   expected=initial_data,
                   cl=ConsistencyLevel.ONE)
def new_node(cluster, bootstrap=True, token=None):
    i = len(cluster.nodes) + 1
    node = Node('node%s' % i, cluster, bootstrap, ('127.0.0.%s' % i, 9160),
                ('127.0.0.%s' % i, 7000), str(7000 + i * 100), '0', token)
    cluster.add(node, not bootstrap)
    return node
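
# Usage sketch: helpers like this are typically called from the tests above as
# something like (assuming a started 3-node ccm cluster, so the new node gets
# address 127.0.0.4):
#
#     node4 = new_node(cluster)
#     node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)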
    def repair_compaction_fine_test(self):
        """Check that we do not stream data for repairs past the last repair.
        Check cases i) node goes down, data inserted, node comes up run repair
        ii) after case i, do similar with a different node down (since compaction has seperated repaired)
        iii) check that repair works appropriately where a new node is replacing
        """
        cluster = self.cluster
        cluster.set_configuration_options(values={
            'hinted_handoff_enabled': False,
            'auto_snapshot': False
        },
                                          batch_commitlog=False)
        cluster.populate(3).start()
        [node1, node2, node3] = cluster.nodelist()

        cursor = self.patient_cql_connection(node1)
        self.create_ks(cursor, 'ks', 3)
        self.create_cf(cursor,
                       'cf',
                       read_repair=0.0,
                       columns={
                           'c1': 'text',
                           'c2': 'text'
                       })

        debug("insert data into all")

        for x in range(1, 5):
            insert_c1c2(cursor, x, ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        for y in range(5, 10):
            insert_c1c2(cursor, y, ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start()
        node3.repair()

        sstableNode1 = node3.grep_log(
            "reading file from /127.0.0.1, repairedAt = 0")
        sstableNode2 = node3.grep_log(
            "reading file from /127.0.0.2, repairedAt = 0")
        catchBadSSReads = node3.grep_log(
            "reading file from .* repairedAt = ([1-9])")
        self.assertGreaterEqual(len(sstableNode1), 1)
        self.assertGreaterEqual(len(sstableNode2), 1)
        self.assertLess(len(catchBadSSReads), 1)

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        for z in range(10, 15):
            insert_c1c2(cursor, z, ConsistencyLevel.TWO)
        node1.flush()

        debug("start and repair node 2")
        node2.flush()
        node2.start()
        node2.repair()

        fileFromNode1 = node2.grep_log(
            "reading file from /127.0.0.1, repairedAt = 0")
        fileFromNode3 = node2.grep_log(
            "reading file from /127.0.0.3, repairedAt = 0")
        catchBadReads = node2.grep_log(
            "reading file from .* repairedAt = ([1-9])")
        self.assertGreaterEqual(len(fileFromNode1), 1)
        self.assertGreaterEqual(len(fileFromNode3), 1)
        self.assertLess(len(catchBadReads), 1)

        node4 = Node('node4', cluster, True, ('127.0.0.4', 9160),
                     ('127.0.0.4', 7000), '7400', '0', None,
                     ('127.0.0.4', 9042))
        cluster.add(node4, False)
        node4.start()

        debug("replace node and check repair-like process")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160),
                     ('127.0.0.5', 7000), '7500', '0', None,
                     ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        fileRead = node5.grep_log("reading file from .*, repairedAt = 0")
        self.assertGreaterEqual(len(fileRead), 1)

        # additionally we should see 14 distinct keys in the data (this prints to the command line)
        debug((node2.run_sstable2json()))

        rows = cursor.execute("SELECT COUNT(*) FROM ks.cf LIMIT 100")

        results = rows[0]
        debug(results)

        self.assertEqual(results[0], 14)
    def replace_with_reset_resume_state_test(self):
        """Test replace with resetting bootstrap progress"""

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress(['write', 'n=100K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster,
                     True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                     '7400',
                     '0',
                     None,
                     binary_interface=('127.0.0.4', 9042))
        # keep timeout low so that test won't hang
        node4.set_configuration_options(
            values={'streaming_socket_timeout_in_ms': 1000})
        cluster.add(node4, False)
        try:
            node4.start(
                jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                wait_other_notice=False)
        except NodeError:
            pass  # node doesn't start as expected
        t.join()
        node1.start()

        # restart node4 bootstrap with resetting bootstrap state
        node4.stop()
        mark = node4.mark_log()
        node4.start(jvm_args=[
            "-Dcassandra.replace_address_first_boot=127.0.0.3",
            "-Dcassandra.reset_bootstrap_progress=true"
        ])
        # check if we reset bootstrap state
        node4.watch_log_for("Resetting bootstrap progress to start fresh",
                            from_mark=mark)
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

        # check if 2nd bootstrap succeeded
        session = self.exclusive_cql_connection(node4)
        rows = list(
            session.execute(
                "SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert len(rows) == 1
        assert rows[0][0] == 'COMPLETED', rows[0][0]

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)
    def resumable_replace_test(self):
        """
        Test resumable bootstrap while replacing node. Feature introduced in
        2.2 with ticket https://issues.apache.org/jira/browse/CASSANDRA-8838

        @jira_ticket https://issues.apache.org/jira/browse/CASSANDRA-8838
        """

        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        node1.stress(['write', 'n=100K', '-schema', 'replication(factor=3)'])

        session = self.patient_cql_connection(node1)
        stress_table = 'keyspace1.standard1'
        query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                consistency_level=ConsistencyLevel.THREE)
        initialData = list(session.execute(query))

        node3.stop(gently=False)

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()
        # replace node 3 with node 4
        debug("Starting node 4 to replace node 3")
        node4 = Node('node4',
                     cluster,
                     True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                     '7400',
                     '0',
                     None,
                     binary_interface=('127.0.0.4', 9042))
        # keep timeout low so that test won't hang
        node4.set_configuration_options(
            values={'streaming_socket_timeout_in_ms': 1000})
        cluster.add(node4, False)
        try:
            node4.start(
                jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                wait_other_notice=False)
        except NodeError:
            pass  # node doesn't start as expected
        t.join()

        # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
        node1.start()
        node4.nodetool('bootstrap resume')
        # check if we skipped already retrieved ranges
        node4.watch_log_for("already available. Skipping streaming.")
        # wait for node4 to be ready to query
        node4.watch_log_for("Listening for thrift clients...")

        # check if 2nd bootstrap succeeded
        session = self.exclusive_cql_connection(node4)
        rows = list(
            session.execute(
                "SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert len(rows) == 1
        assert rows[0][0] == 'COMPLETED', rows[0][0]

        # query should work again
        debug("Verifying querying works again.")
        finalData = list(session.execute(query))
        self.assertListEqual(initialData, finalData)
    def multiple_repair_test(self):
        cluster = self.cluster
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 3)
        self.create_cf(session,
                       'cf',
                       read_repair=0.0,
                       columns={
                           'c1': 'text',
                           'c2': 'text'
                       })

        debug("insert data")

        insert_c1c2(session,
                    keys=range(1, 50),
                    consistency=ConsistencyLevel.ALL)
        node1.flush()

        debug("bringing down node 3")
        node3.flush()
        node3.stop(gently=False)

        debug("inserting additional data into node 1 and 2")
        insert_c1c2(session,
                    keys=range(50, 100),
                    consistency=ConsistencyLevel.TWO)
        node1.flush()
        node2.flush()

        debug("restarting and repairing node 3")
        node3.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node3.repair()
        else:
            node3.nodetool("repair -par -inc")

        debug("stopping node 2")
        node2.stop(gently=False)

        debug("inserting data in nodes 1 and 3")
        insert_c1c2(session,
                    keys=range(100, 150),
                    consistency=ConsistencyLevel.TWO)
        node1.flush()
        node3.flush()

        debug("start and repair node 2")
        node2.start(wait_for_binary_proto=True)

        if cluster.version() >= "2.2":
            node2.repair()
        else:
            node2.nodetool("repair -par -inc")

        debug("replace node and check data integrity")
        node3.stop(gently=False)
        node5 = Node('node5', cluster, True, ('127.0.0.5', 9160),
                     ('127.0.0.5', 7000), '7500', '0', None,
                     ('127.0.0.5', 9042))
        cluster.add(node5, False)
        node5.start(replace_address='127.0.0.3', wait_other_notice=True)

        assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])