def replace_with_reset_resume_state_test(self):
    """Replace a dead node, then restart the replacement with a wiped
    bootstrap state and verify the second attempt completes and serves data.
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    baseline_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                     consistency_level=ConsistencyLevel.THREE)
    baseline_rows = rows_to_list(session.execute(baseline_query))

    node3.stop(gently=False)

    # Interrupt streaming by killing node1 part-way through the bootstrap.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Bring in node4 as the replacement for the dead node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4',
                 cluster=cluster,
                 auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400',
                 remote_debug_port='0',
                 initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # A short streaming timeout keeps the broken session from hanging the test.
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        # Expected: the interrupted replace attempt does not come up cleanly.
        pass
    interrupter.join()
    node1.start()

    # Second attempt: restart node4 and discard any saved bootstrap progress.
    node4.stop()
    restart_mark = node4.mark_log()
    reset_jvm_args = ["-Dcassandra.replace_address_first_boot=127.0.0.3",
                      "-Dcassandra.reset_bootstrap_progress=true"]
    node4.start(jvm_args=reset_jvm_args)

    # Confirm the reset actually happened, then wait until node4 is queryable.
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=restart_mark)
    node4.watch_log_for("Listening for thrift clients...", from_mark=restart_mark)
    assert_bootstrap_state(self, node4, 'COMPLETED')

    # With the old replicas down, node4 alone must be able to serve the data.
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)
    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=baseline_rows, cl=ConsistencyLevel.ONE)
def bootstrap_with_reset_bootstrap_state_test(self):
    """Interrupt a bootstrap, then restart the joining node with
    -Dcassandra.reset_bootstrap_progress=true and verify the second,
    from-scratch bootstrap completes.
    """
    cluster = self.cluster
    # Throttle streaming so the interrupt reliably lands mid-transfer.
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', '-schema', 'replication(factor=2)'])
    node1.flush()

    # Kill node1 mid-stream so the first bootstrap attempt fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Begin bootstrapping a third node; its initial start is expected to fail.
    node3 = new_node(cluster)
    try:
        node3.start()
    except NodeError:
        pass
    interrupter.join()
    node1.start()

    # Restart node3 with the flag that throws away saved bootstrap progress.
    node3.stop()
    restart_mark = node3.mark_log()
    node3.start(jvm_args=["-Dcassandra.reset_bootstrap_progress=true"])

    # The log must show the reset, then the node must come fully up.
    node3.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=restart_mark)
    node3.watch_log_for("Listening for thrift clients...", from_mark=restart_mark)

    # The second bootstrap attempt must have completed.
    assert_bootstrap_state(self, node3, 'COMPLETED')
def resumable_replace_test(self):
    """
    Test resumable bootstrap while replacing node. Feature introduced in
    2.2 with ticket https://issues.apache.org/jira/browse/CASSANDRA-8838

    @jira_ticket https://issues.apache.org/jira/browse/CASSANDRA-8838
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    baseline_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                     consistency_level=ConsistencyLevel.THREE)
    baseline_rows = rows_to_list(session.execute(baseline_query))

    node3.stop(gently=False)

    # Kill node1 part-way through streaming so the replace attempt fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Spin up node4 as a replacement for the dead node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4',
                 cluster=cluster,
                 auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400',
                 remote_debug_port='0',
                 initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # A short streaming timeout prevents the broken session from hanging the test.
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        # Expected: the interrupted replace does not come up cleanly.
        pass
    interrupter.join()

    # Restore node1, then resume the interrupted bootstrap on node4.
    node1.start()
    node4.nodetool('bootstrap resume')

    # Ranges fetched before the interruption must not be streamed again.
    node4.watch_log_for("already available. Skipping streaming.")
    # Wait until node4 is ready to serve clients.
    node4.watch_log_for("Listening for thrift clients...")
    # The resumed bootstrap must have finished.
    assert_bootstrap_state(self, node4, 'COMPLETED')

    # With the original replicas stopped, node4 alone must return the data.
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)
    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=baseline_rows, cl=ConsistencyLevel.ONE)
def resumable_replace_test(self):
    """Test resumable bootstrap while replacing node"""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    sample_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                   consistency_level=ConsistencyLevel.THREE)
    rows_before = list(session.execute(sample_query))

    node3.stop(gently=False)

    # Kill node1 in the middle of streaming so the replace attempt fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Launch node4 as the replacement for node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"])
    except NodeError:
        # Expected: the interrupted start does not succeed.
        pass
    interrupter.join()

    # Restore node1 and resume the interrupted bootstrap on node4.
    node1.start()
    node4.nodetool('bootstrap resume')

    # Previously fetched ranges must be skipped on resume.
    node4.watch_log_for("already available. Skipping streaming.")
    # Wait for node4 to be ready for clients.
    node4.watch_log_for("Listening for thrift clients...")

    # The system.local bootstrap flag must report a finished bootstrap.
    session = self.exclusive_cql_connection(node4)
    status_rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(status_rows) == 1
    assert status_rows[0][0] == 'COMPLETED', status_rows[0][0]

    # Querying through node4 must return the same data as before the replace.
    debug("Verifying querying works again.")
    rows_after = list(session.execute(sample_query))
    self.assertListEqual(rows_before, rows_after)
def resumable_bootstrap_test(self):
    """Test resuming bootstrap after data streaming failure.

    Interrupts streaming to a joining node, verifies the node is left in
    bootstrap state IN_PROGRESS, resumes with `nodetool bootstrap resume`,
    and finally checks all the streamed data is readable.
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'cl=TWO', '-schema', 'replication(factor=2)'])
    cluster.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start(wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode; use unittest assertions
    # because bare `assert` statements are stripped under `python -O`
    session = self.patient_exclusive_cql_connection(node3)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0][0], 'IN_PROGRESS')

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(rows[0][0], 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()

    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr = node3.stress(['read', 'n=100k', "no-warmup", '-schema',
                                   'replication(factor=2)', '-rate', 'threads=8'],
                                  capture_output=True)
    # consistent with sibling tests: fail via unittest, not a bare assert
    if stdout is not None:
        self.assertNotIn("FAILURE", stdout)
def resumable_bootstrap_test(self):
    """ Test resuming bootstrap after data streaming failure """
    cluster = self.cluster
    # Throttle streaming so the kill lands while data is still in flight.
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'no-warmup', 'cl=TWO', '-schema',
                  'replication(factor=2)', '-rate', 'threads=50'])
    cluster.flush()

    # Kill node1 mid-stream so the bootstrap of node3 fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Start bootstrapping node3; a short streaming timeout keeps it from hanging.
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    node3.start(wait_other_notice=False, wait_for_binary_proto=True)
    interrupter.join()

    # node3 should be up for CQL but stuck mid-bootstrap.
    node3.watch_log_for("Starting listening for CQL clients")
    resume_mark = node3.mark_log()
    assert_bootstrap_state(self, node3, 'IN_PROGRESS')

    # Bring node1 back and resume the interrupted bootstrap.
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=resume_mark)
    assert_bootstrap_state(self, node3, 'COMPLETED')

    # Cleanup so every node keeps only sstables for its own ranges.
    cluster.cleanup()

    # Read everything back to prove streaming transferred all the data.
    debug("Check data is present")
    stdout, stderr = node3.stress(['read', 'n=100k', 'no-warmup', '-schema',
                                   'replication(factor=2)', '-rate', 'threads=8'],
                                  capture_output=True)
    if stdout is not None:
        self.assertNotIn("FAILURE", stdout)
def replace_with_reset_resume_state_test(self):
    """Test replace with resetting bootstrap progress.

    Interrupts the replacement of a dead node, then restarts the
    replacement node with -Dcassandra.reset_bootstrap_progress=true and
    verifies the second, from-scratch bootstrap completes and the data
    is still queryable.
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()
    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])
    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                            consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))
    node3.stop(gently=False)
    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()
    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    # keep timeout low so that test won't hang
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    node1.start()
    # restart node4 bootstrap with resetting bootstrap state
    node4.stop()
    mark = node4.mark_log()
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3",
                          "-Dcassandra.reset_bootstrap_progress=true"])
    # check if we reset bootstrap state
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
    # wait for node4 ready to query
    node4.watch_log_for("Listening for thrift clients...", from_mark=mark)
    # check if 2nd bootstrap succeeded; use unittest assertions because
    # bare `assert` statements are stripped when running under `python -O`
    session = self.exclusive_cql_connection(node4)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0][0], 'COMPLETED')
    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)
def resumable_replace_test(self):
    """
    Test resumable bootstrap while replacing node. Feature introduced in
    2.2 with ticket https://issues.apache.org/jira/browse/CASSANDRA-8838

    @jira_ticket https://issues.apache.org/jira/browse/CASSANDRA-8838
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    sample_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                   consistency_level=ConsistencyLevel.THREE)
    rows_before = list(session.execute(sample_query))

    node3.stop(gently=False)

    # Interrupt streaming by killing node1 part-way through.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Bring up node4 to take over the token of the dead node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # Short streaming timeout so the interrupted session cannot hang the test.
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        # Expected: the interrupted replace attempt fails.
        pass
    interrupter.join()

    # Restore node1 and resume bootstrapping on node4.
    node1.start()
    node4.nodetool('bootstrap resume')

    # Ranges streamed before the failure must be skipped this time.
    node4.watch_log_for("already available. Skipping streaming.")
    # Wait until node4 accepts clients.
    node4.watch_log_for("Listening for thrift clients...")

    # Verify the resumed bootstrap completed.
    session = self.exclusive_cql_connection(node4)
    status_rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(status_rows) == 1
    assert status_rows[0][0] == 'COMPLETED', status_rows[0][0]

    # The original sample must still be readable through the new node.
    debug("Verifying querying works again.")
    rows_after = list(session.execute(sample_query))
    self.assertListEqual(rows_before, rows_after)
def resumable_bootstrap_test(self):
    """ Test resuming bootstrap after data streaming failure """
    cluster = self.cluster
    # Slow streaming down so the interrupt lands mid-transfer.
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']

    node1.stress(['write', 'n=100K', 'no-warmup', 'cl=TWO', '-schema',
                  'replication(factor=2)', '-rate', 'threads=50'])
    cluster.flush()

    # Kill node1 in the middle of streaming so bootstrap fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Start node3's bootstrap; a low streaming timeout avoids a hang.
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    node3.start(wait_other_notice=False, wait_for_binary_proto=True)
    interrupter.join()

    # node3 comes up for CQL but must still be mid-bootstrap.
    node3.watch_log_for("Starting listening for CQL clients")
    resume_mark = node3.mark_log()
    assert_bootstrap_state(self, node3, 'IN_PROGRESS')

    # Restore node1, then resume the interrupted bootstrap on node3.
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=resume_mark)
    assert_bootstrap_state(self, node3, 'COMPLETED')

    # Cleanup so each node only holds sstables for its own ranges.
    cluster.cleanup()

    # Read the full data set back to prove streaming completed.
    debug("Check data is present")
    stdout, stderr = node3.stress(['read', 'n=100k', 'no-warmup', '-schema',
                                   'replication(factor=2)', '-rate', 'threads=8'],
                                  capture_output=True)
    if stdout is not None:
        self.assertNotIn("FAILURE", stdout)
def resumable_replace_test(self):
    """Test resumable bootstrap while replacing node.

    The first replace attempt is interrupted mid-stream; `nodetool
    bootstrap resume` must then finish the bootstrap without re-streaming
    ranges that were already transferred.
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()
    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])
    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                            consistency_level=ConsistencyLevel.THREE)
    # materialize the result: execute() returns a ResultSet, which supports
    # neither len() nor list comparison, so wrap it in list() (consistent
    # with the other tests in this file)
    initialData = list(session.execute(query))
    node3.stop(gently=False)
    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()
    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, ('127.0.0.4', 9042))
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"])
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start()
    node4.nodetool('bootstrap resume')
    # check if we skipped already retrieved ranges
    node4.watch_log_for("already available. Skipping streaming.")
    # wait for node4 ready to query
    node4.watch_log_for("Listening for thrift clients...")
    # check if 2nd bootstrap succeeded
    session = self.exclusive_cql_connection(node4)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'COMPLETED', rows[0][0]
    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)
def resumable_bootstrap_test(self):
    """Test resuming bootstrap after data streaming failure.

    Interrupts streaming to the joining node3, checks it is stuck in
    IN_PROGRESS, then resumes with `nodetool bootstrap resume` and checks
    the bootstrap reaches COMPLETED.
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=2)'])
    node1.flush()
    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()
    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start()
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.exclusive_cql_connection(node3)
    # materialize the ResultSet: execute() does not return a list, so
    # len() would fail without the list() wrapper (consistent with the
    # other tests in this file)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'IN_PROGRESS', rows[0][0]
    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    # check if we skipped already retrieved ranges
    node3.watch_log_for("already available. Skipping streaming.")
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert rows[0][0] == 'COMPLETED', rows[0][0]
def resumable_bootstrap_test(self):
    """ Test resuming bootstrap after data streaming failure """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']

    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=2)'])
    node1.flush()

    # Kill node1 mid-stream so the first bootstrap attempt fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Begin bootstrapping node3: throttle streaming so the interrupt lands,
    # and keep the socket timeout short so nothing hangs.
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start()
    except NodeError:
        # Expected: the interrupted start does not succeed.
        pass
    interrupter.join()

    # node3 comes up for CQL but must still be mid-bootstrap.
    node3.watch_log_for("Starting listening for CQL clients")
    resume_mark = node3.mark_log()
    session = self.exclusive_cql_connection(node3)
    status_rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(status_rows) == 1
    assert status_rows[0][0] == 'IN_PROGRESS', status_rows[0][0]

    # Restore node1, then resume the interrupted bootstrap on node3.
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')

    # Already-fetched ranges must be skipped, and the resume must finish.
    node3.watch_log_for("already available. Skipping streaming.")
    node3.watch_log_for("Resume complete", from_mark=resume_mark)
    status_rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert status_rows[0][0] == 'COMPLETED', status_rows[0][0]
def resumable_bootstrap_test(self):
    """Test resuming bootstrap after data streaming failure.

    Interrupts streaming to the joining node3, asserts it is left
    IN_PROGRESS, resumes with `nodetool bootstrap resume`, and verifies
    both the COMPLETED state and that the streamed data is readable.
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'cl=TWO', '-schema',
                  'replication(factor=2)', '-rate', 'threads=50'])
    cluster.flush()
    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()
    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start(wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.patient_exclusive_cql_connection(node3)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0][0], 'IN_PROGRESS')
    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(rows[0][0], 'COMPLETED')
    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()
    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr = node3.stress(['read', 'n=100k', "no-warmup", '-schema',
                                   'replication(factor=2)', '-rate', 'threads=8'],
                                  capture_output=True)
    if stdout and "FAILURE" in stdout:
        debug(stdout)
        # self.fail (unlike a bare `assert False`) still fires under `python -O`
        self.fail("Cannot read inserted data after bootstrap")
def replace_with_reset_resume_state_test(self):
    """Replace a dead node, interrupt the replacement's bootstrap, then
    restart the replacement with -Dcassandra.reset_bootstrap_progress=true
    and verify the fresh bootstrap completes and the node serves the data.
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    baseline_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                     consistency_level=ConsistencyLevel.THREE)
    baseline_rows = rows_to_list(session.execute(baseline_query))

    node3.stop(gently=False)

    # Kill node1 part-way through streaming so the replace attempt fails.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Launch node4 as the replacement for the dead node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4',
                 cluster=cluster,
                 auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400',
                 remote_debug_port='0',
                 initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # Short streaming timeout keeps the interrupted session from hanging.
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        # Expected: the first replace attempt fails.
        pass
    interrupter.join()
    node1.start()

    # Restart node4, this time discarding any saved bootstrap progress.
    node4.stop()
    restart_mark = node4.mark_log()
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3",
                          "-Dcassandra.reset_bootstrap_progress=true"])

    # The reset must be logged, then node4 must come fully up.
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=restart_mark)
    node4.watch_log_for("Listening for thrift clients...", from_mark=restart_mark)
    assert_bootstrap_state(self, node4, 'COMPLETED')

    # Stop the surviving replicas; node4 alone must return the data.
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)
    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=baseline_rows, cl=ConsistencyLevel.ONE)
def resumable_replace_test(self):
    """
    Test resumable bootstrap while replacing node. Feature introduced in
    2.2 with ticket https://issues.apache.org/jira/browse/CASSANDRA-8838

    @jira_ticket https://issues.apache.org/jira/browse/CASSANDRA-8838
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    baseline_query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                                     consistency_level=ConsistencyLevel.THREE)
    baseline_rows = rows_to_list(session.execute(baseline_query))

    node3.stop(gently=False)

    # Interrupt streaming by killing node1 part-way through the bootstrap.
    interrupter = InterruptBootstrap(node1)
    interrupter.start()

    # Bring up node4 to take over the token of the dead node3.
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4',
                 cluster=cluster,
                 auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160),
                 storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400',
                 remote_debug_port='0',
                 initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # Short streaming timeout so the interrupted session cannot hang the test.
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"],
                    wait_other_notice=False)
    except NodeError:
        # Expected: the interrupted replace attempt fails.
        pass
    interrupter.join()

    # Restore node1, then resume the interrupted bootstrap on node4.
    node1.start()
    node4.nodetool('bootstrap resume')

    # Ranges streamed before the failure must be skipped on resume.
    node4.watch_log_for("already available. Skipping streaming.")
    # Wait until node4 is ready for clients.
    node4.watch_log_for("Listening for thrift clients...")
    # The resumed bootstrap must have finished.
    assert_bootstrap_state(self, node4, 'COMPLETED')

    # With the original replicas stopped, node4 alone must serve the data.
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)
    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=baseline_rows, cl=ConsistencyLevel.ONE)