def test_add_and_remove_node(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start()
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    logger.debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    session = self.patient_cql_connection(node1)
    # reduce system_distributed and system_traces RF to 1 so we don't require forceful decommission
    session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")
    session.execute("ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")

    logger.debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None,
                 binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start()
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=120.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "NEW_NODE" == notifications[0]["change_type"]
    assert "UP" == notifications[1]["change_type"]

    logger.debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=120.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "REMOVED_NODE" == notifications[0]["change_type"]
    assert "DOWN" == notifications[1]["change_type"]
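
# The assertions above compare notification addresses against get_ip_from_node,
# a dtest helper. A minimal sketch of what such a helper might look like,
# assuming ccm's Node exposes its addresses via network_interfaces (the
# binary-then-thrift fallback order here is an assumption):
def get_ip_from_node(node):
    # prefer the native-protocol (binary) address; fall back to thrift
    if node.network_interfaces['binary']:
        node_ip = node.network_interfaces['binary'][0]
    else:
        node_ip = node.network_interfaces['thrift'][0]
    return node_ip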
def replace_with_reset_resume_state_test(self):
    """Test replace with resetting bootstrap progress"""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initial_data = rows_to_list(session.execute(query))

    node3.stop(gently=False)

    # kill node1 in the middle of streaming so that node4's bootstrap fails
    t = InterruptBootstrap(node1)
    t.start()

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160), storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    # keep the timeout low so that the test won't hang
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    node1.start()

    # restart node4's bootstrap, resetting the saved bootstrap state
    node4.stop()
    mark = node4.mark_log()
    node4.start(jvm_args=[
        "-Dcassandra.replace_address_first_boot=127.0.0.3",
        "-Dcassandra.reset_bootstrap_progress=true"
    ])
    # check that we reset bootstrap state
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
    # wait for node4 to be ready to accept queries
    node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

    # check that the second bootstrap succeeded
    assert_bootstrap_state(self, node4, 'COMPLETED')

    # query should work again
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)

    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=initial_data, cl=ConsistencyLevel.ONE)
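
# The test above relies on an InterruptBootstrap helper thread to kill node1
# while node4 is streaming from it, forcing the first bootstrap attempt to
# fail. A minimal sketch, assuming the helper blocks on a streaming-related
# log line (the exact "Prepare completed" message is an assumption):
from threading import Thread


class InterruptBootstrap(Thread):

    def __init__(self, node):
        Thread.__init__(self)
        self.node = node

    def run(self):
        # wait until streaming has started, then kill the node hard
        self.node.watch_log_for("Prepare completed")
        self.node.stop(gently=False)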
def _init_new_loading_node(self, ks_name, create_stmt, use_thrift=False):
    loading_node = Node(
        name='node2',
        cluster=self.cluster,
        auto_bootstrap=False,
        thrift_interface=('127.0.0.2', 9160) if use_thrift else None,
        storage_interface=('127.0.0.2', 7000),
        jmx_port='7400',
        remote_debug_port='0',
        initial_token=None,
        binary_interface=('127.0.0.2', 9042)
    )
    logger.debug('adding node')
    self.cluster.add(loading_node, is_seed=True)
    logger.debug('starting new node')
    loading_node.start(wait_for_binary_proto=True)
    logger.debug('recreating ks and table')
    loading_session = self.patient_exclusive_cql_connection(loading_node)
    create_ks(loading_session, ks_name, rf=1)
    logger.debug('creating new table')
    loading_session.execute(create_stmt)
    logger.debug('stopping new node')
    loading_session.cluster.shutdown()
    loading_node.stop()
    return loading_node
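
# create_ks is a dtest helper that creates a keyspace and switches the session
# to it. A minimal sketch of the SimpleStrategy case, assuming an integer rf
# (the real helper also accepts a per-datacenter dict for NetworkTopologyStrategy):
def create_ks(session, name, rf):
    session.execute("CREATE KEYSPACE %s WITH replication = "
                    "{'class': 'SimpleStrategy', 'replication_factor': %d}" % (name, rf))
    session.execute('USE {}'.format(name))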
def replace_with_reset_resume_state_test(self):
    """Test replace with resetting bootstrap progress"""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    node3.stop(gently=False)

    # kill node1 in the middle of streaming so that node4's bootstrap fails
    t = InterruptBootstrap(node1)
    t.start()

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    # keep the timeout low so that the test won't hang
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    node1.start()

    # restart node4's bootstrap, resetting the saved bootstrap state
    node4.stop()
    mark = node4.mark_log()
    node4.start(jvm_args=[
        "-Dcassandra.replace_address_first_boot=127.0.0.3",
        "-Dcassandra.reset_bootstrap_progress=true"
    ])
    # check that we reset bootstrap state
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
    # wait for node4 to be ready to accept queries
    node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

    # check that the second bootstrap succeeded
    session = self.exclusive_cql_connection(node4)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'COMPLETED', rows[0][0]

    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)
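
# The inline system.local check above is what the newer variant of this test
# factors out into the assert_bootstrap_state helper. A sketch of that helper,
# mirroring the inline query (the exclusive-connection detail is an assumption):
def assert_bootstrap_state(tester, node, expected_bootstrap_state):
    # read the persisted bootstrap state from system.local and compare
    session = tester.patient_exclusive_cql_connection(node)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == expected_bootstrap_state, rows[0][0]
    session.cluster.shutdown()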
def add_and_remove_node_test(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, ('127.0.0.2', 9160), ('127.0.0.2', 7000),
                 '7200', '0', None, ('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEqual(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEqual(self.get_ip_from_node(node2), notification["address"][0])
    self.assertEqual("NEW_NODE", notifications[0]["change_type"])
    self.assertEqual("UP", notifications[1]["change_type"])

    debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEqual(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEqual(self.get_ip_from_node(node2), notification["address"][0])
    self.assertEqual("REMOVED_NODE", notifications[0]["change_type"])
    self.assertEqual("DOWN", notifications[1]["change_type"])
def test_add_and_remove_node(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    logger.debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    session = self.patient_cql_connection(node1)
    # reduce system_distributed and system_traces RF to 1 so we don't require forceful decommission
    session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")
    session.execute("ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")

    logger.debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None,
                 binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "NEW_NODE" == notifications[0]["change_type"]
    assert "UP" == notifications[1]["change_type"]

    logger.debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "REMOVED_NODE" == notifications[0]["change_type"]
    assert "DOWN" == notifications[1]["change_type"]
def add_and_remove_node_test(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None,
                 binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEqual(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEqual(get_ip_from_node(node2), notification["address"][0])
    self.assertEqual("NEW_NODE", notifications[0]["change_type"])
    self.assertEqual("UP", notifications[1]["change_type"])

    debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEqual(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEqual(get_ip_from_node(node2), notification["address"][0])
    self.assertEqual("REMOVED_NODE", notifications[0]["change_type"])
    self.assertEqual("DOWN", notifications[1]["change_type"])
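
# NotificationWaiter listens for raw STATUS_CHANGE / TOPOLOGY_CHANGE protocol
# events on a control connection. For reference, the python driver also exposes
# these transitions through its public HostStateListener API; the sketch below
# is that alternative technique, not the NotificationWaiter implementation:
from cassandra.cluster import Cluster
from cassandra.policies import HostStateListener


class TopologyLogger(HostStateListener):

    def __init__(self):
        self.events = []

    def on_add(self, host):
        self.events.append(('NEW_NODE', host.address))

    def on_remove(self, host):
        self.events.append(('REMOVED_NODE', host.address))

    def on_up(self, host):
        self.events.append(('UP', host.address))

    def on_down(self, host):
        self.events.append(('DOWN', host.address))


# usage: register before topology changes happen, then inspect .events
listener = TopologyLogger()
cluster = Cluster(['127.0.0.1'])
cluster.register_listener(listener)
session = cluster.connect()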
def replace_first_boot_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    if DISABLE_VNODES:
        numNodes = 1
    else:
        # a little hacky but grep_log returns the whole line...
        numNodes = int(node3.get_conf_option('num_tokens'))

    debug(numNodes)

    debug("Inserting Data...")
    node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    # stop the node; the query should no longer work at consistency THREE
    debug("Stopping node 3.")
    node3.stop(gently=False)

    debug("Testing node stoppage (query should fail).")
    with self.assertRaises(NodeUnavailable):
        try:
            session.execute(query, timeout=30)
        except (Unavailable, ReadTimeout):
            raise NodeUnavailable("Node could not be queried.")

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)

    debug("Verifying tokens migrated successfully")
    movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug(movedTokensList[0])
    self.assertEqual(len(movedTokensList), numNodes)

    # check that restarting node 3 doesn't work
    debug("Try to restart node 3 (should fail)")
    node3.start(wait_other_notice=False)
    checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
    debug(checkCollision)
    self.assertEqual(len(checkCollision), 1)

    # restart node4 (if this errors, num_tokens may need to change)
    node4.stop(gently=False)
    node4.start(wait_for_binary_proto=True, wait_other_notice=False)

    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)

    # we redo this check because restarting the node should not move tokens again, i.e. the count should be unchanged
    debug("Verifying tokens migrated successfully")
    movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug(movedTokensList[0])
    self.assertEqual(len(movedTokensList), numNodes)
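
# The assertRaises block above funnels driver-level Unavailable and ReadTimeout
# errors into a single exception type so there is one thing to catch. In dtest
# this is a plain Exception subclass; a sketch:
class NodeUnavailable(Exception):
    pass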
def replace_first_boot_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    if DISABLE_VNODES:
        num_tokens = 1
    else:
        # a little hacky but grep_log returns the whole line...
        num_tokens = int(node3.get_conf_option('num_tokens'))

    debug("testing with num_tokens: {}".format(num_tokens))

    debug("Inserting Data...")
    node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initial_data = rows_to_list(session.execute(query))

    # stop the node; the query should no longer work at consistency THREE
    debug("Stopping node 3.")
    node3.stop(gently=False)

    debug("Testing node stoppage (query should fail).")
    with self.assertRaises(NodeUnavailable):
        try:
            session.execute(query, timeout=30)
        except (Unavailable, ReadTimeout):
            raise NodeUnavailable("Node could not be queried.")

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160), storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

    debug("Verifying tokens migrated successfully")
    moved_tokens = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug("number of moved tokens: {}".format(len(moved_tokens)))
    self.assertEqual(len(moved_tokens), num_tokens)

    # check that restarting node 3 doesn't work
    debug("Try to restart node 3 (should fail)")
    node3.start(wait_other_notice=False)
    collision_log = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
    debug(collision_log)
    self.assertEqual(len(collision_log), 1)

    # restart node4 (if this errors, num_tokens may need to change)
    node4.stop(gently=False)
    node4.start(wait_for_binary_proto=True, wait_other_notice=False)

    # we redo this check because restarting the node should not move tokens again, i.e. the count should be unchanged
    debug("Verifying tokens migrated successfully")
    moved_tokens = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug("number of moved tokens: {}".format(len(moved_tokens)))
    self.assertEqual(len(moved_tokens), num_tokens)

    # query should work again
    debug("Stopping old nodes")
    node1.stop(gently=False, wait_other_notice=True)
    node2.stop(gently=False, wait_other_notice=True)

    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node4)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=initial_data, cl=ConsistencyLevel.ONE)
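
# rows_to_list normalizes the driver's Row objects (named tuples) into plain
# lists so result sets can be compared structurally, e.g. against assert_all's
# expected data. A minimal sketch:
def rows_to_list(rows):
    return [list(row) for row in rows]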