def __init__(self, cluster):
    # A lot of things are wrong in this method. It assumes that the ip
    # 127.0.0.<nbnode> is free and uses standard ports without asking.
    # It should probably be fixed, but is good enough for now.
    addr = "127.0.0.%d" % (len(cluster.nodes) + 1)
    self.path = tempfile.mkdtemp(prefix="bulkloader-")
    Node.__init__(self, "bulkloader", cluster, False, (addr, 9160), (addr, 7000), str(9042), 2000, None)
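# The comment above concedes that the loopback address and the standard ports
# are simply assumed to be free. A minimal sketch of how that assumption could
# be checked before calling Node.__init__ (port_is_free is an illustrative
# helper, not part of ccm):
import socket

def port_is_free(addr, port):
    # Try to bind the address/port pair; if the OS refuses, something else
    # already owns it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((addr, port))
            return True
        except OSError:
            return False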
def copy_config_files(self):
    Node.copy_config_files(self)
    conf_pattern = os.path.join(self.get_tools_java_dir(), 'conf', "jvm*.options")
    for filename in glob.glob(conf_pattern):
        if os.path.isfile(filename):
            shutil.copy(filename, self.get_conf_dir())
def test_upgrade_legacy_table(self):
    """
    Upgrade with bringing up the legacy tables after the newer nodes
    (without legacy tables) were started.

    @jira_ticket CASSANDRA-12813
    """
    cluster = self.cluster

    # Forcing cluster version on purpose
    cluster.set_install_dir(version="2.1.16")
    cluster.populate(3).start()

    node1, node2, node3 = cluster.nodelist()

    # Wait for default user to get created on one of the nodes
    time.sleep(15)

    # Upgrade to current version
    for node in [node1, node2, node3]:
        node.drain()
        node.watch_log_for("DRAINED")
        node.stop(gently=True)
        self.set_node_to_current_version(node)
    cluster.start()

    # Make sure the system_auth table will get replicated to the node that we're going to replace
    session = self.patient_cql_connection(node1, user='******', password='******')
    session.execute("ALTER KEYSPACE system_auth WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 };")
    cluster.repair()
    cluster.stop()

    # Replace the node
    cluster.seeds.remove(node1)
    cluster.remove(node1)

    replacement_address = node1.address()
    replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                            thrift_interface=(replacement_address, 9160),
                            storage_interface=(replacement_address, 7000),
                            jmx_port='7400', remote_debug_port='0', initial_token=None,
                            binary_interface=(replacement_address, 9042))
    self.set_node_to_current_version(replacement_node)

    cluster.add(replacement_node, True)
    replacement_node.start(wait_for_binary_proto=True)

    node2.start(wait_for_binary_proto=True)
    node3.start(wait_for_binary_proto=True)

    replacement_node.watch_log_for('Initializing system_auth.credentials')
    replacement_node.watch_log_for('Initializing system_auth.permissions')
    replacement_node.watch_log_for('Initializing system_auth.users')

    cluster.repair()
    replacement_node.watch_log_for('Repair command')

    # Should succeed. Will throw an NPE on pre-12813 code.
    self.patient_cql_connection(replacement_node, user='******', password='******')
def replace_nonexistent_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    debug("Inserting Data...")
    if cluster.version() < "2.1":
        node1.stress(['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
    else:
        node1.stress(['write', 'n=10000', '-schema', 'replication(factor=3)'])

    cursor = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = cursor.execute(query)

    debug('Start node 4 and replace an address with no node')
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, ('127.0.0.4', 9042))
    cluster.add(node4, False)

    # try to replace an unassigned ip address
    with self.assertRaises(NodeError):
        try:
            node4.start(replace_address='127.0.0.5', wait_for_binary_proto=True)
        except (NodeError, TimeoutError):
            raise NodeError("Node could not start.")
def replace_active_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    debug("Inserting Data...")
    if cluster.version() < "2.1":
        node1.stress(['-o', 'insert', '--num-keys=10000', '--replication-factor=3'])
    else:
        node1.stress(['write', 'n=10000', '-schema', 'replication(factor=3)'])

    cursor = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = cursor.execute(query)

    # replace active node 3 with node 4
    debug("Starting node 4 to replace active node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, ('127.0.0.4', 9042))
    cluster.add(node4, False)

    with self.assertRaises(NodeError):
        try:
            node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
        except (NodeError, TimeoutError):
            raise NodeError("Node could not start.")

    checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
    self.assertEqual(len(checkError), 1)
def decommission_node_test(self):
    debug("decommission_node_test()")
    cluster = self.cluster
    cluster.populate(1)
    # create and add a new node; it must not be a seed, otherwise
    # we get schema disagreement issues for a while after decommissioning it.
    node2 = Node('node2', cluster, True, ('127.0.0.2', 9160), ('127.0.0.2', 7000), '7200', '0', None)
    cluster.add(node2, False)

    node1, node2 = cluster.nodelist()
    node1.start()
    node2.start()
    wait(2)

    cursor = self.cql_connection(node1).cursor()
    self.prepare_for_changes(cursor)

    node2.decommission()
    wait(30)

    self.validate_schema_consistent(node1)
    self.make_schema_changes(cursor, namespace='ns1')

    # create and add a new node
    node3 = Node('node3', cluster, True, ('127.0.0.3', 9160), ('127.0.0.3', 7000), '7300', '0', None)
    cluster.add(node3, True)
    node3.start()
    wait(30)
    self.validate_schema_consistent(node1)
def _do_replace(self, same_address=False, jvm_option='replace_address',
                wait_other_notice=False, wait_for_binary_proto=True,
                replace_address=None, opts=None, data_center=None,
                extra_jvm_args=None):
    if replace_address is None:
        replace_address = self.replaced_node.address()

    # only create node if it's not yet created
    if self.replacement_node is None:
        replacement_address = '127.0.0.4'
        if same_address:
            replacement_address = self.replaced_node.address()
            self.cluster.remove(self.replaced_node)

        logger.debug("Starting replacement node {} with jvm_option '{}={}'".format(replacement_address, jvm_option, replace_address))
        self.replacement_node = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                                     thrift_interface=None, storage_interface=(replacement_address, 7000),
                                     jmx_port='7400', remote_debug_port='0', initial_token=None,
                                     binary_interface=(replacement_address, 9042))
        if opts is not None:
            logger.debug("Setting options on replacement node: {}".format(opts))
            self.replacement_node.set_configuration_options(opts)
        self.cluster.add(self.replacement_node, False, data_center=data_center)

    if extra_jvm_args is None:
        extra_jvm_args = []
    extra_jvm_args.extend(["-Dcassandra.{}={}".format(jvm_option, replace_address),
                           "-Dcassandra.ring_delay_ms=10000",
                           "-Dcassandra.broadcast_interval_ms=10000"])

    self.replacement_node.start(jvm_args=extra_jvm_args,
                                wait_for_binary_proto=wait_for_binary_proto,
                                wait_other_notice=wait_other_notice)

    if self.cluster.cassandra_version() >= '2.2.8' and same_address:
        self.replacement_node.watch_log_for("Writes will not be forwarded to this node during replacement", timeout=60)
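# Illustrative use of the helper above, in the shape a replace test might
# take (the test method name and the option value are assumptions, not from
# the original file; everything else falls back to _do_replace's defaults):
def test_replace_first_boot(self):
    self._do_replace(jvm_option='replace_address_first_boot')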
def _test_disk_balance_replace(self, same_address):
    logger.debug("Creating cluster")
    cluster = self.cluster
    if self.dtest_config.use_vnodes:
        cluster.set_configuration_options(values={'num_tokens': 256})
    # apparently we have legitimate errors in the log when bootstrapping (see bootstrap_test.py)
    self.fixture_dtest_setup.allow_log_errors = True
    cluster.populate(4).start(wait_for_binary_proto=True)
    node1 = cluster.nodes['node1']

    logger.debug("Populating")
    node1.stress(['write', 'n=50k', 'no-warmup', '-rate', 'threads=100',
                  '-schema', 'replication(factor=3)',
                  'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)'])
    cluster.flush()

    logger.debug("Stopping and removing node2")
    node2 = cluster.nodes['node2']
    node2.stop(gently=False)
    self.cluster.remove(node2)

    node5_address = node2.address() if same_address else '127.0.0.5'

    logger.debug("Starting replacement node")
    node5 = Node('node5', cluster=self.cluster, auto_bootstrap=True,
                 thrift_interface=None, storage_interface=(node5_address, 7000),
                 jmx_port='7500', remote_debug_port='0', initial_token=None,
                 binary_interface=(node5_address, 9042))
    self.cluster.add(node5, False)
    node5.start(jvm_args=["-Dcassandra.replace_address_first_boot={}".format(node2.address())],
                wait_for_binary_proto=True,
                wait_other_notice=True)

    logger.debug("Checking replacement node is balanced")
    self.assert_balanced(node5)
def test_replace(self):
    main_session = self.patient_cql_connection(self.node1)

    # We want the node being replaced to have no data on it
    # so the replacement definitely fetches all the data
    self.node2.stop(wait_other_notice=True)

    for i in range(0, 40):
        print("Inserting " + str(i))
        self.insert_row(i, i, i, main_session)

    replacement_address = self.node2.address()
    self.node2.stop(wait_other_notice=True)
    self.cluster.remove(self.node2)

    self.node2 = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                      thrift_interface=None, storage_interface=(replacement_address, 7000),
                      jmx_port='7400', remote_debug_port='0', initial_token=None,
                      binary_interface=(replacement_address, 9042))
    patch_start(self.node2)
    nodes = [self.node1, self.node2, self.node3]
    self.cluster.add(self.node2, False, data_center='datacenter1')
    jvm_args = ["-Dcassandra.replace_address=%s" % replacement_address,
                "-Dcassandra.ring_delay_ms=10000",
                "-Dcassandra.broadcast_interval_ms=10000"]
    self.node2.start(jvm_args=jvm_args, wait_for_binary_proto=True, wait_other_notice=True)

    sessions = [self.exclusive_cql_connection(node) for node in [self.node1, self.node2, self.node3]]

    # Everyone should have everything
    expected = [gen_expected(range(0, 40)),
                gen_expected(range(0, 40)),
                gen_expected(range(0, 40))]
    self.check_replication(sessions, exactly=3)
    self.check_expected(sessions, expected)

    repair_nodes(nodes)
    cleanup_nodes(nodes)

    self.check_replication(sessions, exactly=2)
    expected = [gen_expected(range(0, 11), range(21, 40)),
                gen_expected(range(0, 21), range(31, 40)),
                gen_expected(range(11, 31))]
    self.check_expected(sessions, expected)
def multiple_repair_test(self):
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    self.create_cf(session, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})

    debug("insert data")
    insert_c1c2(session, keys=range(1, 50), consistency=ConsistencyLevel.ALL)
    node1.flush()

    debug("bringing down node 3")
    node3.flush()
    node3.stop(gently=False)

    debug("inserting additional data into node 1 and 2")
    insert_c1c2(session, keys=range(50, 100), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node2.flush()

    debug("restarting and repairing node 3")
    node3.start(wait_for_binary_proto=True)

    if cluster.version() >= "2.2":
        node3.repair()
    else:
        node3.nodetool("repair -par -inc")

    # wait for stream handlers to be closed on windows
    # after session is finished (See CASSANDRA-10644)
    if is_win:
        time.sleep(2)

    debug("stopping node 2")
    node2.stop(gently=False)

    debug("inserting data in nodes 1 and 3")
    insert_c1c2(session, keys=range(100, 150), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node3.flush()

    debug("start and repair node 2")
    node2.start(wait_for_binary_proto=True)

    if cluster.version() >= "2.2":
        node2.repair()
    else:
        node2.nodetool("repair -par -inc")

    debug("replace node and check data integrity")
    node3.stop(gently=False)
    node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000), '7500', '0', None, ('127.0.0.5', 9042))
    cluster.add(node5, False)
    node5.start(replace_address='127.0.0.3', wait_other_notice=True)

    assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
def test_rf_gt_nodes_multidc_should_succeed(self):
    """
    Validating a KS with RF > N on multi DC doesn't break bootstrap
    @jira_ticket CASSANDRA-16296 CASSANDRA-16411
    """
    cluster = self.cluster
    cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
    cluster.populate([1, 1])
    cluster.start()
    node1 = cluster.nodelist()[0]
    node2 = cluster.nodelist()[1]

    session = self.patient_exclusive_cql_connection(node1)
    session.execute("CREATE KEYSPACE k WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : '3'}")

    if cluster.version() >= '4.0':
        warning = 'Your replication factor 3 for keyspace k is higher than the number of nodes 1 for datacenter dc1'
        assert len(node1.grep_log(warning)) == 1
        assert len(node2.grep_log(warning)) == 0

    session.execute("ALTER KEYSPACE k WITH REPLICATION = {'class' : 'NetworkTopologyStrategy', 'dc1' : '2'}")
    session.execute("CREATE TABLE k.testgtrfmultidc (KEY text PRIMARY KEY)")
    session.execute("INSERT INTO k.testgtrfmultidc (KEY) VALUES ('test_rf_gt_nodes_multidc_should_succeed')")

    if cluster.version() >= '4.0':
        warning = 'Your replication factor 2 for keyspace k is higher than the number of nodes 1 for datacenter dc1'
        assert len(node1.grep_log(warning)) == 1
        assert len(node2.grep_log(warning)) == 0

    # take the marks eagerly, before node3 starts (a lazy map() would only be
    # evaluated at zip() time below, after the logs have already grown)
    marks = [n.mark_log() for n in cluster.nodelist()]
    node3 = Node(name='node3',
                 cluster=cluster,
                 auto_bootstrap=True,
                 thrift_interface=('127.0.0.3', 9160),
                 storage_interface=('127.0.0.3', 7000),
                 jmx_port='7300',
                 remote_debug_port='0',
                 initial_token=None,
                 binary_interface=('127.0.0.3', 9042))
    cluster.add(node3, is_seed=False, data_center="dc1")
    node3.start(wait_for_binary_proto=True)

    if cluster.version() >= '4.0':
        warning = 'is higher than the number of nodes'
        for (node, mark) in zip(cluster.nodelist(), marks):
            assert len(node.grep_log(warning, from_mark=mark)) == 0

    session3 = self.patient_exclusive_cql_connection(node3)
    assert_one(session3, "SELECT * FROM k.testgtrfmultidc", ["test_rf_gt_nodes_multidc_should_succeed"])
def multiple_repair_test(self):
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1)
    self.create_ks(cursor, 'ks', 3)
    self.create_cf(cursor, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})

    debug("insert data")
    for x in range(1, 50):
        insert_c1c2(cursor, x, ConsistencyLevel.ALL)
    node1.flush()

    debug("bringing down node 3")
    node3.flush()
    node3.stop(gently=False)

    debug("inserting additional data into node 1 and 2")
    for y in range(50, 100):
        insert_c1c2(cursor, y, ConsistencyLevel.TWO)
    node1.flush()
    node2.flush()

    debug("restarting and repairing node 3")
    node3.start()

    if cluster.version() >= "3.0":
        node3.repair()
    else:
        node3.nodetool("repair -par -inc")

    debug("stopping node 2")
    node2.stop(gently=False)

    debug("inserting data in nodes 1 and 3")
    for z in range(100, 150):
        insert_c1c2(cursor, z, ConsistencyLevel.TWO)
    node1.flush()
    node3.flush()

    debug("start and repair node 2")
    node2.start()

    if cluster.version() >= "3.0":
        node2.repair()
    else:
        node2.nodetool("repair -par -inc")

    debug("replace node and check data integrity")
    node3.stop(gently=False)
    node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000), '7500', '0', None, ('127.0.0.5', 9042))
    cluster.add(node5, False)
    node5.start(replace_address='127.0.0.3', wait_other_notice=True)

    assert_one(cursor, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
def resumable_replace_test(self):
    """Test resumable bootstrap while replacing node"""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    node3.stop(gently=False)

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"])
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start()
    node4.nodetool('bootstrap resume')
    # check if we skipped already retrieved ranges
    node4.watch_log_for("already available. Skipping streaming.")
    # wait for node4 ready to query
    node4.watch_log_for("Listening for thrift clients...")

    # check if 2nd bootstrap succeeded
    session = self.exclusive_cql_connection(node4)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'COMPLETED', rows[0][0]

    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)
def configure_node(self, node: Node, index):
    # set dc/rack manually, since CCM doesn't support custom racks
    node.set_configuration_options({
        'endpoint_snitch': 'GossipingPropertyFileSnitch',
    })

    rackdc_path = Path(node.get_conf_dir()) / 'cassandra-rackdc.properties'
    with open(rackdc_path, 'w') as f:
        f.write(f'dc={node.dc}\nrack={node.rack}\n')
def decommission_node_test(self):
    debug("decommission_node_test()")
    cluster = self.cluster
    cluster.populate(1)
    # create and add a new node; it must not be a seed, otherwise
    # we get schema disagreement issues for a while after decommissioning it.
    node2 = Node("node2", cluster, True, ("127.0.0.2", 9160), ("127.0.0.2", 7000), "7200", None)
    cluster.add(node2, False)

    node1, node2 = cluster.nodelist()
    node1.start()
    node2.start()
    wait(2)

    cursor = self.cql_connection(node1).cursor()
    self.prepare_for_changes(cursor)

    node2.decommission()
    wait(30)

    self.validate_schema_consistent(node1)
    self.make_schema_changes(cursor, namespace="ns1")

    # create and add a new node
    node3 = Node("node3", cluster, True, ("127.0.0.3", 9160), ("127.0.0.3", 7000), "7300", None)
    cluster.add(node3, True)
    node3.start()
    wait(30)
    self.validate_schema_consistent(node1)
def test_add_and_remove_node(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start()
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    logger.debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    session = self.patient_cql_connection(node1)
    # reduce system_distributed RF to 1 so we don't require forceful decommission
    session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")
    session.execute("ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")

    logger.debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start()
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=120.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "NEW_NODE" == notifications[0]["change_type"]
    assert "UP" == notifications[1]["change_type"]

    logger.debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=120.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "REMOVED_NODE" == notifications[0]["change_type"]
    assert "DOWN" == notifications[1]["change_type"]
def issue_150_test(self):
    self.cluster = Cluster(CLUSTER_PATH, "150", cassandra_version='2.0.9')
    self.cluster.populate([1, 2], use_vnodes=True)
    self.cluster.start()
    dcs = [node.data_center for node in self.cluster.nodelist()]
    dcs.append('dc2')

    node4 = Node('node4', self.cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000), '7400', '2000', None)
    self.cluster.add(node4, False, 'dc2')
    node4.start()

    dcs_2 = [node.data_center for node in self.cluster.nodelist()]
    self.assertItemsEqual(dcs, dcs_2)

    node4.nodetool('status')
def test_decommission_node(self):
    logger.debug("decommission_node_test()")
    cluster = self.cluster
    cluster.populate(1)
    # create and add a new node; it must not be a seed, otherwise
    # we get schema disagreement issues for a while after decommissioning it.
    node2 = Node('node2', cluster, True, ('127.0.0.2', 9160), ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
    cluster.add(node2, False)

    node1, node2 = cluster.nodelist()
    node1.start(wait_for_binary_proto=True)
    node2.start(wait_for_binary_proto=True)
    wait(2)

    session = self.patient_cql_connection(node1)
    self.prepare_for_changes(session)

    node2.decommission()
    wait(30)

    self.validate_schema_consistent(node1)
    self.make_schema_changes(session, namespace='ns1')

    # create and add a new node
    node3 = Node('node3', cluster, True, ('127.0.0.3', 9160), ('127.0.0.3', 7000), '7300', '0', None, binary_interface=('127.0.0.3', 9042))
    cluster.add(node3, True)
    node3.start(wait_for_binary_proto=True)
    wait(30)
    self.validate_schema_consistent(node1)
def bootstrap(node, data_center=None, token=None):
    log.debug('called bootstrap('
              'node={node}, data_center={data_center}, '
              'token={token})'.format(node=node, data_center=data_center, token=token))
    node_instance = Node('node%s' % node,
                         get_cluster(),
                         auto_bootstrap=False,
                         thrift_interface=(IP_FORMAT % node, 9160),
                         storage_interface=(IP_FORMAT % node, 7000),
                         binary_interface=(IP_FORMAT % node, 9042),
                         jmx_port=str(7000 + 100 * node),
                         remote_debug_port=0,
                         initial_token=token if token else node * 10)
    get_cluster().add(node_instance, is_seed=False, data_center=data_center)

    try:
        start(node)
    except Exception as e0:
        log.debug('failed 1st bootstrap attempt with: \n{}'.format(e0))
        # Try only twice
        try:
            start(node)
        except Exception as e1:
            log.debug('failed 2nd bootstrap attempt with: \n{}'.format(e1))
            log.error('Added node failed to start twice.')
            raise e1
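# The nested try/excepts above hard-code exactly two attempts. A sketch of
# the same policy written as a loop, should more attempts ever be wanted
# (start_with_retries and max_attempts are illustrative names, not part of
# the original helper; start and log come from the surrounding module):
def start_with_retries(node, max_attempts=2):
    for attempt in range(1, max_attempts + 1):
        try:
            start(node)
            return
        except Exception as exc:
            log.debug('failed bootstrap attempt {} with: \n{}'.format(attempt, exc))
            if attempt == max_attempts:
                log.error('Added node failed to start {} times.'.format(max_attempts))
                raise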
def run(self):
    try:
        if self.options.dse_node:
            node = DseNode(self.name, self.cluster, self.options.bootstrap,
                           self.thrift, self.storage, self.jmx_port,
                           self.remote_debug_port, self.initial_token,
                           binary_interface=self.binary)
        else:
            node = Node(self.name, self.cluster, self.options.bootstrap,
                        self.thrift, self.storage, self.jmx_port,
                        self.remote_debug_port, self.initial_token,
                        binary_interface=self.binary)
        self.cluster.add(node, self.options.is_seed, self.options.data_center)
    except common.ArgumentError as e:
        print_(str(e), file=sys.stderr)
        exit(1)
def replace_nonexistent_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    debug('Start node 4 and replace an address with no node')
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, ('127.0.0.4', 9042))
    cluster.add(node4, False)

    # try to replace an unassigned ip address
    with self.assertRaises(NodeError):
        try:
            node4.start(replace_address='127.0.0.5', wait_for_binary_proto=True)
        except (NodeError, TimeoutError):
            raise NodeError("Node could not start.")
def load(path, name):
    cluster_path = os.path.join(path, name)
    filename = os.path.join(cluster_path, 'cluster.conf')
    with open(filename, 'r') as f:
        data = yaml.safe_load(f)
    try:
        cassandra_dir = None
        if 'cassandra_dir' in data:
            cassandra_dir = data['cassandra_dir']
            repository.validate(cassandra_dir)

        cluster = Cluster(path, data['name'], cassandra_dir=cassandra_dir, create_directory=False)
        node_list = data['nodes']
        seed_list = data['seeds']
        if 'partitioner' in data:
            cluster.partitioner = data['partitioner']
        if 'config_options' in data:
            cluster._config_options = data['config_options']
        if 'log_level' in data:
            cluster.__log_level = data['log_level']
    except KeyError as k:
        raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

    for node_name in node_list:
        cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
    for seed_name in seed_list:
        cluster.seeds.append(cluster.nodes[seed_name])

    return cluster
def create_node(self, name, auto_bootstrap, thrift_interface, storage_interface,
                jmx_port, remote_debug_port, initial_token, save=True,
                binary_interface=None, byteman_port='0', environment_variables=None,
                derived_cassandra_version=None):
    return Node(name, self, auto_bootstrap, thrift_interface, storage_interface,
                jmx_port, remote_debug_port, initial_token, save, binary_interface,
                byteman_port, environment_variables,
                derived_cassandra_version=derived_cassandra_version)
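# Illustrative direct use of the factory above, given an existing Cluster
# instance named cluster; the argument shapes mirror the Node constructions
# elsewhere in this file (addresses and ports are examples only):
node = cluster.create_node('node1', auto_bootstrap=False,
                           thrift_interface=('127.0.0.1', 9160),
                           storage_interface=('127.0.0.1', 7000),
                           jmx_port='7100', remote_debug_port='0',
                           initial_token=None,
                           binary_interface=('127.0.0.1', 9042))
cluster.add(node, True)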
def load(path, name):
    cluster_path = os.path.join(path, name)
    filename = os.path.join(cluster_path, 'cluster.conf')
    with open(filename, 'r') as f:
        data = yaml.safe_load(f)
    try:
        install_dir = None
        if 'install_dir' in data:
            install_dir = data['install_dir']
            repository.validate(install_dir)
        if install_dir is None and 'cassandra_dir' in data:
            install_dir = data['cassandra_dir']
            repository.validate(install_dir)
        cassandra_version = None
        if 'cassandra_version' in data:
            cassandra_version = LooseVersion(data['cassandra_version'])
        if common.isDse(install_dir):
            cluster = DseCluster(path, data['name'], install_dir=install_dir,
                                 create_directory=False, derived_cassandra_version=cassandra_version)
        else:
            cluster = Cluster(path, data['name'], install_dir=install_dir,
                              create_directory=False, derived_cassandra_version=cassandra_version)
        node_list = data['nodes']
        seed_list = data['seeds']
        if 'partitioner' in data:
            cluster.partitioner = data['partitioner']
        if 'config_options' in data:
            cluster._config_options = data['config_options']
        if 'dse_config_options' in data:
            cluster._dse_config_options = data['dse_config_options']
        if 'misc_config_options' in data:
            cluster._misc_config_options = data['misc_config_options']
        if 'log_level' in data:
            cluster.__log_level = data['log_level']
        if 'use_vnodes' in data:
            cluster.use_vnodes = data['use_vnodes']
        if 'datadirs' in data:
            cluster.data_dir_count = int(data['datadirs'])
        extension.load_from_cluster_config(cluster, data)
    except KeyError as k:
        raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

    for node_name in node_list:
        cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
    for seed in seed_list:
        cluster.seeds.append(seed)

    return cluster
def replace_active_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    # replace active node 3 with node 4
    debug("Starting node 4 to replace active node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160), storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)

    mark = node4.mark_log()
    node4.start(replace_address='127.0.0.3', wait_other_notice=False)
    node4.watch_log_for("java.lang.UnsupportedOperationException: Cannot replace a live node...", from_mark=mark)
    assert_not_running(node4)
def replace_nonexistent_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    debug('Start node 4 and replace an address with no node')
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160), storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)

    # try to replace an unassigned ip address
    mark = node4.mark_log()
    node4.start(replace_address='127.0.0.5', wait_other_notice=False)
    node4.watch_log_for("java.lang.RuntimeException: Cannot replace_address /127.0.0.5 because it doesn't exist in gossip", from_mark=mark)
    assert_not_running(node4)
def test_replace(self):
    main_session = self.patient_cql_connection(self.node1)

    # We want the node being replaced to have no data on it
    # so the replacement definitely fetches all the data
    self.node2.stop(wait_other_notice=True)

    for i in range(0, 40):
        self.insert_row(i, i, i, main_session)

    replacement_address = self.node2.address()
    self.node2.stop(wait_other_notice=True)
    self.cluster.remove(self.node2)

    self.node2 = Node('replacement', cluster=self.cluster, auto_bootstrap=True,
                      thrift_interface=None, storage_interface=(replacement_address, 7000),
                      jmx_port='7400', remote_debug_port='0', initial_token=None,
                      binary_interface=(replacement_address, 9042))
    patch_start(self.node2)
    nodes = [self.node1, self.node2, self.node3]
    self.cluster.add(self.node2, False, data_center='datacenter1')
    jvm_args = ["-Dcassandra.replace_address=%s" % replacement_address,
                "-Dcassandra.ring_delay_ms=10000",
                "-Dcassandra.broadcast_interval_ms=10000"]
    self.node2.start(jvm_args=jvm_args, wait_for_binary_proto=True)

    sessions = [self.exclusive_cql_connection(node) for node in [self.node1, self.node2, self.node3]]

    self._everyone_should_have_everything(sessions)

    repair_nodes(nodes)
    cleanup_nodes(nodes)

    self._nodes_have_proper_ranges_after_repair_and_cleanup(sessions)
def replace_active_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    # replace active node 3 with node 4
    debug("Starting node 4 to replace active node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, ('127.0.0.4', 9042))
    cluster.add(node4, False)

    with self.assertRaises(NodeError):
        try:
            node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)
        except (NodeError, TimeoutError):
            raise NodeError("Node could not start.")

    checkError = node4.grep_log("java.lang.UnsupportedOperationException: Cannot replace a live node...")
    self.assertEqual(len(checkError), 1)
def replace_with_insufficient_replicas_test(self):
    """
    Test that replace fails when there are insufficient replicas
    @jira_ticket CASSANDRA-11848
    """
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    if DISABLE_VNODES:
        num_tokens = 1
    else:
        # a little hacky but grep_log returns the whole line...
        num_tokens = int(node3.get_conf_option('num_tokens'))

    debug("testing with num_tokens: {}".format(num_tokens))

    debug("Inserting Data...")
    node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=2)'])

    # stop node to replace
    debug("Stopping node to replace.")
    node3.stop(wait_other_notice=True)

    # stop other replica
    debug("Stopping node2 (other replica)")
    node2.stop(wait_other_notice=True)

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster=cluster, auto_bootstrap=True,
                 thrift_interface=('127.0.0.4', 9160), storage_interface=('127.0.0.4', 7000),
                 jmx_port='7400', remote_debug_port='0', initial_token=None,
                 binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    node4.start(replace_address='127.0.0.3', wait_for_binary_proto=False, wait_other_notice=False)

    # replace should fail due to insufficient replicas
    node4.watch_log_for("Unable to find sufficient sources for streaming range")
    assert_not_running(node4)
def new_node(cluster, bootstrap=True, token=None, remote_debug_port='2000'):
    i = len(cluster.nodes) + 1
    node = Node('node%s' % i,
                cluster,
                bootstrap,
                ('127.0.0.%s' % i, 9160),
                ('127.0.0.%s' % i, 7000),
                str(7000 + i * 100),
                remote_debug_port,
                token)
    cluster.add(node, not bootstrap)
    return node
def populate(self, nodes, debug=False, tokens=None, use_vnodes=False, ipprefix='127.0.0.'):
    node_count = nodes
    dcs = []
    if isinstance(nodes, list):
        self.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.PropertyFileSnitch'})
        node_count = 0
        i = 0
        for c in nodes:
            i = i + 1
            node_count = node_count + c
            for x in xrange(0, c):
                dcs.append('dc%d' % i)

    if node_count < 1:
        raise common.ArgumentError('invalid node count %s' % nodes)

    for i in xrange(1, node_count + 1):
        # check node names against the dict keys (checking against the
        # Node values could never match a name string)
        if 'node%s' % i in self.nodes:
            raise common.ArgumentError('Cannot create existing node node%s' % i)

    if tokens is None and not use_vnodes:
        tokens = self.balanced_tokens(node_count)

    for i in xrange(1, node_count + 1):
        tk = None
        if tokens is not None and i - 1 < len(tokens):
            tk = tokens[i - 1]
        dc = dcs[i - 1] if i - 1 < len(dcs) else None

        binary = None
        if self.version() >= '1.2':
            binary = ('%s%s' % (ipprefix, i), 9042)
        node = Node('node%s' % i,
                    self,
                    False,
                    ('%s%s' % (ipprefix, i), 9160),
                    ('%s%s' % (ipprefix, i), 7000),
                    str(7000 + i * 100),
                    str(2000 + i * 100) if debug else str(0),
                    tk,
                    binary_interface=binary)
        self.add(node, True, dc)

    self.__update_config()
    return self
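# Illustrative populate() calls, given a fresh Cluster instance named
# cluster: an int builds one datacenter, a list builds one datacenter per
# element (named dc1, dc2, ... as above). Use one or the other, since node
# names would otherwise collide:
cluster.populate([2, 3])  # five nodes: node1-node2 in dc1, node3-node5 in dc2
# cluster.populate(3, debug=True)  # one DC, remote debug ports 2100/2200/2300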
def _init_new_loading_node(self, ks_name, create_stmt, use_thrift=False):
    loading_node = Node(
        name='node2',
        cluster=self.cluster,
        auto_bootstrap=False,
        thrift_interface=('127.0.0.2', 9160) if use_thrift else None,
        storage_interface=('127.0.0.2', 7000),
        jmx_port='7400',
        remote_debug_port='0',
        initial_token=None,
        binary_interface=('127.0.0.2', 9042)
    )
    logger.debug('adding node')
    self.cluster.add(loading_node, is_seed=True)
    logger.debug('starting new node')
    loading_node.start(wait_for_binary_proto=True)
    logger.debug('recreating ks and table')
    loading_session = self.patient_exclusive_cql_connection(loading_node)
    create_ks(loading_session, ks_name, rf=1)
    logger.debug('creating new table')
    loading_session.execute(create_stmt)
    logger.debug('stopping new node')
    loading_session.cluster.shutdown()
    loading_node.stop()
    return loading_node
def new_node(cluster, bootstrap=True, token=None, remote_debug_port='0', data_center=None):
    i = len(cluster.nodes) + 1
    node = Node('node%s' % i,
                cluster,
                bootstrap,
                ('127.0.0.%s' % i, 9160),
                ('127.0.0.%s' % i, 7000),
                str(7000 + i * 100),
                remote_debug_port,
                token,
                binary_interface=('127.0.0.%s' % i, 9042))
    cluster.add(node, not bootstrap, data_center=data_center)
    return node
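# Typical use of the helper above, assuming a running Cluster named
# cluster: grow it by one bootstrapping node and start it.
node = new_node(cluster)
node.start(wait_for_binary_proto=True)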
def create_node(self, name, auto_bootstrap, thrift_interface, storage_interface,
                jmx_port, remote_debug_port, initial_token, save=True, binary_interface=None):
    return Node(name, self, auto_bootstrap, thrift_interface, storage_interface,
                jmx_port, remote_debug_port, initial_token, save, binary_interface)
def load(path, name):
    cluster_path = os.path.join(path, name)
    filename = os.path.join(cluster_path, 'cluster.conf')
    with open(filename, 'r') as f:
        data = yaml.safe_load(f)
    try:
        install_dir = None
        if 'install_dir' in data:
            install_dir = data['install_dir']
            repository.validate(install_dir)
        if install_dir is None and 'cassandra_dir' in data:
            install_dir = data['cassandra_dir']
            repository.validate(install_dir)
        if common.isScylla(install_dir):
            cluster = ScyllaCluster(path, data['name'], install_dir=install_dir, create_directory=False)
        elif common.isDse(install_dir):
            cluster = DseCluster(path, data['name'], install_dir=install_dir, create_directory=False)
        else:
            cluster = Cluster(path, data['name'], install_dir=install_dir, create_directory=False)
        node_list = data['nodes']
        seed_list = data['seeds']
        if 'partitioner' in data:
            cluster.partitioner = data['partitioner']
        if 'config_options' in data:
            cluster._config_options = data['config_options']
        if 'log_level' in data:
            cluster.__log_level = data['log_level']
        if 'use_vnodes' in data:
            cluster.use_vnodes = data['use_vnodes']
    except KeyError as k:
        raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

    for node_name in node_list:
        cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
    for seed_name in seed_list:
        cluster.seeds.append(cluster.nodes[seed_name])

    return cluster
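# A rough sketch of the parsed cluster.conf contents this loader consumes,
# shown as the resulting dict (keys are the ones read above; the values are
# illustrative only, not taken from any real cluster):
example_data = {
    'name': 'test',
    'install_dir': '/path/to/cassandra',
    'nodes': ['node1', 'node2'],
    'seeds': ['node1'],
    'partitioner': 'org.apache.cassandra.dht.Murmur3Partitioner',
    'config_options': {'num_tokens': 256},
    'use_vnodes': True,
}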
def decommission_node_schema_check_test(self):
    cluster = self.cluster
    cluster.populate(1)
    # create and add a non-seed node.
    node2 = Node('node2', cluster, True, ('127.0.0.2', 9160), ('127.0.0.2', 7000), '7200', None)
    cluster.add(node2, False)

    node1, node2 = cluster.nodelist()
    node1.start()
    node2.start()
    time.sleep(2)

    node2.decommission()
    time.sleep(30)

    self.validate_schema_consistent(node1)
def add_and_remove_node_test(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, ('127.0.0.2', 9160), ('127.0.0.2', 7000), '7200', '0', None, ('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEquals(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEquals(self.get_ip_from_node(node2), notification["address"][0])
    self.assertEquals("NEW_NODE", notifications[0]["change_type"])
    self.assertEquals("UP", notifications[1]["change_type"])

    debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEquals(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEquals(self.get_ip_from_node(node2), notification["address"][0])
    self.assertEquals("REMOVED_NODE", notifications[0]["change_type"])
    self.assertEquals("DOWN", notifications[1]["change_type"])
def replace_nonexistent_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    debug('Start node 4 and replace an address with no node')
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)

    # try to replace an unassigned ip address
    mark = node4.mark_log()
    node4.start(replace_address='127.0.0.5', wait_other_notice=False)
    node4.watch_log_for("java.lang.RuntimeException: Cannot replace_address /127.0.0.5 because it doesn't exist in gossip", from_mark=mark)
    self.check_not_running(node4)
def test_add_and_remove_node(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    logger.debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    session = self.patient_cql_connection(node1)
    # reduce system_distributed RF to 1 so we don't require forceful decommission
    session.execute("ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")
    session.execute("ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};")

    logger.debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "NEW_NODE" == notifications[0]["change_type"]
    assert "UP" == notifications[1]["change_type"]

    logger.debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    logger.debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    assert 2 == len(notifications), notifications
    for notification in notifications:
        assert get_ip_from_node(node2) == notification["address"][0]
    assert "REMOVED_NODE" == notifications[0]["change_type"]
    assert "DOWN" == notifications[1]["change_type"]
def replace_active_node_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    # replace active node 3 with node 4
    debug("Starting node 4 to replace active node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)

    mark = node4.mark_log()
    node4.start(replace_address='127.0.0.3', wait_other_notice=False)
    node4.watch_log_for("java.lang.UnsupportedOperationException: Cannot replace a live node...", from_mark=mark)
    self.check_not_running(node4)
def load(path, name):
    cluster_path = os.path.join(path, name)
    filename = os.path.join(cluster_path, "cluster.conf")
    with open(filename, "r") as f:
        data = yaml.safe_load(f)
    try:
        install_dir = None
        if "install_dir" in data:
            install_dir = data["install_dir"]
            repository.validate(install_dir)
        if install_dir is None and "cassandra_dir" in data:
            install_dir = data["cassandra_dir"]
            repository.validate(install_dir)
        if common.isDse(install_dir):
            cluster = DseCluster(path, data["name"], install_dir=install_dir, create_directory=False)
        else:
            cluster = Cluster(path, data["name"], install_dir=install_dir, create_directory=False)
        node_list = data["nodes"]
        seed_list = data["seeds"]
        if "partitioner" in data:
            cluster.partitioner = data["partitioner"]
        if "config_options" in data:
            cluster._config_options = data["config_options"]
        if "log_level" in data:
            cluster.__log_level = data["log_level"]
        if "use_vnodes" in data:
            cluster.use_vnodes = data["use_vnodes"]
    except KeyError as k:
        raise common.LoadError("Error Loading " + filename + ", missing property:" + str(k))

    for node_name in node_list:
        cluster.nodes[node_name] = Node.load(cluster_path, node_name, cluster)
    for seed_name in seed_list:
        cluster.seeds.append(cluster.nodes[seed_name])

    return cluster
def add_and_remove_node_test(self):
    """
    Test that NEW_NODE and REMOVED_NODE are sent correctly as nodes join and leave.
    @jira_ticket CASSANDRA-11038
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1 = self.cluster.nodelist()[0]
    waiter = NotificationWaiter(self, node1, ["STATUS_CHANGE", "TOPOLOGY_CHANGE"])

    # need to block for up to 2 notifications (NEW_NODE and UP) so that these notifications
    # don't confuse the state below
    debug("Waiting for unwanted notifications...")
    waiter.wait_for_notifications(timeout=30, num_notifications=2)
    waiter.clear_notifications()

    debug("Adding second node...")
    node2 = Node('node2', self.cluster, True, None, ('127.0.0.2', 7000), '7200', '0', None, binary_interface=('127.0.0.2', 9042))
    self.cluster.add(node2, False)
    node2.start(wait_other_notice=True)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEquals(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEquals(get_ip_from_node(node2), notification["address"][0])
    self.assertEquals("NEW_NODE", notifications[0]["change_type"])
    self.assertEquals("UP", notifications[1]["change_type"])

    debug("Removing second node...")
    waiter.clear_notifications()
    node2.decommission()
    node2.stop(gently=False)
    debug("Waiting for notifications from {}".format(waiter.address))
    notifications = waiter.wait_for_notifications(timeout=60.0, num_notifications=2)
    self.assertEquals(2, len(notifications), notifications)
    for notification in notifications:
        self.assertEquals(get_ip_from_node(node2), notification["address"][0])
    self.assertEquals("REMOVED_NODE", notifications[0]["change_type"])
    self.assertEquals("DOWN", notifications[1]["change_type"])
def _replace_node_test(self, gently):
    """
    Check that the replace address function correctly replaces a node that has failed in a cluster.
    Create a cluster, cause a node to fail, and bring up a new node with the replace_address parameter.
    Check that tokens are migrated and that data is replicated properly.
    """
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    if DISABLE_VNODES:
        numNodes = 1
    else:
        # a little hacky but grep_log returns the whole line...
        numNodes = int(node3.get_conf_option('num_tokens'))

    debug(numNodes)

    debug("Inserting Data...")
    node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    session.default_timeout = 45
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    # stop node, query should not work with consistency 3
    debug("Stopping node 3.")
    node3.stop(gently=gently, wait_other_notice=True)

    debug("Testing node stoppage (query should fail).")
    with self.assertRaises(NodeUnavailable):
        try:
            query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
            session.execute(query)
        except (Unavailable, ReadTimeout):
            raise NodeUnavailable("Node could not be queried.")

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    node4.start(replace_address='127.0.0.3', wait_for_binary_proto=True)

    # query should work again
    debug("Verifying querying works again.")
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)

    debug("Verifying tokens migrated successfully")
    movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug(movedTokensList[0])
    self.assertEqual(len(movedTokensList), numNodes)

    # check that restarting node 3 doesn't work
    debug("Try to restart node 3 (should fail)")
    node3.start(wait_other_notice=False)
    checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
    debug(checkCollision)
    self.assertEqual(len(checkCollision), 1)
def replace_with_reset_resume_state_test(self):
    """Test replace with resetting bootstrap progress"""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    node3.stop(gently=False)

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))

    # keep timeout low so that test won't hang
    node4.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    cluster.add(node4, False)
    try:
        node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    node1.start()

    # restart node4 bootstrap with resetting bootstrap state
    node4.stop()
    mark = node4.mark_log()
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3",
                          "-Dcassandra.reset_bootstrap_progress=true"])
    # check if we reset bootstrap state
    node4.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
    # wait for node4 ready to query
    node4.watch_log_for("Listening for thrift clients...", from_mark=mark)

    # check if 2nd bootstrap succeeded
    session = self.exclusive_cql_connection(node4)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'COMPLETED', rows[0][0]

    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)
def replace_first_boot_test(self):
    debug("Starting cluster with 3 nodes.")
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    if DISABLE_VNODES:
        numNodes = 1
    else:
        # a little hacky but grep_log returns the whole line...
        numNodes = int(node3.get_conf_option('num_tokens'))

    debug(numNodes)

    debug("Inserting Data...")
    node1.stress(['write', 'n=10K', '-schema', 'replication(factor=3)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table, consistency_level=ConsistencyLevel.THREE)
    initialData = list(session.execute(query))

    # stop node, query should not work with consistency 3
    debug("Stopping node 3.")
    node3.stop(gently=False)

    debug("Testing node stoppage (query should fail).")
    with self.assertRaises(NodeUnavailable):
        try:
            session.execute(query, timeout=30)
        except (Unavailable, ReadTimeout):
            raise NodeUnavailable("Node could not be queried.")

    # replace node 3 with node 4
    debug("Starting node 4 to replace node 3")
    node4 = Node('node4', cluster, True, ('127.0.0.4', 9160), ('127.0.0.4', 7000),
                 '7400', '0', None, binary_interface=('127.0.0.4', 9042))
    cluster.add(node4, False)
    node4.start(jvm_args=["-Dcassandra.replace_address_first_boot=127.0.0.3"], wait_for_binary_proto=True)

    # query should work again
    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)

    debug("Verifying tokens migrated successfully")
    movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug(movedTokensList[0])
    self.assertEqual(len(movedTokensList), numNodes)

    # check that restarting node 3 doesn't work
    debug("Try to restart node 3 (should fail)")
    node3.start(wait_other_notice=False)
    checkCollision = node1.grep_log("between /127.0.0.3 and /127.0.0.4; /127.0.0.4 is the new owner")
    debug(checkCollision)
    self.assertEqual(len(checkCollision), 1)

    # restart node4 (if it errors, num_tokens may need to change)
    node4.stop(gently=False)
    node4.start(wait_for_binary_proto=True, wait_other_notice=False)

    debug("Verifying querying works again.")
    finalData = list(session.execute(query))
    self.assertListEqual(initialData, finalData)

    # we redo this check because restarting the node should not result in the
    # tokens being moved again, i.e. the number should be the same
    debug("Verifying tokens migrated successfully")
    movedTokensList = node4.grep_log("Token .* changing ownership from /127.0.0.3 to /127.0.0.4")
    debug(movedTokensList[0])
    self.assertEqual(len(movedTokensList), numNodes)