def simultaneous_bootstrap_test(self):
    """
    Attempt to bootstrap two nodes at once, to assert the second
    bootstrapped node fails, and does not interfere.

    Start a one node cluster and run a stress write workload.
    Start up a second node, and wait for the first node to detect
    it has joined the cluster.
    While the second node is bootstrapping, start a third node.
    This should fail.

    @jira_ticket CASSANDRA-7069
    @jira_ticket CASSANDRA-9484
    """
    bootstrap_error = ("Other bootstrapping/leaving/moving nodes detected,"
                       " cannot bootstrap while cassandra.consistent.rangemovement is true")

    self.ignore_log_patterns.append(bootstrap_error)

    cluster = self.cluster
    cluster.populate(1)
    cluster.start(wait_for_binary_proto=True)

    node1, = cluster.nodelist()

    node1.stress(['write', 'n=500K', '-schema', 'replication(factor=1)',
                  '-rate', 'threads=10'])

    node2 = new_node(cluster)
    node2.start(wait_other_notice=True)

    node3 = new_node(cluster, remote_debug_port='2003')
    process = node3.start(wait_other_notice=False)
    stdout, stderr = process.communicate()
    self.assertIn(bootstrap_error, stderr, msg=stderr)
    time.sleep(.5)
    self.assertFalse(node3.is_running(), msg="Two nodes bootstrapped simultaneously")

    node2.watch_log_for("Starting listening for CQL clients")
    session = self.patient_exclusive_cql_connection(node2)

    # Repeat the select count(*) query, to help catch
    # bugs like 9484, where count(*) fails at higher
    # data loads.
    for _ in xrange(5):
        assert_one(session, "SELECT count(*) from keyspace1.standard1", [500000], cl=ConsistencyLevel.ONE)
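# The tests in this file rely on a module-level new_node() helper that is not shown
# here. Below is a minimal sketch, assuming the ccm library (ccmlib) that these
# dtests use to manage clusters; the exact interface/port layout passed to the Node
# constructor is an assumption inferred from how new_node() is called in these tests,
# not a verbatim copy of the real helper.
def new_node(cluster, bootstrap=True, token=None, remote_debug_port='0', data_center=None):
    from ccmlib.node import Node  # assumption: ccmlib manages the test cluster

    i = len(cluster.nodes) + 1
    node = Node('node%s' % i,
                cluster,
                bootstrap,                           # auto_bootstrap flag
                ('127.0.0.%s' % i, 9160),            # thrift interface (assumed layout)
                ('127.0.0.%s' % i, 7000),            # storage (gossip) interface
                str(7000 + i * 100),                 # jmx port
                remote_debug_port,
                token,
                binary_interface=('127.0.0.%s' % i, 9042))
    # bootstrap=True means "not a seed": only non-bootstrapping nodes are registered as seeds
    cluster.add(node, not bootstrap, data_center=data_center)
    return node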
def decommissioned_wiped_node_can_gossip_to_single_seed_test(self):
    """
    @jira_ticket CASSANDRA-8072
    @jira_ticket CASSANDRA-8422
    Test that if we decommission a node, kill it and wipe its data, it can join
    a cluster with a single seed node.
    """
    cluster = self.cluster
    cluster.populate(1)
    cluster.start(wait_for_binary_proto=True)

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)
    node2.start(wait_for_binary_proto=True, wait_other_notice=True)

    # Decommission the new node and kill it
    debug("Decommissioning & stopping node2")
    node2.decommission()
    node2.stop(wait_other_notice=False)

    # Wipe its data
    for data_dir in node2.data_directories():
        debug("Deleting {}".format(data_dir))
        shutil.rmtree(data_dir)

    commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
    debug("Deleting {}".format(commitlog_dir))
    shutil.rmtree(commitlog_dir)

    # Now start it, it should be allowed to join
    mark = node2.mark_log()
    debug("Restarting wiped node2")
    node2.start(wait_other_notice=False)
    node2.watch_log_for("JOINING:", from_mark=mark)
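# Several of the tests below call self._cleanup(node) to wipe a stopped node. That
# helper is not included in this excerpt; a minimal sketch, mirroring the inline wipe
# performed in decommissioned_wiped_node_can_gossip_to_single_seed_test above and
# assuming the same module-level os/shutil/debug imports used by these tests:
def _cleanup(self, node):
    commitlog_dir = os.path.join(node.get_path(), 'commitlogs')
    # remove every configured data directory plus the commitlog directory
    for data_dir in node.data_directories():
        debug("Deleting {}".format(data_dir))
        shutil.rmtree(data_dir)
    debug("Deleting {}".format(commitlog_dir))
    shutil.rmtree(commitlog_dir)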
def disk_balance_bootstrap_test(self):
    cluster = self.cluster
    # apparently we have legitimate errors in the log when bootstrapping (see bootstrap_test.py)
    self.allow_log_errors = True
    cluster.set_configuration_options(values={'allocate_tokens_for_keyspace': 'keyspace1'})
    cluster.populate(4).start(wait_for_binary_proto=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=10k', '-rate', 'threads=100', '-schema', 'replication(factor=3)'])
    cluster.flush()

    node5 = new_node(cluster)
    node5.start(wait_for_binary_proto=True)
    self.assert_balanced(node5)

    cluster.cleanup()
    self.assert_balanced(node5)

    if DISABLE_VNODES:
        for node in cluster.nodelist():
            node.nodetool('relocatesstables')
        self.assertTrue(len(node5.grep_log("No sstables to RELOCATE for keyspace1.standard1")) > 0)
    for node in cluster.nodelist():
        self.assert_balanced(node)
def simple_bootstrap_test(self):
    cluster = self.cluster
    tokens = cluster.balanced_tokens(2)
    cluster.set_configuration_options(values={'num_tokens': 1})

    debug("[node1, node2] tokens: %r" % (tokens,))

    keys = 10000

    # Create a single node cluster
    cluster.populate(1)
    node1 = cluster.nodelist()[0]
    node1.set_configuration_options(values={'initial_token': tokens[0]})
    cluster.start(wait_other_notice=True)

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    # record the size before inserting any of our own data
    empty_size = node1.data_size()
    debug("node1 empty size : %s" % float(empty_size))

    insert_statement = session.prepare("INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
    execute_concurrent_with_args(session, insert_statement, [['k%d' % k] for k in range(keys)])

    node1.flush()
    node1.compact()
    initial_size = node1.data_size()
    debug("node1 size before bootstrapping node2: %s" % float(initial_size))

    # Reads inserted data all during the bootstrap process. We shouldn't
    # get any error
    reader = self.go(lambda _: query_c1c2(session, random.randint(0, keys - 1), ConsistencyLevel.ONE))

    # Bootstrapping a new node
    node2 = new_node(cluster)
    node2.set_configuration_options(values={'initial_token': tokens[1]})
    node2.start(wait_for_binary_proto=True)
    node2.compact()

    reader.check()
    node1.cleanup()
    debug("node1 size after cleanup: %s" % float(node1.data_size()))
    node1.compact()
    debug("node1 size after compacting: %s" % float(node1.data_size()))
    time.sleep(.5)
    reader.check()

    debug("node2 size after compacting: %s" % float(node2.data_size()))

    size1 = float(node1.data_size())
    size2 = float(node2.data_size())

    assert_almost_equal(size1, size2, error=0.3)
    assert_almost_equal(float(initial_size - empty_size), 2 * (size1 - float(empty_size)))
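# assert_almost_equal() is used above and in decommission_test below to compare
# on-disk sizes with a relative tolerance. It comes from the shared dtest assertion
# helpers; a sketch of the behaviour these tests assume (the 'error' keyword is the
# allowed relative deviation from the largest value, and the default tolerance shown
# here is an assumption):
def assert_almost_equal(*args, **kwargs):
    error = kwargs.get('error', 0.16)  # assumed default tolerance
    vmax = max(args)
    vmin = min(args)
    assert vmin > vmax * (1.0 - error) or vmin == vmax, \
        "values not within {:.2f}% of the max: {}".format(error * 100, args)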
def read_from_bootstrapped_node_test(self):
    """Test bootstrapped node sees existing data, e.g. CASSANDRA-6648"""
    cluster = self.cluster
    cluster.populate(3)
    version = cluster.version()
    cluster.start()

    node1 = cluster.nodes['node1']
    if version < "2.1":
        node1.stress(['-n', '10000'])
    else:
        node1.stress(['write', 'n=10000', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
    original_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))

    node4 = new_node(cluster)
    node4.start(wait_for_binary_proto=True)

    session = self.patient_exclusive_cql_connection(node4)
    new_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))
    self.assertEquals(original_rows, new_rows)
def read_from_bootstrapped_node_test(self):
    """
    Test bootstrapped node sees existing data
    @jira_ticket CASSANDRA-6648
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start()

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor=2)'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    original_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))

    node4 = new_node(cluster)
    node4.start(wait_for_binary_proto=True)

    session = self.patient_exclusive_cql_connection(node4)
    new_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))
    self.assertEquals(original_rows, new_rows)
def manual_bootstrap_test(self):
    """
    Test adding a new node and bootstrapping it manually. No auto_bootstrap.
    This test also verifies that all data is OK after the addition of the new node.
    @jira_ticket CASSANDRA-9022
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    (node1, node2) = cluster.nodelist()

    node1.stress(['write', 'n=1K', '-schema', 'replication(factor=2)',
                  '-rate', 'threads=1', '-pop', 'dist=UNIFORM(1..1000)'])

    session = self.patient_exclusive_cql_connection(node2)
    stress_table = 'keyspace1.standard1'

    original_rows = list(session.execute("SELECT * FROM %s" % stress_table))

    # Add a new node
    node3 = new_node(cluster, bootstrap=False)
    node3.start(wait_for_binary_proto=True)
    node3.repair()
    node1.cleanup()

    current_rows = list(session.execute("SELECT * FROM %s" % stress_table))
    self.assertEquals(original_rows, current_rows)
def decommissioned_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we decommission a node and then wipe its data, it can join the cluster.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    stress_table = 'keyspace1.standard1'

    # write some data
    node1 = cluster.nodelist()[0]
    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node4 = new_node(cluster, bootstrap=True)
    node4.start(wait_for_binary_proto=True, wait_other_notice=True)

    session = self.patient_cql_connection(node4)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Decommission the new node and wipe its data
    node4.decommission()
    node4.stop()
    self._cleanup(node4)
    # Now start it, it should be allowed to join
    mark = node4.mark_log()
    node4.start(wait_other_notice=True)
    node4.watch_log_for("JOINING:", from_mark=mark)
def decommissioned_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we decommission a node and then wipe its data, it can join the cluster.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    stress_table = 'keyspace1.standard1'

    # write some data
    node1 = cluster.nodelist()[0]
    node1.stress(['write', 'n=10K', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node4 = new_node(cluster, bootstrap=True)
    node4.start(wait_for_binary_proto=True, wait_other_notice=True)

    session = self.patient_cql_connection(node4)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Decommission the new node and wipe its data
    node4.decommission()
    node4.stop()
    self._cleanup(node4)
    # Now start it, it should be allowed to join
    mark = node4.mark_log()
    node4.start(wait_other_notice=True)
    node4.watch_log_for("JOINING:", from_mark=mark)
def _do_upgrade(self, login_keyspace=True):
    cluster = self.cluster
    node1 = cluster.nodelist()[0]

    node1.flush()
    time.sleep(.5)
    node1.stop(wait_other_notice=True)

    node1.set_install_dir(install_dir=self.default_install_dir)
    node1.start(wait_other_notice=True, wait_for_binary_proto=True)

    if self.bootstrap:
        cluster.set_install_dir(install_dir=self.default_install_dir)
        # Add a new node, bootstrap=True ensures that it is not a seed
        node2 = new_node(cluster, bootstrap=True)
        node2.start(wait_for_binary_proto=True, jvm_args=self.jvm_args)

        temp_files = self.glob_data_dirs(os.path.join('*', "tmp", "*.dat"))
        debug("temp files: " + str(temp_files))
        self.assertEquals(0, len(temp_files), "Temporary files were not cleaned up.")

    cursor = self.patient_cql_connection(node1)
    if login_keyspace:
        cursor.execute('USE ks')
    return cursor
def _wiped_node_cannot_join_test(self, gently):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we stop a node and wipe its data then the node cannot join
    when it is not a seed. Test both a clean shutdown and a forced shutdown,
    via the gently parameter.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    stress_table = 'keyspace1.standard1'

    # write some data
    node1 = cluster.nodelist()[0]
    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node4 = new_node(cluster, bootstrap=True)
    node4.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node4)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Stop the new node and wipe its data
    node4.stop(gently=gently)
    self._cleanup(node4)
    # Now start it, it should not be allowed to join.
    mark = node4.mark_log()
    node4.start(no_wait=True, wait_other_notice=False)
    node4.watch_log_for("A node with address /127.0.0.4 already exists, cancelling join", from_mark=mark)
def add_node_after_mv_test(self):
    """Test that materialized views work as expected when adding a node."""

    session = self.prepare()

    session.execute("CREATE TABLE t (id int PRIMARY KEY, v int)")
    session.execute(("CREATE MATERIALIZED VIEW t_by_v AS SELECT * FROM t "
                     "WHERE v IS NOT NULL AND id IS NOT NULL PRIMARY KEY (v, id)"))

    for i in xrange(1000):
        session.execute("INSERT INTO t (id, v) VALUES ({v}, {v})".format(v=i))

    for i in xrange(1000):
        assert_one(session, "SELECT * FROM t_by_v WHERE v = {}".format(i), [i, i])

    node4 = new_node(self.cluster)
    node4.start(wait_for_binary_proto=True)

    session2 = self.patient_exclusive_cql_connection(node4)

    for i in xrange(1000):
        assert_one(session2, "SELECT * FROM ks.t_by_v WHERE v = {}".format(i), [i, i])

    for i in xrange(1000, 1100):
        session.execute("INSERT INTO t (id, v) VALUES ({v}, {v})".format(v=i))

    for i in xrange(1000, 1100):
        assert_one(session, "SELECT * FROM t_by_v WHERE v = {}".format(i), [i, i])
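# assert_one() is used by the tests above and below but defined in the shared dtest
# assertion helpers, not in this file. A minimal sketch of the behaviour assumed here
# (run the query at the given consistency level and expect exactly one row equal to
# 'expected'); the exact error message in the real helper may differ:
from cassandra.query import SimpleStatement


def assert_one(session, query, expected, cl=None):
    statement = SimpleStatement(query, consistency_level=cl)
    rows = list(session.execute(statement))
    assert rows == [expected], \
        "Expected {} from '{}', but got {}".format([expected], query, rows)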
def local_quorum_bootstrap_test(self):
    """
    Test that CL local_quorum works while a node is bootstrapping.
    @jira_ticket CASSANDRA-8058
    """
    cluster = self.cluster
    cluster.populate([1, 1])
    cluster.start()

    node1 = cluster.nodes['node1']
    yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
    with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
        stress_config.write(yaml_config)
        stress_config.flush()
        node1.stress(['user', 'profile=' + stress_config.name, 'n=2M', 'no-warmup',
                      'ops(insert=1)', '-rate', 'threads=50'])

        node3 = new_node(cluster, data_center='dc2')
        node3.start(no_wait=True)
        time.sleep(3)

        with tempfile.TemporaryFile(mode='w+') as tmpfile:
            node1.stress(['user', 'profile=' + stress_config.name, 'ops(insert=1)',
                          'n=500K', 'no-warmup', 'cl=LOCAL_QUORUM',
                          '-rate', 'threads=5', '-errors', 'retries=2'],
                         stdout=tmpfile, stderr=subprocess.STDOUT)
            tmpfile.seek(0)
            output = tmpfile.read()

    debug(output)
    regex = re.compile("Operation.+error inserting key.+Exception")
    failure = regex.search(output)
    self.assertIsNone(failure, "Error during stress while bootstrapping")
def test_cleanup(self):
    """
    @jira_ticket CASSANDRA-11179
    Make sure we remove processed files during cleanup
    """
    cluster = self.cluster
    cluster.set_configuration_options(values={'concurrent_compactors': 4})
    cluster.populate(1)
    cluster.start(wait_for_binary_proto=True)
    node1, = cluster.nodelist()
    for x in xrange(0, 5):
        node1.stress(['write', 'n=100k', '-schema',
                      'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)',
                      'replication(factor=1)', '-rate', 'threads=10'])
        node1.flush()
    node2 = new_node(cluster)
    node2.start(wait_for_binary_proto=True, wait_other_notice=True)
    event = threading.Event()
    failed = threading.Event()
    jobs = 1
    thread = threading.Thread(target=self._monitor_datadir,
                              args=(node1, event, len(node1.get_sstables("keyspace1", "standard1")), jobs, failed))
    thread.start()
    node1.nodetool("cleanup -j {} keyspace1 standard1".format(jobs))
    event.set()
    thread.join()
    self.assertFalse(failed.is_set())
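# test_cleanup above passes self._monitor_datadir as a thread target; the helper is
# not included in this excerpt. The sketch below is an assumption about its intent,
# reconstructed from the arguments it is given: while cleanup runs with -j <jobs>,
# fail if more than <jobs> extra sstables accumulate over the starting count, i.e.
# processed originals are not being removed promptly (the point of CASSANDRA-11179).
def _monitor_datadir(self, node, event, basecount, jobs, failed):
    while not event.is_set():
        sstables = len(node.get_sstables("keyspace1", "standard1"))
        debug("monitoring: {} sstables (base {}, jobs {})".format(sstables, basecount, jobs))
        if sstables > basecount + jobs:
            # cleanup is keeping too many files around; flag the failure for the main thread
            failed.set()
            return
        time.sleep(0.1)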
def _wiped_node_cannot_join_test(self, gently):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we stop a node and wipe its data then the node cannot join
    when it is not a seed. Test both a clean shutdown and a forced shutdown,
    via the gently parameter.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    stress_table = 'keyspace1.standard1'

    # write some data
    node1 = cluster.nodelist()[0]
    node1.stress(['write', 'n=10K', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node4 = new_node(cluster, bootstrap=True)
    node4.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node4)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Stop the new node and wipe its data
    node4.stop(gently=gently)
    self._cleanup(node4)
    # Now start it, it should not be allowed to join.
    mark = node4.mark_log()
    node4.start(no_wait=True, wait_other_notice=False)
    node4.watch_log_for("A node with address /127.0.0.4 already exists, cancelling join", from_mark=mark)
def bootstrap_with_reset_bootstrap_state_test(self):
    """Test bootstrap with resetting bootstrap progress"""
    cluster = self.cluster
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.populate(2).start(wait_other_notice=True)

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', '-schema', 'replication(factor=2)'])
    node1.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    try:
        node3.start()
    except NodeError:
        pass  # node doesn't start as expected
    t.join()
    node1.start()

    # restart node3 bootstrap with resetting bootstrap progress
    node3.stop()
    mark = node3.mark_log()
    node3.start(jvm_args=["-Dcassandra.reset_bootstrap_progress=true"])

    # check if we reset bootstrap state
    node3.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)

    # wait for node3 ready to query
    node3.watch_log_for("Listening for thrift clients...", from_mark=mark)

    # check if 2nd bootstrap succeeded
    assert_bootstrap_state(self, node3, 'COMPLETED')
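# InterruptBootstrap (used above and in the resumable_bootstrap_test variants) is a
# helper thread that is not defined in this excerpt. A sketch consistent with how it
# is used here: watch the streaming source's log for a marker that a streaming
# session has started, then kill that node ungracefully so the joining node's
# bootstrap fails mid-stream. The exact log line watched for is an assumption.
from threading import Thread


class InterruptBootstrap(Thread):

    def __init__(self, node):
        Thread.__init__(self)
        self.node = node

    def run(self):
        # assumed marker that a streaming plan has been prepared on the source node
        self.node.watch_log_for("Prepare completed")
        self.node.stop(gently=False)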
def local_quorum_bootstrap_test(self):
    """
    Test that CL local_quorum works while a node is bootstrapping.
    @jira_ticket CASSANDRA-8058
    """
    cluster = self.cluster
    cluster.populate([1, 1])
    cluster.start()

    node1 = cluster.nodes['node1']
    yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
    stress_config = tempfile.NamedTemporaryFile(mode='w+', delete=False)
    stress_config.write(yaml_config)
    stress_config.close()
    node1.stress(['user', 'profile=' + stress_config.name, 'n=2000000',
                  'ops(insert=1)', '-rate', 'threads=50'])

    node3 = new_node(cluster, data_center='dc2')
    node3.start(no_wait=True)
    time.sleep(3)

    with tempfile.TemporaryFile(mode='w+') as tmpfile:
        node1.stress(['user', 'profile=' + stress_config.name, 'ops(insert=1)',
                      'n=500K', 'cl=LOCAL_QUORUM',
                      '-rate', 'threads=5', '-errors', 'retries=2'],
                     stdout=tmpfile, stderr=subprocess.STDOUT)
        os.unlink(stress_config.name)
        tmpfile.seek(0)
        output = tmpfile.read()

    debug(output)
    regex = re.compile("Operation.+error inserting key.+Exception")
    failure = regex.search(output)
    self.assertIsNone(failure, "Error during stress while bootstrapping")
def _bootstrap_new_node(self):
    # Check we can bootstrap a new node on the upgraded cluster:
    debug("Adding a node to the cluster")
    nnode = new_node(self.cluster, remote_debug_port=str(2000 + len(self.cluster.nodes)))
    nnode.start(use_jna=True, wait_other_notice=True, wait_for_binary_proto=True)
    self._write_values()
    self._increment_counters()
    self._check_values()
    self._check_counters()
def add_dc_after_mv_test(self):
    """Test that materialized views work as expected when adding a datacenter."""

    session = self.prepare()

    debug("Creating schema")
    session.execute("CREATE TABLE t (id int PRIMARY KEY, v int)")
    session.execute(("CREATE MATERIALIZED VIEW t_by_v AS SELECT * FROM t "
                     "WHERE v IS NOT NULL AND id IS NOT NULL PRIMARY KEY (v, id)"))

    debug("Writing 1k to base")
    for i in xrange(1000):
        session.execute("INSERT INTO t (id, v) VALUES ({v}, {v})".format(v=i))

    debug("Reading 1k from view")
    for i in xrange(1000):
        assert_one(session, "SELECT * FROM t_by_v WHERE v = {}".format(i), [i, i])

    debug("Reading 1k from base")
    for i in xrange(1000):
        assert_one(session, "SELECT * FROM t WHERE id = {}".format(i), [i, i])

    debug("Bootstrapping new node in another dc")
    node4 = new_node(self.cluster, data_center='dc2')
    node4.start(wait_other_notice=True, wait_for_binary_proto=True)

    debug("Bootstrapping new node in another dc")
    node5 = new_node(self.cluster, remote_debug_port='1414', data_center='dc2')
    node5.start()

    session2 = self.patient_exclusive_cql_connection(node4)

    debug("Verifying data from new node in view")
    for i in xrange(1000):
        assert_one(session2, "SELECT * FROM ks.t_by_v WHERE v = {}".format(i), [i, i])

    debug("Inserting 100 into base")
    for i in xrange(1000, 1100):
        session.execute("INSERT INTO t (id, v) VALUES ({v}, {v})".format(v=i))

    debug("Verify 100 in view")
    for i in xrange(1000, 1100):
        assert_one(session, "SELECT * FROM t_by_v WHERE v = {}".format(i), [i, i])
def resumable_bootstrap_test(self):
    """
    Test resuming bootstrap after data streaming failure
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'cl=TWO', '-schema', 'replication(factor=2)'])
    cluster.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start(wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.patient_exclusive_cql_connection(node3)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'IN_PROGRESS', rows[0][0]

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert rows[0][0] == 'COMPLETED', rows[0][0]

    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()

    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr = node3.stress(['read', 'n=100k', 'no-warmup', '-schema', 'replication(factor=2)',
                                   '-rate', 'threads=8'], capture_output=True)

    if stdout and "FAILURE" in stdout:
        debug(stdout)
        assert False, "Cannot read inserted data after bootstrap"
def resumable_bootstrap_test(self):
    """
    Test resuming bootstrap after data streaming failure
    """
    cluster = self.cluster
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.populate(2).start(wait_other_notice=True)

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'no-warmup', 'cl=TWO', '-schema', 'replication(factor=2)',
                  '-rate', 'threads=50'])
    cluster.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    node3.start(wait_other_notice=False, wait_for_binary_proto=True)
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    assert_bootstrap_state(self, node3, 'IN_PROGRESS')

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')

    node3.watch_log_for("Resume complete", from_mark=mark)
    assert_bootstrap_state(self, node3, 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()

    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr = node3.stress(['read', 'n=100k', 'no-warmup', '-schema', 'replication(factor=2)',
                                   '-rate', 'threads=8'], capture_output=True)

    if stdout is not None:
        self.assertNotIn("FAILURE", stdout)
def consistent_reads_after_bootstrap_test(self):
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False,
                                              'write_request_timeout_in_ms': 60000,
                                              'read_request_timeout_in_ms': 60000,
                                              'dynamic_snitch_badness_threshold': 0.0},
                                      batch_commitlog=True)

    cluster.populate(2).start()
    node1, node2 = cluster.nodelist()
    cluster.start(wait_for_binary_proto=True, wait_other_notice=True)

    debug("Set to talk to node 2")
    n2session = self.patient_cql_connection(node2)
    self.create_ks(n2session, 'ks', 2)
    create_c1c2_table(self, n2session)

    debug("Generating some data for all nodes")
    insert_c1c2(n2session, keys=range(10, 20), consistency=ConsistencyLevel.ALL)

    node1.flush()
    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    debug("Writing data to only node2")
    insert_c1c2(n2session, keys=range(30, 1000), consistency=ConsistencyLevel.ONE)

    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Bootstrapping node3")
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True)

    n3session = self.patient_cql_connection(node3)
    n3session.execute("USE ks")
    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)

    for n in xrange(30, 1000):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
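# create_c1c2_table / insert_c1c2 / query_c1c2 come from the shared dtest tools and
# are not shown in this file. Below is a simplified sketch of the behaviour the tests
# rely on: a 'cf' table keyed by 'k<n>' with two text columns. The signatures are
# assumptions reconstructed from the call sites above and below (note that some older
# variants pass a single key positionally, which the sketch also accepts).
from cassandra import ConsistencyLevel
from cassandra.concurrent import execute_concurrent_with_args
from cassandra.query import SimpleStatement


def create_c1c2_table(tester, session):
    tester.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})


def insert_c1c2(session, keys=None, n=None, consistency=ConsistencyLevel.QUORUM):
    if keys is None:
        keys = range(n)
    elif isinstance(keys, int):  # older call sites pass a single key positionally
        keys = [keys]
    statement = session.prepare("INSERT INTO cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
    statement.consistency_level = consistency
    execute_concurrent_with_args(session, statement, [['k{}'.format(k)] for k in keys])


def query_c1c2(session, key, consistency=ConsistencyLevel.QUORUM):
    query = SimpleStatement("SELECT c1, c2 FROM cf WHERE key='k{}'".format(key),
                            consistency_level=consistency)
    rows = list(session.execute(query))
    assert len(rows) == 1, "Expected one row for key k{}, got {}".format(key, rows)
    assert rows[0][0] == 'value1' and rows[0][1] == 'value2'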
def bootstrap_on_write_survey_and_join(cluster, token):
    node2 = new_node(cluster)
    node2.set_configuration_options(values={'initial_token': token})
    node2.start(jvm_args=["-Dcassandra.write_survey=true"], wait_for_binary_proto=True)

    self.assertTrue(len(node2.grep_log('Startup complete, but write survey mode is active, not becoming an active ring member.')))
    assert_bootstrap_state(self, node2, 'IN_PROGRESS')

    node2.nodetool("join")
    self.assertTrue(len(node2.grep_log('Leaving write survey mode and joining ring at operator request')))

    return node2
def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
    """
    Test to check consistent bootstrap will not succeed when there are insufficient replicas
    @jira_ticket CASSANDRA-11848
    """
    cluster = self.cluster

    cluster.populate(2)
    node1, node2 = cluster.nodelist()

    node3_token = None
    # Make token assignment deterministic
    if DISABLE_VNODES:
        cluster.set_configuration_options(values={'num_tokens': 1})
        tokens = cluster.balanced_tokens(3)
        debug("non-vnode tokens: %r" % (tokens,))
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        node2.set_configuration_options(values={'initial_token': tokens[2]})
        node3_token = tokens[1]  # Add node 3 between node1 and node2

    cluster.start()

    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8',
                  '-schema', 'replication(factor={})'.format(rf)])

    # change system_auth keyspace to 2 (default is 1) to avoid
    # "Unable to find sufficient sources for streaming" warning
    if cluster.cassandra_version() >= '2.2.0':
        session = self.patient_cql_connection(node1)
        session.execute("""
            ALTER KEYSPACE system_auth
                WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
        """)

    # Stop node2, so node3 will not be able to perform consistent range movement
    node2.stop(wait_other_notice=True)

    successful_bootstrap_expected = not consistent_range_movement

    node3 = new_node(cluster, token=node3_token)
    node3.start(wait_for_binary_proto=successful_bootstrap_expected,
                wait_other_notice=successful_bootstrap_expected,
                jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

    if successful_bootstrap_expected:
        # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
        if not consistent_range_movement and rf == 1:
            node3.watch_log_for("Unable to find sufficient sources for streaming range")
        self.assertTrue(node3.is_running())
        assert_bootstrap_state(self, node3, 'COMPLETED')
    else:
        if consistent_range_movement:
            node3.watch_log_for("A node required to move the data consistently is down")
        else:
            node3.watch_log_for("Unable to find sufficient sources for streaming range")
        assert_not_running(node3)
def _wiped_node_cannot_join_test(self, gently):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we stop a node and wipe its data then the node cannot join
    when it is not a seed. Test both a clean shutdown and a forced shutdown,
    via the gently parameter.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    version = cluster.version()
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'

    # write some data
    node1 = cluster.nodelist()[0]
    if version < "2.1":
        node1.stress(['-n', '10000'])
    else:
        node1.stress(['write', 'n=10000', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)
    node2.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node2)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Stop the new node and wipe its data
    node2.stop(gently=gently)
    data_dir = os.path.join(node2.get_path(), 'data')
    commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
    debug("Deleting {}".format(data_dir))
    shutil.rmtree(data_dir)
    shutil.rmtree(commitlog_dir)

    # Now start it, it should not be allowed to join.
    mark = node2.mark_log()
    node2.start(no_wait=True)
    node2.watch_log_for("A node with address /127.0.0.4 already exists, cancelling join", from_mark=mark)
def failed_bootstrap_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if a node fails to bootstrap, it can join the cluster even if the data is wiped.
    """
    cluster = self.cluster
    cluster.populate(1)
    cluster.start(wait_for_binary_proto=True)

    version = cluster.version()
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'

    # write some data, enough for the bootstrap to fail later on
    node1 = cluster.nodelist()[0]
    if version < "2.1":
        node1.stress(['-n', '100000'])
    else:
        node1.stress(['write', 'n=100000', '-rate', 'threads=8'])
    node1.flush()

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)
    node2.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})

    # kill node2 in the middle of bootstrap
    t = KillOnBootstrap(node2)
    t.start()

    node2.start()
    t.join()
    self.assertFalse(node2.is_running())

    # wipe any data for node2
    data_dir = os.path.join(node2.get_path(), 'data')
    commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
    debug("Deleting {}".format(data_dir))
    shutil.rmtree(data_dir)
    shutil.rmtree(commitlog_dir)

    # Now start it again, it should be allowed to join
    mark = node2.mark_log()
    node2.start(wait_other_notice=True)
    node2.watch_log_for("JOINING:", from_mark=mark)
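# KillOnBootstrap is the counterpart of InterruptBootstrap for the joining node: it
# waits until the new node reports that it has started bootstrapping and then kills
# that node, leaving a half-bootstrapped data directory behind. A sketch consistent
# with its use above (the log line watched for is an assumption):
from threading import Thread


class KillOnBootstrap(Thread):

    def __init__(self, node):
        Thread.__init__(self)
        self.node = node

    def run(self):
        # assumed marker that the node has begun bootstrapping
        self.node.watch_log_for("JOINING: Starting to bootstrap")
        self.node.stop(gently=False)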
def disk_balance_bootstrap_test(self):
    cluster = self.cluster
    if not DISABLE_VNODES:
        cluster.set_configuration_options(values={'num_tokens': 256})
    # apparently we have legitimate errors in the log when bootstrapping (see bootstrap_test.py)
    self.allow_log_errors = True
    cluster.set_configuration_options(values={'allocate_tokens_for_keyspace': 'keyspace1'})
    cluster.populate(4).start(wait_for_binary_proto=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=50k', 'no-warmup',
                  '-rate', 'threads=100',
                  '-schema', 'replication(factor=3)',
                  'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)'])
    cluster.flush()

    node5 = new_node(cluster)
    node5.start(wait_for_binary_proto=True)
    self.assert_balanced(node5)
def simple_bootstrap_test_nodata(self):
    """
    @jira_ticket CASSANDRA-11010
    Test that bootstrap completes if streaming from nodes with no data
    """
    cluster = self.cluster
    # Create a two-node cluster
    cluster.populate(2)
    cluster.start(wait_other_notice=True)

    # Bootstrapping a new node
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True, wait_other_notice=True)

    assert_bootstrap_state(self, node3, 'COMPLETED')
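# assert_bootstrap_state() is used by the newer tests in this file; the older
# variants below query system.local directly, which is essentially what the helper
# wraps. A sketch of the assumed implementation:
def assert_bootstrap_state(tester, node, expected_bootstrap_state):
    session = tester.patient_exclusive_cql_connection(node)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    tester.assertEqual(rows[0][0], expected_bootstrap_state)
    session.shutdown()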
def decommissioned_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we decommission a node and then wipe its data, it can join the cluster.
    """
    cluster = self.cluster
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    version = cluster.version()
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'

    # write some data
    node1 = cluster.nodelist()[0]
    if version < "2.1":
        node1.stress(['-n', '10000'])
    else:
        node1.stress(['write', 'n=10K', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)
    node2.start(wait_for_binary_proto=True, wait_other_notice=True)

    session = self.patient_cql_connection(node2)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))

    # Decommission the new node and wipe its data
    node2.decommission()
    node2.stop(wait_other_notice=True)
    data_dir = os.path.join(node2.get_path(), 'data')
    commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
    debug("Deleting {}".format(data_dir))
    shutil.rmtree(data_dir)
    shutil.rmtree(commitlog_dir)

    # Now start it, it should be allowed to join
    mark = node2.mark_log()
    node2.start(wait_other_notice=True)
    node2.watch_log_for("JOINING:", from_mark=mark)
def resumable_bootstrap_test(self):
    """Test resuming bootstrap after data streaming failure"""
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=2)'])
    node1.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start()
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.exclusive_cql_connection(node3)
    rows = session.execute("SELECT bootstrapped FROM system.local WHERE key='local'")
    assert len(rows) == 1
    assert rows[0][0] == 'IN_PROGRESS', rows[0][0]

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    # check if we skipped already retrieved ranges
    node3.watch_log_for("already available. Skipping streaming.")
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = session.execute("SELECT bootstrapped FROM system.local WHERE key='local'")
    assert rows[0][0] == 'COMPLETED', rows[0][0]
def simple_bootstrap_test_nodata(self):
    """
    @jira_ticket CASSANDRA-11010
    Test that bootstrap completes if streaming from nodes with no data
    """
    cluster = self.cluster
    # Create a two-node cluster
    cluster.populate(2)
    cluster.start(wait_other_notice=True)

    # Bootstrapping a new node
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True, wait_other_notice=True)

    session = self.patient_exclusive_cql_connection(node3)
    rows = session.execute("SELECT bootstrapped FROM system.local WHERE key='local'")
    self.assertEqual(rows[0][0], 'COMPLETED')
def resumable_bootstrap_test(self):
    """
    Test resuming bootstrap after data streaming failure
    """
    cluster = self.cluster
    cluster.populate(2)

    node1 = cluster.nodes['node1']
    # set up byteman
    node1.byteman_port = '8100'
    node1.import_config_files()

    cluster.start(wait_other_notice=True)
    # kill stream to node3 in the middle of streaming to let it fail
    node1.byteman_submit(['./stream_failure.btm'])
    node1.stress(['write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema', 'replication(factor=2)',
                  '-rate', 'threads=50'])
    cluster.flush()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.start(wait_other_notice=False, wait_for_binary_proto=True)

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    assert_bootstrap_state(self, node3, 'IN_PROGRESS')

    # invoke nodetool bootstrap to resume bootstrapping
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=mark)
    assert_bootstrap_state(self, node3, 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()

    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr, _ = node3.stress(['read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)',
                                      '-rate', 'threads=8'])

    if stdout is not None:
        self.assertNotIn("FAILURE", stdout)
def decommissioned_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if we decommission a node and then wipe its data, it can join the cluster.
    """
    cluster = self.cluster
    cluster.set_log_level('TRACE')
    cluster.populate(3)
    cluster.start(wait_for_binary_proto=True)

    version = cluster.version()
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'

    # write some data
    node1 = cluster.nodelist()[0]
    if version < "2.1":
        node1.stress(['-n', '10000'])
    else:
        node1.stress(['write', 'n=10000', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)
    node2.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node2)
    self.assertEquals(original_rows, list(session.execute("SELECT * FROM {}".format(stress_table,))))
    session.shutdown()  # Ensure all sockets to node2 are released

    # Decommission the new node and wipe its data
    node2.decommission()
    node2.stop(gently=False)
    data_dir = os.path.join(node2.get_path(), 'data')
    commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
    debug("Deleting {}".format(data_dir))
    shutil.rmtree(data_dir)
    shutil.rmtree(commitlog_dir)

    # Now start it, it should be allowed to join
    mark = node2.mark_log()
    node2.start(wait_other_notice=True)
    node2.watch_log_for("JOINING:", from_mark=mark)
def read_from_bootstrapped_node_test(self):
    """Test bootstrapped node sees existing data, e.g. CASSANDRA-6648"""
    cluster = self.cluster
    cluster.populate(3)
    cluster.start()

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=10000', '-rate', 'threads=8'])

    session = self.patient_cql_connection(node1)
    stress_table = 'keyspace1.standard1'
    original_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))

    node4 = new_node(cluster)
    node4.start(wait_for_binary_proto=True)

    session = self.patient_exclusive_cql_connection(node4)
    new_rows = list(session.execute("SELECT * FROM %s" % (stress_table,)))
    self.assertEquals(original_rows, new_rows)
def resumable_bootstrap_test(self):
    """
    Test resuming bootstrap after data streaming failure
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100000', '-schema', 'replication(factor=2)'])
    node1.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start()
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.exclusive_cql_connection(node3)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert len(rows) == 1
    assert rows[0][0] == 'IN_PROGRESS', rows[0][0]

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    # check if we skipped already retrieved ranges
    node3.watch_log_for("already available. Skipping streaming.")
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert rows[0][0] == 'COMPLETED', rows[0][0]
def consistent_reads_after_bootstrap_test(self):
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False,
                                              'write_request_timeout_in_ms': 60000,
                                              'read_request_timeout_in_ms': 60000,
                                              'dynamic_snitch_badness_threshold': 0.0},
                                      batch_commitlog=True)

    cluster.populate(2).start()
    node1, node2 = cluster.nodelist()
    cluster.start(wait_for_binary_proto=True, wait_other_notice=True)

    debug("Set to talk to node 2")
    n2session = self.patient_cql_connection(node2)
    self.create_ks(n2session, 'ks', 2)
    create_c1c2_table(self, n2session)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2session, n, ConsistencyLevel.ALL)

    node1.flush()
    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    debug("Writing data to only node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2session, n, ConsistencyLevel.ONE)

    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Bootstrapping node3")
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True)

    n3session = self.patient_cql_connection(node3)
    n3session.execute("USE ks")
    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)

    for n in xrange(30, 1000):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
def read_from_bootstrapped_node_test(self):
    """Test bootstrapped node sees existing data, e.g. CASSANDRA-6648"""
    cluster = self.cluster
    cluster.populate(3)
    version = cluster.version()
    cluster.start()

    node1 = cluster.nodes['node1']
    if version < "2.1":
        node1.stress(['-n', '10000'])
    else:
        node1.stress(['write', 'n=10000', '-rate', 'threads=8'])

    node4 = new_node(cluster)
    node4.start()

    session = self.patient_cql_connection(node4)
    stress_table = 'keyspace1.standard1' if self.cluster.version() >= '2.1' else '"Keyspace1"."Standard1"'
    rows = session.execute('select * from %s limit 10' % stress_table)
    assert len(list(rows)) == 10
def disk_balance_bootstrap_test(self):
    cluster = self.cluster
    if not DISABLE_VNODES:
        cluster.set_configuration_options(values={'num_tokens': 256})
    # apparently we have legitimate errors in the log when bootstrapping (see bootstrap_test.py)
    self.allow_log_errors = True
    cluster.set_configuration_options(values={'allocate_tokens_for_keyspace': 'keyspace1'})
    cluster.populate(4).start(wait_for_binary_proto=True)
    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=50k',
                  '-rate', 'threads=100',
                  '-schema', 'replication(factor=3)',
                  'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)'])
    cluster.flush()

    node5 = new_node(cluster)
    node5.start(wait_for_binary_proto=True)
    self.assert_balanced(node5)
def failed_bootstrap_wiped_node_can_join_test(self):
    """
    @jira_ticket CASSANDRA-9765
    Test that if a node fails to bootstrap, it can join the cluster even if the data is wiped.
    """
    cluster = self.cluster
    cluster.populate(1)
    cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    cluster.start(wait_for_binary_proto=True)

    stress_table = 'keyspace1.standard1'

    # write some data, enough for the bootstrap to fail later on
    node1 = cluster.nodelist()[0]
    node1.stress(['write', 'n=100K', 'no-warmup', '-rate', 'threads=8'])
    node1.flush()

    session = self.patient_cql_connection(node1)
    original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

    # Add a new node, bootstrap=True ensures that it is not a seed
    node2 = new_node(cluster, bootstrap=True)

    # kill node2 in the middle of bootstrap
    t = KillOnBootstrap(node2)
    t.start()

    node2.start()
    t.join()
    self.assertFalse(node2.is_running())

    # wipe any data for node2
    self._cleanup(node2)
    # Now start it again, it should be allowed to join
    mark = node2.mark_log()
    node2.start(wait_other_notice=True)
    node2.watch_log_for("JOINING:", from_mark=mark)
def manual_bootstrap_test(self):
    """Test adding a new node and bootstrapping it manually. No auto_bootstrap.
       This test also verifies that all data is OK after the addition of the new node.
       e.g. CASSANDRA-9022
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)
    (node1, node2) = cluster.nodelist()

    if cluster.version() < "2.1":
        node1.stress(['-o', 'insert', '-n', '1000', '-l', '2', '-t', '1'])
    else:
        node1.stress(['write', 'n=1000', '-schema', 'replication(factor=2)',
                      '-rate', 'threads=1', '-pop', 'dist=UNIFORM(1..1000)'])

    # Add a new node
    node3 = new_node(cluster, bootstrap=False)
    node3.start()
    node3.repair()
    node1.cleanup()

    # Verify the data
    with tempfile.TemporaryFile(mode='w+') as tmpfile:
        if cluster.version() < "2.1":
            node2.stress(['-o', 'read', '-n', '1000', '-e', 'ALL', '-t', '1'],
                         stdout=tmpfile, stderr=subprocess.STDOUT)
        else:
            node2.stress(['read', 'n=1000', 'cl=ALL', '-rate', 'threads=1',
                          '-pop', 'dist=UNIFORM(1..1000)'],
                         stdout=tmpfile, stderr=subprocess.STDOUT)
        tmpfile.seek(0)
        output = tmpfile.read()

    debug(output)
    failure = output.find("Data returned was not validated")
    self.assertEqual(failure, -1, "Stress failed to validate all data")
def bootstrap(cluster, token):
    node2 = new_node(cluster)
    node2.set_configuration_options(values={'initial_token': token})
    node2.start(wait_for_binary_proto=True)
    return node2
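# Hypothetical usage of the bootstrap() helper above (variable names and the
# two-token layout are illustrative only), assuming a running cluster whose
# existing node already owns tokens[0]:
#
#     tokens = cluster.balanced_tokens(2)
#     joiner = bootstrap(cluster, tokens[1])
#     assert joiner.is_running()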
def decommission_test(self):
    cluster = self.cluster

    tokens = cluster.balanced_tokens(4)
    cluster.populate(4, tokens=tokens).start()
    node1, node2, node3, node4 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.QUORUM)

    cluster.flush()
    sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
    init_size = sizes[0]
    assert_almost_equal(*sizes)

    time.sleep(.5)
    node4.decommission()
    node4.stop()
    cluster.cleanup()
    time.sleep(.5)

    # Check we can get all the keys
    for n in xrange(0, 10000):
        query_c1c2(session, n, ConsistencyLevel.QUORUM)

    sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
    three_node_sizes = sizes
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal((2.0 / 3.0) * sizes[0], sizes[2])
    assert_almost_equal(sizes[2], init_size)

    if cluster.version() <= '1.2':
        node3.stop(wait_other_notice=True)
        node1.removeToken(tokens[2])
        time.sleep(.5)
        cluster.cleanup()
        time.sleep(.5)

        # Check we can get all the keys
        for n in xrange(0, 10000):
            query_c1c2(session, n, ConsistencyLevel.QUORUM)

        sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
        assert_almost_equal(*sizes)
        assert_almost_equal(sizes[0], 2 * init_size)

        node5 = new_node(cluster, token=(tokens[2] + 1)).start()
        time.sleep(.5)
        cluster.cleanup()
        time.sleep(.5)
        cluster.compact()
        time.sleep(.5)

        # Check we can get all the keys
        for n in xrange(0, 10000):
            query_c1c2(session, n, ConsistencyLevel.QUORUM)

        sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
        # We should be back to the earlier 3-node situation
        for i in xrange(0, len(sizes)):
            assert_almost_equal(sizes[i], three_node_sizes[i])
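# assert_almost_equal() comes from the shared test utilities.  As a rough,
# hypothetical sketch of the kind of tolerance check it performs (the 16%
# default error margin and the exact comparison are assumptions, not the real
# code):
def _assert_almost_equal_sketch(*args, **kwargs):
    error = kwargs.get('error', 0.16)
    vmax, vmin = max(args), min(args)
    assert vmin > vmax * (1.0 - error), \
        "values not within {:.0%} of the max: {}".format(error, args)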
def multi_dc_replace_with_rf1_test(self):
    """
    Test that multi-dc replace works when rf=1 on each dc
    """
    cluster = self.cluster
    cluster.populate([1, 1])
    cluster.start()
    node1, node2 = cluster.nodelist()

    node1 = cluster.nodes['node1']
    yaml_config = """
    # Create the keyspace and table
    keyspace: keyspace1
    keyspace_definition: |
      CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
    table: users
    table_definition:
      CREATE TABLE users (
        username text,
        first_name text,
        last_name text,
        email text,
        PRIMARY KEY(username)
      ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
    insert:
      partitions: fixed(1)
      batchtype: UNLOGGED
    queries:
      read:
        cql: select * from users where username = ?
        fields: samerow
    """

    with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
        stress_config.write(yaml_config)
        stress_config.flush()
        node1.stress(['user', 'profile=' + stress_config.name, 'n=10k', 'no-warmup',
                      'ops(insert=1)', '-rate', 'threads=50'])

    session = self.patient_cql_connection(node1)

    # change system_auth keyspace replication factor to 2 (default is 1) to avoid
    # the "Unable to find sufficient sources for streaming" warning
    if cluster.cassandra_version() >= '2.2.0':
        session.execute("""
            ALTER KEYSPACE system_auth
            WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
        """)

    # Save initial data
    stress_table = 'keyspace1.users'
    query = SimpleStatement('select * from %s LIMIT 1' % stress_table,
                            consistency_level=ConsistencyLevel.TWO)
    initial_data = rows_to_list(session.execute(query))

    # stop node to replace
    debug("Stopping node to replace.")
    node2.stop(wait_other_notice=True)

    node3 = new_node(cluster, data_center='dc2')
    node3.start(replace_address='127.0.0.2', wait_for_binary_proto=True)

    assert_bootstrap_state(self, node3, 'COMPLETED')

    # Check that keyspace was replicated from dc1 to dc2
    self.assertFalse(node3.grep_log("Unable to find sufficient sources for streaming range"))

    # query should still work with node1 stopped
    node1.stop(wait_other_notice=True)
    debug("Verifying data on new node.")
    session = self.patient_exclusive_cql_connection(node3)
    assert_all(session, 'SELECT * from {} LIMIT 1'.format(stress_table),
               expected=initial_data,
               cl=ConsistencyLevel.LOCAL_ONE)
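# assert_bootstrap_state() is another shared assertion helper.  A hypothetical
# sketch of the check it performs (the _sketch name is ours; the system.local
# query mirrors the one used in resumable_bootstrap_test below):
def _assert_bootstrap_state_sketch(tester, node, expected):
    session = tester.patient_exclusive_cql_connection(node)
    rows = list(session.execute(
        "SELECT bootstrapped FROM system.local WHERE key='local'"))
    assert rows[0][0] == expected, rows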
def resumable_bootstrap_test(self):
    """
    Test resuming bootstrap after data streaming failure
    """
    cluster = self.cluster
    cluster.populate(2).start(wait_other_notice=True)

    node1 = cluster.nodes['node1']
    node1.stress(['write', 'n=100K', 'cl=TWO', '-schema', 'replication(factor=2)',
                  '-rate', 'threads=50'])
    cluster.flush()

    # kill node1 in the middle of streaming to let it fail
    t = InterruptBootstrap(node1)
    t.start()

    # start bootstrapping node3 and wait for streaming
    node3 = new_node(cluster)
    node3.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
    # keep timeout low so that test won't hang
    node3.set_configuration_options(values={'streaming_socket_timeout_in_ms': 1000})
    try:
        node3.start(wait_other_notice=False)
    except NodeError:
        pass  # node doesn't start as expected
    t.join()

    # wait for node3 ready to query
    node3.watch_log_for("Starting listening for CQL clients")
    mark = node3.mark_log()
    # check if node3 is still in bootstrap mode
    session = self.patient_exclusive_cql_connection(node3)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0][0], 'IN_PROGRESS')

    # bring back node1 and invoke nodetool bootstrap to resume bootstrapping
    node1.start(wait_other_notice=True)
    node3.nodetool('bootstrap resume')
    node3.watch_log_for("Resume complete", from_mark=mark)
    rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
    self.assertEqual(rows[0][0], 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    cluster.cleanup()

    debug("Check data is present")
    # Let's check stream bootstrap completely transferred data
    stdout, stderr = node3.stress(['read', 'n=100k', 'no-warmup',
                                   '-schema', 'replication(factor=2)',
                                   '-rate', 'threads=8'],
                                  capture_output=True)

    if stdout and "FAILURE" in stdout:
        debug(stdout)
        assert False, "Cannot read inserted data after bootstrap"
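# InterruptBootstrap, like KillOnBootstrap, is a helper thread defined outside
# this section.  As a hypothetical sketch only (reusing Thread imported for the
# KillOnBootstrapSketch above; the "Prepare completed" log pattern and
# gently=False are assumptions about how such a helper might be written), it
# would watch the streaming source node and kill it once a stream session has
# been prepared, so the joining node's bootstrap fails mid-stream:
class InterruptBootstrapSketch(Thread):

    def __init__(self, node):
        Thread.__init__(self)
        self.node = node

    def run(self):
        # wait for a streaming session to be set up on the source node,
        # then kill the source hard
        self.node.watch_log_for("Prepare completed")
        self.node.stop(gently=False)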