def quorum_available_during_failure_test(self):
    """Verify QUORUM reads/writes (RF=3) keep working after one node fails.

    Fix: the original called ``cluster.start()`` a second time even though
    ``populate(...).start()`` had already started every node; the redundant
    call is removed.
    """
    CL = 'QUORUM'
    RF = 3
    debug("Creating a ring")
    cluster = self.cluster
    if ENABLE_VNODES:
        tokens = cluster.balanced_tokens(3)
        cluster.populate(3, tokens=tokens).start()
    else:
        cluster.populate(3).start()
    [node1, node2, node3] = cluster.nodelist()

    debug("Set to talk to node 2")
    cursor = self.cql_connection(node2).cursor()
    self.create_ks(cursor, 'ks', RF)
    create_c1c2_table(self, cursor)

    debug("Generating some data")
    for n in xrange(100):
        insert_c1c2(cursor, n, CL)

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # QUORUM (2 of 3) must still be satisfiable with one node down.
    debug("Reading back data.")
    for n in xrange(100):
        query_c1c2(cursor, n, CL)
def decommission_test(self):
    """Decommission node4 and verify data availability plus load rebalancing."""
    cluster = self.cluster
    tokens = cluster.balanced_tokens(4)
    cluster.populate(4, tokens=tokens).start()
    node1, node2, node3, node4 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=30000, consistency=ConsistencyLevel.QUORUM)
    cluster.flush()

    # All four nodes should start with roughly equal load.
    before = [n.data_size() for n in cluster.nodelist() if n.is_running()]
    init_size = before[0]
    assert_almost_equal(*before)

    time.sleep(.5)
    node4.decommission()
    node4.stop()
    cluster.cleanup()
    time.sleep(.5)

    # Every key must remain readable at QUORUM after the decommission.
    for key in xrange(0, 30000):
        query_c1c2(session, key, ConsistencyLevel.QUORUM)

    after = [n.data_size() for n in cluster.nodelist() if n.is_running()]
    debug(after)
    assert_almost_equal(after[0], after[1])
    assert_almost_equal((2.0 / 3.0) * after[0], after[2])
    assert_almost_equal(after[2], init_size)
def movement_test(self):
    """Rebalance an intentionally skewed 3-node ring with token moves."""
    cluster = self.cluster
    # Start from a deliberately unbalanced token layout.
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    cluster.flush()

    # Move every node to its balanced token; even token 0 cannot be
    # assumed balanced with m3p.
    balancing_tokens = cluster.balanced_tokens(3)
    escformat = '%s'
    for node, token in zip((node1, node2, node3), balancing_tokens):
        node.move(escformat % token)
    time.sleep(1)
    cluster.cleanup()

    # Every key must remain readable after the moves.
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.ONE)

    # The load should now be basically even across the ring.
    sizes = [n.data_size() for n in (node1, node2, node3)]
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal(sizes[0], sizes[2])
    assert_almost_equal(sizes[1], sizes[2])
def _do_hinted_handoff(self, node1, node2, enabled):
    """Stop node2, write at CL.ONE through node1, restart node2, and
    verify hints were delivered if and only if hinted handoff is enabled.
    """
    session = self.patient_exclusive_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    create_c1c2_table(self, session)

    # Writes taken while node2 is down become hints on node1 (if enabled).
    node2.stop(wait_other_notice=True)
    insert_c1c2(session, n=100, consistency=ConsistencyLevel.ONE)

    log_mark = node1.mark_log()
    node2.start(wait_other_notice=True)
    if enabled:
        node1.watch_log_for(["Finished hinted"], from_mark=log_mark, timeout=120)
    node1.stop(wait_other_notice=True)

    # With HH enabled every key must be on node2; with it disabled, none may be.
    session = self.patient_exclusive_cql_connection(node2, keyspace='ks')
    for key in xrange(0, 100):
        if enabled:
            query_c1c2(session, key, ConsistencyLevel.ONE)
        else:
            query_c1c2(session, key, ConsistencyLevel.ONE,
                       tolerate_missing=True, must_be_missing=True)
def quorum_quorum_test(self):
    """QUORUM reads/writes (RF=3) survive one node down but not two."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, 'ks', 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, 'ks').cursor()

    # Healthy ring: write and read back each key at CL.QUORUM.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "QUORUM")
        query_c1c2(cursor2, key, "QUORUM")

    # One node down: QUORUM (2 of 3) is still reachable.
    node3.stop(wait_other_notice=True)
    for key in xrange(100, 200):
        insert_c1c2(cursor1, key, "QUORUM")
        query_c1c2(cursor2, key, "QUORUM")

    # Two nodes down: QUORUM writes must raise Unavailable.
    node2.stop(wait_other_notice=True)
    assert_unavailable(insert_c1c2, cursor1, 200, "QUORUM")
def move_single_node_test(self):
    """ Test moving a node in a single-node cluster (#4200) """
    cluster = self.cluster
    # Single node with an explicit token so the move is well-defined.
    cluster.populate(1, tokens=[0]).start()
    node1 = cluster.nodelist()[0]
    time.sleep(0.2)

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    cluster.flush()

    node1.move(2**25)
    time.sleep(1)
    cluster.cleanup()

    # Every key must survive the move.
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.ONE)
def readrepair_test(self):
    """Writes taken while node2 was down must be read-repaired onto it."""
    cluster = self.cluster
    # Hints are disabled so only read repair can propagate the data.
    cluster.set_configuration_options(
        values={'hinted_handoff_enabled': False})
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor, 'ks', 2)
    create_c1c2_table(self, cursor, read_repair=1.0)

    # Write everything at CL.ONE while node2 is down.
    node2.stop(wait_other_notice=True)
    for key in xrange(0, 10000):
        insert_c1c2(cursor, key, "ONE")
    node2.start(wait_other_notice=True)
    time.sleep(5)

    # QUORUM reads touch both replicas and trigger read repair.
    for key in xrange(0, 10000):
        query_c1c2(cursor, key, "QUORUM")

    # node2 alone must now hold every repaired key.
    node1.stop(wait_other_notice=True)
    cursor = self.patient_cql_connection(node2, keyspace='ks').cursor()
    for key in xrange(0, 10000):
        query_c1c2(cursor, key, "ONE")
def hintedhandoff_test(self):
    """Hints recorded while node2 is down must be delivered on its restart."""
    cluster = self.cluster
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor, 'ks', 2)
    create_c1c2_table(self, cursor)

    # Writes taken while node2 is down become hints on node1.
    node2.stop(wait_other_notice=True)
    for key in xrange(0, 100):
        insert_c1c2(cursor, key, "ONE")

    log_mark = node1.mark_log()
    node2.start()
    node1.watch_log_for(["Finished hinted"], from_mark=log_mark, timeout=90)

    # With node1 stopped, node2 must serve every key delivered via HH.
    node1.stop(wait_other_notice=True)
    cursor = self.patient_cql_connection(node2, keyspace='ks').cursor()
    for key in xrange(0, 100):
        query_c1c2(cursor, key, "ONE")
def _deprecated_repair_jmx(self, method, arguments):
    """Invoke a deprecated StorageService repair method over JMX and return
    the repair options parsed from node1's log.

    Fix: the grep pattern is a regex containing ``\\(``, ``\\w`` and ``\\d``
    escapes but was written as a non-raw string; it is now a raw string so
    the escapes are unambiguous (and warning-free on newer Pythons).
    """
    cluster = self.cluster
    debug("Starting cluster..")
    cluster.populate([1, 1])
    node1, node2 = cluster.nodelist()
    remove_perf_disable_shared_mem(node1)
    cluster.start()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)

    # Run repair
    mbean = make_mbean('db', 'StorageService')
    with JolokiaAgent(node1) as jmx:
        # assert repair runs and returns valid cmd number
        self.assertEqual(jmx.execute_method(mbean, method, arguments), 1)
    # wait for log to start
    node1.watch_log_for("Starting repair command")
    # get repair parameters from the log
    l = node1.grep_log(r"Starting repair command #1, repairing keyspace ks with repair options \(parallelism: (?P<parallelism>\w+), primary range: (?P<pr>\w+), incremental: (?P<incremental>\w+), job threads: (?P<jobs>\d+), ColumnFamilies: (?P<cfs>.+), dataCenters: (?P<dc>.+), hosts: (?P<hosts>.+), # of ranges: (?P<ranges>\d+)\)")
    self.assertEqual(len(l), 1)
    line, m = l[0]
    return {"parallelism": m.group("parallelism"),
            "primary_range": m.group("pr"),
            "incremental": m.group("incremental"),
            "job_threads": m.group("jobs"),
            "column_families": m.group("cfs"),
            "data_centers": m.group("dc"),
            "hosts": m.group("hosts"),
            "ranges": m.group("ranges")}
def readrepair_test(self):
    """Read repair at QUORUM must propagate writes made while node2 was down."""
    cluster = self.cluster
    # Hints are off, so only read repair can bring node2 up to date.
    cluster.set_configuration_options(values={"hinted_handoff_enabled": False})
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, "ks", 2)
    create_c1c2_table(self, session, read_repair=1.0)

    node2.stop(wait_other_notice=True)
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    node2.start(wait_other_notice=True)

    # QUORUM reads touch both replicas, forcing the repair on every key.
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.QUORUM)

    # node2 alone must now serve every repaired key.
    node1.stop(wait_other_notice=True)
    session = self.patient_cql_connection(node2, keyspace="ks")
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.ONE)
def readrepair_test(self):
    """Per-key writes made with node2 down must be read-repaired onto it."""
    cluster = self.cluster
    # Disable hints so read repair is the only propagation mechanism.
    cluster.set_configuration_options(values={"hinted_handoff_enabled": False})
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor, "ks", 2)
    create_c1c2_table(self, cursor, read_repair=1.0)

    node2.stop(wait_other_notice=True)
    for key in xrange(0, 10000):
        insert_c1c2(cursor, key, "ONE")
    node2.start(wait_other_notice=True)
    time.sleep(5)

    # QUORUM reads trigger read repair on every key.
    for key in xrange(0, 10000):
        query_c1c2(cursor, key, "QUORUM")

    # node2 alone must now hold all repaired keys.
    node1.stop(wait_other_notice=True)
    cursor = self.patient_cql_connection(node2, keyspace="ks").cursor()
    for key in xrange(0, 10000):
        query_c1c2(cursor, key, "ONE")
def quorum_quorum_test(self):
    """QUORUM/QUORUM keeps working with one node down; fails with two."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, "ks", 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, "ks").cursor()

    # Full ring: write/read each key at CL.QUORUM.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "QUORUM")
        query_c1c2(cursor2, key, "QUORUM")

    # One node down: 2 of 3 replicas still satisfy QUORUM.
    node3.stop(wait_other_notice=True)
    for key in xrange(100, 200):
        insert_c1c2(cursor1, key, "QUORUM")
        query_c1c2(cursor2, key, "QUORUM")

    # Two nodes down: QUORUM writes must raise Unavailable.
    node2.stop(wait_other_notice=True)
    assert_unavailable(insert_c1c2, cursor1, 200, "QUORUM")
def blacklisted_directory_test(self):
    """Data stays readable after one data directory is blacklisted and
    sstables are relocated off it."""
    cluster = self.cluster
    cluster.set_datadir_count(3)
    cluster.populate(1)
    [node] = cluster.nodelist()
    remove_perf_disable_shared_mem(node)
    cluster.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node)
    self.create_ks(session, 'ks', 1)
    create_c1c2_table(self, session)
    insert_c1c2(session, n=10000)
    node.flush()
    for key in xrange(0, 10000):
        query_c1c2(session, key)

    node.compact()
    # Mark the first data directory unwritable via JMX.
    mbean = make_mbean('db', type='BlacklistedDirectories')
    with JolokiaAgent(node) as jmx:
        jmx.execute_method(mbean, 'markUnwritable',
                           [os.path.join(node.get_path(), 'data0')])

    # Reads must still succeed with the directory blacklisted...
    for key in xrange(0, 10000):
        query_c1c2(session, key)

    # ...and after relocating sstables away from it.
    node.nodetool('relocatesstables')
    for key in xrange(0, 10000):
        query_c1c2(session, key)
def readrepair_test(self):
    """Bulk writes made with node2 down must be read-repaired onto it."""
    cluster = self.cluster
    # Hints disabled: read repair is the only propagation path.
    cluster.set_configuration_options(
        values={'hinted_handoff_enabled': False})
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    create_c1c2_table(self, session, read_repair=1.0)

    node2.stop(wait_other_notice=True)
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    node2.start(wait_other_notice=True)

    # QUORUM reads hit both replicas and trigger the repair.
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.QUORUM)

    # node2 alone must now hold every key.
    node1.stop(wait_other_notice=True)
    session = self.patient_cql_connection(node2, keyspace='ks')
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.ONE)
def quorum_available_during_failure_test(self):
    """QUORUM operations (RF=3) must keep working after one node fails."""
    CL = ConsistencyLevel.QUORUM
    RF = 3
    debug("Creating a ring")
    cluster = self.cluster
    if DISABLE_VNODES:
        cluster.populate(3).start()
    else:
        tokens = cluster.balanced_tokens(3)
        cluster.populate(3, tokens=tokens).start()
    node1, node2, node3 = cluster.nodelist()

    debug("Set to talk to node 2")
    session = self.patient_cql_connection(node2)
    self.create_ks(session, 'ks', RF)
    create_c1c2_table(self, session)

    debug("Generating some data")
    insert_c1c2(session, n=100, consistency=CL)

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # With one node down, QUORUM (2 of 3) must still be satisfiable.
    debug("Reading back data.")
    for key in xrange(100):
        query_c1c2(session, key, CL)
def quorum_available_during_failure_test(self):
    """QUORUM operations (RF=3) must keep working after one node fails.

    Fix: removed the redundant ``cluster.start()`` — the cluster is already
    started by ``populate(...).start()`` just above.
    """
    CL = ConsistencyLevel.QUORUM
    RF = 3
    debug("Creating a ring")
    cluster = self.cluster
    if DISABLE_VNODES:
        cluster.populate(3).start()
    else:
        tokens = cluster.balanced_tokens(3)
        cluster.populate(3, tokens=tokens).start()
    node1, node2, node3 = cluster.nodelist()

    debug("Set to talk to node 2")
    session = self.patient_cql_connection(node2)
    self.create_ks(session, "ks", RF)
    create_c1c2_table(self, session)

    debug("Generating some data")
    insert_c1c2(session, n=100, consistency=CL)

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # QUORUM (2 of 3) must still be satisfiable with one node down.
    debug("Reading back data.")
    for n in xrange(100):
        query_c1c2(session, n, CL)
def hintedhandoff_test(self):
    """Hints accumulated while node2 is down must be delivered on restart."""
    cluster = self.cluster
    if DISABLE_VNODES:
        cluster.populate(2).start()
    else:
        tokens = cluster.balanced_tokens(2)
        cluster.populate(2, tokens=tokens).start()
    node1, node2 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor, "ks", 2)
    create_c1c2_table(self, cursor)

    # Write at CL.ONE while node2 is down so node1 stores hints.
    node2.stop(wait_other_notice=True)
    for key in xrange(0, 100):
        insert_c1c2(cursor, key, "ONE")

    log_mark = node1.mark_log()
    node2.start()
    node1.watch_log_for(["Finished hinted"], from_mark=log_mark, timeout=90)

    # With node1 stopped, node2 alone must serve every hinted key.
    node1.stop(wait_other_notice=True)
    cursor = self.patient_cql_connection(node2, keyspace="ks").cursor()
    for key in xrange(0, 100):
        query_c1c2(cursor, key, "ONE")
def consistent_reads_after_bootstrap_test(self):
    """A freshly bootstrapped node must serve all data at CL.ALL even when
    some writes were taken while a replica was down.

    Fix: the cluster was started twice — once via ``populate(2).start()``
    and again via ``cluster.start(wait_for_binary_proto=True,
    wait_other_notice=True)``. The waits are now passed to the single
    initial start and the redundant call is removed.
    """
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={
        'hinted_handoff_enabled': False,
        'write_request_timeout_in_ms': 60000,
        'read_request_timeout_in_ms': 60000,
        'dynamic_snitch_badness_threshold': 0.0
    }, batch_commitlog=True)
    cluster.populate(2).start(wait_for_binary_proto=True, wait_other_notice=True)
    node1, node2 = cluster.nodelist()

    debug("Set to talk to node 2")
    n2session = self.patient_cql_connection(node2)
    self.create_ks(n2session, 'ks', 2)
    create_c1c2_table(self, n2session)

    debug("Generating some data for all nodes")
    insert_c1c2(n2session, keys=range(10, 20), consistency=ConsistencyLevel.ALL)
    node1.flush()

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # Hints are disabled, so these writes exist only on node2.
    debug("Writing data to only node2")
    insert_c1c2(n2session, keys=range(30, 1000), consistency=ConsistencyLevel.ONE)
    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Bootstraping node3")
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True)

    n3session = self.patient_cql_connection(node3)
    n3session.execute("USE ks")
    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
    for n in xrange(30, 1000):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
def quorum_quorum_test(self):
    """QUORUM/QUORUM survives one node failure but not two."""
    session, session2 = self.cl_cl_prepare(ConsistencyLevel.QUORUM,
                                           ConsistencyLevel.QUORUM)

    # One node down: quorum is still reachable.
    self.cluster.nodelist()[2].stop()
    for key in xrange(0, 100):
        insert_c1c2(session, key, ConsistencyLevel.QUORUM)
        query_c1c2(session2, key, ConsistencyLevel.QUORUM)

    # Two nodes down: QUORUM writes must be unavailable.
    self.cluster.nodelist()[1].stop()
    assert_unavailable(insert_c1c2, session, 100, ConsistencyLevel.QUORUM)
def _deprecated_repair_jmx(self, method, arguments):
    """
    * Launch a two node, two DC cluster
    * Create a keyspace and table
    * Insert some data
    * Call the deprecated repair JMX API based on the arguments passed into this method
    * Check the node log to see if the correct repair was performed based on the jmx args

    Fix: the grep pattern is a regex with ``\\(``, ``\\w`` and ``\\d``
    escapes but was written as non-raw strings; raw strings make the
    escapes unambiguous (and warning-free on newer Pythons).
    """
    cluster = self.cluster
    debug("Starting cluster..")
    cluster.populate([1, 1])
    node1, node2 = cluster.nodelist()
    remove_perf_disable_shared_mem(node1)
    cluster.start()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', read_repair=0.0, columns={
        'c1': 'text',
        'c2': 'text'
    })
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)

    # Run repair
    mbean = make_mbean('db', 'StorageService')
    with JolokiaAgent(node1) as jmx:
        # assert repair runs and returns valid cmd number
        self.assertEqual(jmx.execute_method(mbean, method, arguments), 1)
    # wait for log to start
    node1.watch_log_for("Starting repair command")
    # get repair parameters from the log
    l = node1.grep_log((
        r"Starting repair command #1, repairing keyspace ks with repair options \(parallelism: (?P<parallelism>\w+), primary range: (?P<pr>\w+), "
        r"incremental: (?P<incremental>\w+), job threads: (?P<jobs>\d+), ColumnFamilies: (?P<cfs>.+), dataCenters: (?P<dc>.+), "
        r"hosts: (?P<hosts>.+), # of ranges: (?P<ranges>\d+)\)"))
    self.assertEqual(len(l), 1)
    line, m = l[0]
    return {
        "parallelism": m.group("parallelism"),
        "primary_range": m.group("pr"),
        "incremental": m.group("incremental"),
        "job_threads": m.group("jobs"),
        "column_families": m.group("cfs"),
        "data_centers": m.group("dc"),
        "hosts": m.group("hosts"),
        "ranges": m.group("ranges")
    }
def consistent_reads_after_move_test(self):
    """Data must remain readable at CL.ALL after a token move on node3.

    Fix: removed the redundant ``cluster.start()`` — every node is already
    started by ``populate(...).start()`` just above.
    """
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={
        'hinted_handoff_enabled': False,
        'write_request_timeout_in_ms': 60000,
        'read_request_timeout_in_ms': 60000,
        'dynamic_snitch_badness_threshold': 0.0
    }, batch_commitlog=True)
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    [node1, node2, node3] = cluster.nodelist()

    debug("Set to talk to node 2")
    n2cursor = self.patient_cql_connection(node2).cursor()
    self.create_ks(n2cursor, 'ks', 2)
    create_c1c2_table(self, n2cursor)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2cursor, n, 'ALL')
    node1.flush()

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # Hints are disabled, so these writes exist only on node2.
    debug("Writing data to node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2cursor, n, 'ONE')
    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Move token on node3")
    node3.move(2)

    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n2cursor, n, 'ALL')
    for n in xrange(30, 1000):
        query_c1c2(n2cursor, n, 'ALL')
def one_all_test(self):
    """Writes at ONE keep succeeding as nodes fail, while reads at ALL
    become unavailable."""
    session, session2 = self.cl_cl_prepare(ConsistencyLevel.ONE,
                                           ConsistencyLevel.ALL)

    # First node down: ONE writes fine, ALL reads unavailable.
    self.cluster.nodelist()[2].stop()
    for key in xrange(0, 100):
        insert_c1c2(session, key, ConsistencyLevel.ONE)
    assert_unavailable(query_c1c2, session2, 100, ConsistencyLevel.ALL)

    # Second node down: same expectation.
    self.cluster.nodelist()[1].stop()
    for key in xrange(0, 100):
        insert_c1c2(session, key, ConsistencyLevel.ONE)
    assert_unavailable(query_c1c2, session2, 100, ConsistencyLevel.ALL)
def non_local_read_test(self):
    """ This test reads from a coordinator we know has no copy of the data """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor = self.patient_cql_connection(node1)
    self.create_ks(cursor, 'ks', 2)
    create_c1c2_table(self, cursor)

    # With RF=2 on a 3-node ring, node1 cannot own every key locally,
    # so QUORUM forces the coordinator to read remotely.
    for key in xrange(0, 1000):
        tools.insert_c1c2(cursor, key, ConsistencyLevel.QUORUM)
        tools.query_c1c2(cursor, key, ConsistencyLevel.QUORUM)
def hintedhandoff_decom_test(self):
    """Hinted data must survive a sequence of decommissions around node4's
    restart."""
    self.cluster.populate(4).start(wait_for_binary_proto=True)
    node1, node2, node3, node4 = self.cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    create_c1c2_table(self, session)

    # Write while node4 is down so hints accumulate for it.
    node4.stop(wait_other_notice=True)
    insert_c1c2(session, n=100, consistency=ConsistencyLevel.ONE)

    node1.decommission()
    node4.start(wait_for_binary_proto=True)
    node2.decommission()
    node3.decommission()

    # Every key must still be readable from what remains of the ring.
    time.sleep(5)
    for key in xrange(0, 100):
        query_c1c2(session, key, ConsistencyLevel.ONE)
def non_local_read_test(self):
    """ This test reads from a coordinator we know has no copy of the data """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor = self.cql_connection(node1).cursor()
    self.create_ks(cursor, 'ks', 2)
    self.create_cf(cursor, 'cf')

    # RF=2 on 3 nodes: node1 misses some keys, so QUORUM goes remote.
    for key in xrange(0, 1000):
        tools.insert_c1c2(cursor, key, "QUORUM")
        tools.query_c1c2(cursor, key, "QUORUM")
def non_local_read_test(self):
    """ This test reads from a coordinator we know has no copy of the data """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    create_c1c2_table(self, session)

    # RF=2 on 3 nodes means node1 lacks some keys locally; QUORUM
    # therefore exercises remote reads through the coordinator.
    tools.insert_c1c2(session, n=1000, consistency=ConsistencyLevel.QUORUM)
    for key in xrange(0, 1000):
        tools.query_c1c2(session, key, ConsistencyLevel.QUORUM)
def one_one_test(self):
    """ONE/ONE stays available as nodes fail; reads may lag fresh writes."""
    session, session2 = self.cl_cl_prepare(
        ConsistencyLevel.ONE, ConsistencyLevel.ONE, tolerate_missing=True)

    # First node down: reads at ONE may legitimately miss fresh writes.
    self.cluster.nodelist()[2].stop()
    for key in xrange(0, 100):
        insert_c1c2(session, key, ConsistencyLevel.ONE)
        query_c1c2(session2, key, ConsistencyLevel.ONE, tolerate_missing=True)

    # Second node down: the single remaining replica must see every write.
    self.cluster.nodelist()[1].stop()
    for key in xrange(0, 100):
        insert_c1c2(session, key, ConsistencyLevel.ONE)
        query_c1c2(session2, key, ConsistencyLevel.ONE, tolerate_missing=False)
def cl_cl_prepare(self, write_cl, read_cl, tolerate_missing=False):
    """Start a 3-node RF=3 cluster, write/read 100 keys at the given
    consistency levels, and return the two sessions used (coordinators
    node1 and node2)."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    create_c1c2_table(self, session)
    session2 = self.patient_cql_connection(node2, 'ks')

    # Exercise both coordinators with the requested consistency levels.
    for key in xrange(0, 100):
        insert_c1c2(session, key, write_cl)
        query_c1c2(session2, key, read_cl, tolerate_missing)

    return session, session2
def movement_test(self):
    """Rebalance an unbalanced ring via token moves and verify data/load."""
    cluster = self.cluster
    # Deliberately skewed initial token layout.
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=30000, consistency=ConsistencyLevel.ONE)
    cluster.flush()

    # Move each node to its balanced token and wait for it to return to
    # NORMAL; even token 0 cannot be assumed balanced with m3p.
    balancing_tokens = cluster.balanced_tokens(3)
    addresses = ('127.0.0.1', '127.0.0.2', '127.0.0.3')
    for node, token, ip in zip((node1, node2, node3), balancing_tokens, addresses):
        mark = node.mark_log()
        node.move(token)
        node.watch_log_for('{} state jump to NORMAL'.format(ip),
                           from_mark=mark, timeout=180)
        time.sleep(3)

    time.sleep(1)
    cluster.cleanup()

    # Every key must remain readable after the moves.
    for key in xrange(0, 30000):
        query_c1c2(session, key, ConsistencyLevel.ONE)

    # The load should now be basically even.
    sizes = [n.data_size() for n in (node1, node2, node3)]
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal(sizes[0], sizes[2])
    assert_almost_equal(sizes[1], sizes[2])
def consistent_reads_after_bootstrap_test(self):
    """A freshly bootstrapped node3 must serve all data at CL.ALL even
    though some writes were taken while node1 was down.

    Fixes: (1) the cluster was started twice — the waits from the second
    ``cluster.start(...)`` are folded into the initial start; (2) typo
    "Boostraping" in the debug message.
    """
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False,
                                              'write_request_timeout_in_ms': 60000,
                                              'read_request_timeout_in_ms': 60000,
                                              'dynamic_snitch_badness_threshold': 0.0},
                                      batch_commitlog=True)
    cluster.populate(2).start(wait_for_binary_proto=True, wait_other_notice=True)
    node1, node2 = cluster.nodelist()

    debug("Set to talk to node 2")
    n2session = self.patient_cql_connection(node2)
    self.create_ks(n2session, 'ks', 2)
    create_c1c2_table(self, n2session)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2session, n, ConsistencyLevel.ALL)
    node1.flush()

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # Hints are disabled, so these writes live only on node2.
    debug("Writing data to only node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2session, n, ConsistencyLevel.ONE)
    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Bootstrapping node3")
    node3 = new_node(cluster)
    node3.start(wait_for_binary_proto=True)

    n3session = self.patient_cql_connection(node3)
    n3session.execute("USE ks")
    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
    for n in xrange(30, 1000):
        query_c1c2(n3session, n, ConsistencyLevel.ALL)
def movement_test(self):
    """Balance a skewed 3-node ring with token moves; data and load must
    both come out right."""
    cluster = self.cluster
    # Deliberately unbalanced starting tokens.
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    cluster.flush()

    # Move each node to a balanced token and wait for NORMAL state; even
    # token 0 cannot be assumed balanced with m3p.
    balancing_tokens = cluster.balanced_tokens(3)
    endpoints = ('127.0.0.1', '127.0.0.2', '127.0.0.3')
    for node, token, ip in zip((node1, node2, node3), balancing_tokens, endpoints):
        mark = node.mark_log()
        node.move(token)
        node.watch_log_for('{} state jump to NORMAL'.format(ip),
                           from_mark=mark, timeout=180)
        time.sleep(3)

    time.sleep(1)
    cluster.cleanup()

    # All keys must remain readable.
    for key in xrange(0, 10000):
        query_c1c2(session, key, ConsistencyLevel.ONE)

    # The ring should now be evenly loaded.
    sizes = [n.data_size() for n in (node1, node2, node3)]
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal(sizes[0], sizes[2])
    assert_almost_equal(sizes[1], sizes[2])
def all_all_test(self):
    """CL.ALL works on a full ring and fails as soon as any node is down."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.cql_connection(node1).cursor()
    self.create_ks(cursor1, 'ks', 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.cql_connection(node2, 'ks').cursor()

    # Healthy ring: every write/read at ALL must succeed.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "ALL")
        query_c1c2(cursor2, key, "ALL")

    # Any node down makes CL.ALL writes unavailable.
    node3.stop(wait_other_notice=True)
    assert_unavailable(insert_c1c2, cursor1, 100, "ALL")
def all_one_test(self):
    """Write at ALL / read at ONE; ALL writes fail once a node is down."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, "ks", 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, "ks").cursor()

    # Healthy ring: ALL writes are immediately visible to ONE reads.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "ALL")
        query_c1c2(cursor2, key, "ONE")

    # One node down: CL.ALL writes must raise Unavailable.
    node3.stop(wait_other_notice=True)
    assert_unavailable(insert_c1c2, cursor1, 100, "ALL")
def all_one_test(self):
    """ALL writes with ONE reads; a single down node breaks CL.ALL."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, 'ks', 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, 'ks').cursor()

    # Full ring: writes at ALL are visible to reads at ONE.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "ALL")
        query_c1c2(cursor2, key, "ONE")

    # With a node stopped, CL.ALL writes must be unavailable.
    node3.stop(wait_other_notice=True)
    assert_unavailable(insert_c1c2, cursor1, 100, "ALL")
def tracing_from_system_traces_test(self):
    """TRACING ON reports a session for user-table queries but never for
    queries against system_traces itself."""
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node1, = self.cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    create_c1c2_table(self, session)
    for key in xrange(100):
        insert_c1c2(session, key)

    # A traced user-table query prints its tracing session id.
    out, err = self.run_cqlsh(node1, 'TRACING ON; SELECT * FROM ks.cf')
    self.assertIn('Tracing session: ', out)

    # Queries on the system_traces tables themselves are not traced.
    out, err = self.run_cqlsh(node1, 'TRACING ON; SELECT * FROM system_traces.events')
    self.assertNotIn('Tracing session: ', out)

    out, err = self.run_cqlsh(node1, 'TRACING ON; SELECT * FROM system_traces.sessions')
    self.assertNotIn('Tracing session: ', out)
def one_one_test(self):
    """ONE/ONE remains available as nodes are stopped one at a time."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, "ks", 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, "ks").cursor()

    # Full ring: reads at ONE may race replication, so retry until seen.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor2, key, "ONE", timeout=5)

    # One node down: still available at ONE.
    node3.stop(wait_other_notice=True)
    for key in xrange(100, 200):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor2, key, "ONE", timeout=5)

    # Two nodes down: read back through the surviving coordinator.
    node2.stop(wait_other_notice=True)
    for key in xrange(200, 300):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor1, key, "ONE", timeout=5)
def _populate_cluster(self, start=True): cluster = self.cluster # Disable hinted handoff and set batch commit log so this doesn't # interfere with the test (this must be after the populate) cluster.set_configuration_options(values={'hinted_handoff_enabled': False}) cluster.set_batch_commitlog(enabled=True) debug("Starting cluster..") cluster.populate(3).start() node1, node2, node3 = cluster.nodelist() session = self.patient_cql_connection(node1) session.cluster.default_retry_policy = FlakyRetryPolicy(max_retries=15) self.create_ks(session, 'ks', 3) self.create_cf(session, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'}) # Insert 1000 keys, kill node 3, insert 1 key, restart node 3, insert 1000 more keys debug("Inserting data...") insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL) node3.flush() node3.stop(wait_other_notice=True) insert_c1c2(session, keys=(1000, ), consistency=ConsistencyLevel.TWO) node3.start(wait_other_notice=True, wait_for_binary_proto=True) insert_c1c2(session, keys=range(1001, 2001), consistency=ConsistencyLevel.ALL) cluster.flush()
def one_one_test(self):
    """ONE/ONE stays up through successive node failures."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    cursor1 = self.patient_cql_connection(node1).cursor()
    self.create_ks(cursor1, 'ks', 3)
    create_c1c2_table(self, cursor1)
    cursor2 = self.patient_cql_connection(node2, 'ks').cursor()

    # Full ring: retry reads since ONE may race replication.
    for key in xrange(0, 100):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor2, key, "ONE", timeout=5)

    # One node down: ONE remains available.
    node3.stop(wait_other_notice=True)
    for key in xrange(100, 200):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor2, key, "ONE", timeout=5)

    # Two nodes down: read via the only coordinator left.
    node2.stop(wait_other_notice=True)
    for key in xrange(200, 300):
        insert_c1c2(cursor1, key, "ONE")
        retry_till_success(query_c1c2, cursor1, key, "ONE", timeout=5)
def consistent_reads_after_move_test(self):
    """CL.ALL reads must not lose data after a token move on node3.

    Fix: removed the redundant ``cluster.start()`` — the cluster is already
    started by ``populate(...).start()`` just above.
    """
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={
        'hinted_handoff_enabled': False,
        'write_request_timeout_in_ms': 60000,
        'read_request_timeout_in_ms': 60000,
        'dynamic_snitch_badness_threshold': 0.0}, batch_commitlog=True)
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    [node1, node2, node3] = cluster.nodelist()

    debug("Set to talk to node 2")
    n2cursor = self.patient_cql_connection(node2)
    self.create_ks(n2cursor, 'ks', 2)
    create_c1c2_table(self, n2cursor)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2cursor, n, ConsistencyLevel.ALL)
    node1.flush()

    debug("Taking down node1")
    node1.stop(wait_other_notice=True)

    # Hints are disabled, so these writes live only on node2.
    debug("Writing data to node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2cursor, n, ConsistencyLevel.ONE)
    node2.flush()

    debug("Restart node1")
    node1.start(wait_other_notice=True)

    debug("Move token on node3")
    node3.move(2)

    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n2cursor, n, ConsistencyLevel.ALL)
    for n in xrange(30, 1000):
        query_c1c2(n2cursor, n, ConsistencyLevel.ALL)
def multiple_repair_test(self):
    """Run incremental repairs on node3 and node2 around staggered outages,
    then replace node3 with a fresh node and verify the full row count."""
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    def incremental_repair(node):
        # 2.2+ exposes repair() directly; older versions need explicit flags.
        if cluster.version() >= "2.2":
            node.repair()
        else:
            node.nodetool("repair -par -inc")

    debug("insert data")
    insert_c1c2(session, keys=range(1, 50), consistency=ConsistencyLevel.ALL)
    node1.flush()

    debug("bringing down node 3")
    node3.flush()
    node3.stop(gently=False)

    debug("inserting additional data into node 1 and 2")
    insert_c1c2(session, keys=range(50, 100), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node2.flush()

    debug("restarting and repairing node 3")
    node3.start(wait_for_binary_proto=True)
    incremental_repair(node3)

    # wait stream handlers to be closed on windows
    # after session is finished (See CASSANDRA-10644)
    if is_win:
        time.sleep(2)

    debug("stopping node 2")
    node2.stop(gently=False)

    debug("inserting data in nodes 1 and 3")
    insert_c1c2(session, keys=range(100, 150), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node3.flush()

    debug("start and repair node 2")
    node2.start(wait_for_binary_proto=True)
    incremental_repair(node2)

    debug("replace node and check data integrity")
    node3.stop(gently=False)
    node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000),
                 '7500', '0', None, ('127.0.0.5', 9042))
    cluster.add(node5, False)
    node5.start(replace_address='127.0.0.3', wait_other_notice=True)
    assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
def decommission_test(self):
    """Decommission node4 from a balanced 4-node ring and verify the data
    redistributes across the survivors with all keys readable at QUORUM."""
    cluster = self.cluster
    tokens = cluster.balanced_tokens(4)
    cluster.populate(4, tokens=tokens).start()
    node1, node2, node3, node4 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    insert_c1c2(session, n=30000, consistency=ConsistencyLevel.QUORUM)
    cluster.flush()

    def live_data_sizes():
        # Data sizes of the nodes that are still running.
        return [node.data_size() for node in cluster.nodelist() if node.is_running()]

    sizes = live_data_sizes()
    init_size = sizes[0]
    # With balanced tokens the initial load should be even.
    assert_almost_equal(*sizes)

    time.sleep(.5)
    node4.decommission()
    node4.stop()
    cluster.cleanup()
    time.sleep(.5)

    # Check we can get all the keys
    for n in xrange(0, 30000):
        query_c1c2(session, n, ConsistencyLevel.QUORUM)

    sizes = live_data_sizes()
    debug(sizes)
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal((2.0 / 3.0) * sizes[0], sizes[2])
    assert_almost_equal(sizes[2], init_size)
def movement_test(self):
    """Build a deliberately unbalanced 3-node ring, move every node to a
    balanced token, and verify data availability and an even load."""
    cluster = self.cluster

    # Create an unbalanced ring
    cluster.populate(3, tokens=[0, 2**48, 2**62]).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.ONE)
    cluster.flush()

    # Move nodes to balance the cluster; can't assume 0 is balanced with m3p.
    # Pre-2.1 nodetool needs the token escaped.
    balancing_tokens = cluster.balanced_tokens(3)
    escformat = '%s' if cluster.version() >= '2.1' else '\\%s'
    for node, token in zip([node1, node2, node3], balancing_tokens):
        node.move(escformat % token)
    time.sleep(1)
    cluster.cleanup()

    # Check we can get all the keys
    for n in xrange(0, 10000):
        query_c1c2(session, n, ConsistencyLevel.ONE)

    # Now the load should be basically even
    sizes = [node.data_size() for node in [node1, node2, node3]]
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal(sizes[0], sizes[2])
    assert_almost_equal(sizes[1], sizes[2])
def multiple_repair_test(self):
    """Repair node3 and node2 after staggered outages, replace node3 with a
    new node, and verify the full row count survives.

    Fix: both restarts now pass wait_for_binary_proto=True (as the sibling
    variant of this test does) so the immediately-following repair does not
    race the node coming up."""
    cluster = self.cluster
    cluster.populate(3).start()
    [node1, node2, node3] = cluster.nodelist()

    cursor = self.patient_cql_connection(node1)
    self.create_ks(cursor, 'ks', 3)
    self.create_cf(cursor, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    debug("insert data")
    for x in range(1, 50):
        insert_c1c2(cursor, x, ConsistencyLevel.ALL)
    node1.flush()

    debug("bringing down node 3")
    node3.flush()
    node3.stop(gently=False)

    debug("inserting additional data into node 1 and 2")
    for y in range(50, 100):
        insert_c1c2(cursor, y, ConsistencyLevel.TWO)
    node1.flush()
    node2.flush()

    debug("restarting and repairing node 3")
    # wait for the binary protocol so the repair can't race the restart
    node3.start(wait_for_binary_proto=True)
    if cluster.version() >= "3.0":
        node3.repair()
    else:
        node3.nodetool("repair -par -inc")

    debug("stopping node 2")
    node2.stop(gently=False)

    debug("inserting data in nodes 1 and 3")
    for z in range(100, 150):
        insert_c1c2(cursor, z, ConsistencyLevel.TWO)
    node1.flush()
    node3.flush()

    debug("start and repair node 2")
    node2.start(wait_for_binary_proto=True)
    if cluster.version() >= "3.0":
        node2.repair()
    else:
        node2.nodetool("repair -par -inc")

    debug("replace node and check data integrity")
    node3.stop(gently=False)
    node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000),
                 '7500', '0', None, ('127.0.0.5', 9042))
    cluster.add(node5, False)
    node5.start(replace_address='127.0.0.3', wait_other_notice=True)
    # keys 1..149 were written -> 149 rows expected
    assert_one(cursor, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
def _setup_multi_dc(self):
    """
    Sets up 3 DCs (2 nodes in 'dc1', and one each in 'dc2' and 'dc3').
    After set up, node2 in dc1 lacks some data and needs to be repaired.
    """
    cluster = self.cluster
    # Disable hinted handoff and set batch commit log so this doesn't
    # interfere with the test (this must be after the populate)
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False},
                                      batch_commitlog=True)
    debug("Starting cluster..")
    # populate 2 nodes in dc1, and one node each in dc2 and dc3
    cluster.populate([2, 1, 1]).start()
    # (removed an unused `version = cluster.version()` local)
    [node1, node2, node3, node4] = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    session.execute("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1, 'dc3':1};")
    session.execute("USE ks")
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    # Insert 1000 keys, kill node 2, insert 1 key, restart node 2, insert
    # 1000 more keys (comment previously said node 3; the code stops node2)
    debug("Inserting data...")
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)
    node2.flush()
    node2.stop(wait_other_notice=True)
    insert_c1c2(session, keys=(1000, ), consistency=ConsistencyLevel.THREE)
    node2.start(wait_for_binary_proto=True, wait_other_notice=True)
    node1.watch_log_for_alive(node2)
    insert_c1c2(session, keys=range(1001, 2001), consistency=ConsistencyLevel.ALL)
    cluster.flush()

    # Verify that only node2 has only 2000 keys and others have 2001 keys
    debug("Checking data...")
    self.check_rows_on_node(node2, 2000, missings=[1000])
    for node in [node1, node3, node4]:
        self.check_rows_on_node(node, 2001, found=[1000])
    return cluster
def _setup_multi_dc(self):
    """
    Sets up 3 DCs (2 nodes in 'dc1', and one each in 'dc2' and 'dc3').
    After set up, node2 in dc1 lacks some data and needs to be repaired.
    """
    cluster = self.cluster
    # Disable hinted handoff and set batch commit log so this doesn't
    # interfere with the test (this must be after the populate)
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False},
                                      batch_commitlog=True)
    debug("Starting cluster..")
    # populate 2 nodes in dc1, and one node each in dc2 and dc3
    cluster.populate([2, 1, 1]).start()
    # (removed an unused `version = cluster.version()` local)
    [node1, node2, node3, node4] = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    session.execute("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1, 'dc3':1};")
    session.execute("USE ks")
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    # Insert 1000 keys, kill node 2, insert 1 key, restart node 2, insert
    # 1000 more keys (comment previously said node 3; the code stops node2)
    debug("Inserting data...")
    for i in xrange(0, 1000):
        insert_c1c2(session, i, ConsistencyLevel.ALL)
    node2.flush()
    node2.stop()
    insert_c1c2(session, 1000, ConsistencyLevel.THREE)
    node2.start(wait_for_binary_proto=True, wait_other_notice=True)
    node1.watch_log_for_alive(node2)
    for i in xrange(1001, 2001):
        insert_c1c2(session, i, ConsistencyLevel.ALL)
    cluster.flush()

    # Verify that only node2 has only 2000 keys and others have 2001 keys
    debug("Checking data...")
    self.check_rows_on_node(node2, 2000, missings=[1000])
    for node in [node1, node3, node4]:
        self.check_rows_on_node(node, 2001, found=[1000])
    return cluster
def _simple_repair(self, order_preserving_partitioner=False, sequential=True):
    """Shared repair scenario: 3-node cluster with hints disabled, node3 is
    left missing one key (1000), a repair is run from node1, and the
    out-of-sync log lines plus node3's data are verified.

    @jira_ticket CASSANDRA-4373

    Fix: the grep_log pattern is now a raw string so the regex escape
    ``\\(`` is passed through verbatim rather than relying on Python's
    lenient handling of unknown string escapes."""
    cluster = self.cluster

    if order_preserving_partitioner:
        cluster.set_partitioner('org.apache.cassandra.dht.ByteOrderedPartitioner')

    # Disable hinted handoff and set batch commit log so this doesn't
    # interfere with the test (this must be after the populate)
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False},
                                      batch_commitlog=True)
    debug("Starting cluster..")
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    # Insert 1000 keys, kill node 3, insert 1 key, restart node 3, insert 1000 more keys
    debug("Inserting data...")
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)
    node3.flush()
    node3.stop(wait_other_notice=True)
    insert_c1c2(session, keys=(1000, ), consistency=ConsistencyLevel.TWO)
    node3.start(wait_other_notice=True, wait_for_binary_proto=True)
    insert_c1c2(session, keys=range(1001, 2001), consistency=ConsistencyLevel.ALL)
    cluster.flush()

    # Verify that node3 has only 2000 keys
    debug("Checking data on node3...")
    self.check_rows_on_node(node3, 2000, missings=[1000])
    # Verify that node1 has 2001 keys
    debug("Checking data on node1...")
    self.check_rows_on_node(node1, 2001, found=[1000])
    # Verify that node2 has 2001 keys
    debug("Checking data on node2...")
    self.check_rows_on_node(node2, 2001, found=[1000])

    time.sleep(10)  # see CASSANDRA-4373

    # Run repair
    start = time.time()
    debug("starting repair...")
    node1.repair(self._repair_options(ks='ks', sequential=sequential))
    debug("Repair time: {end}".format(end=time.time() - start))

    # Validate that only one range was transfered
    out_of_sync_logs = node1.grep_log(r"/([0-9.]+) and /([0-9.]+) have ([0-9]+) range\(s\) out of sync")
    if cluster.version() > "1":
        self.assertEqual(len(out_of_sync_logs), 2,
                         "Lines matching: " + str([elt[0] for elt in out_of_sync_logs]))
    else:
        # In pre-1.0, we should have only one line
        self.assertEqual(len(out_of_sync_logs), 1,
                         "Lines matching: " + str([elt[0] for elt in out_of_sync_logs]))

    # Each endpoint pair involving node3 may be reported in either order;
    # remove both orderings once a pair is seen so duplicates would fail.
    valid = [(node1.address(), node3.address()),
             (node3.address(), node1.address()),
             (node2.address(), node3.address()),
             (node3.address(), node2.address())]
    for line, m in out_of_sync_logs:
        self.assertEqual(int(m.group(3)), 1,
                         "Expecting 1 range out of sync, got " + m.group(3))
        self.assertIn((m.group(1), m.group(2)), valid,
                      str((m.group(1), m.group(2))))
        valid.remove((m.group(1), m.group(2)))
        valid.remove((m.group(2), m.group(1)))

    # Check node3 now has the key
    self.check_rows_on_node(node3, 2001, found=[1000], restart=False)
def decommission_test(self):
    """
    Decommission node4 from a balanced 4-node ring and verify that the
    remaining nodes absorb its data with all keys readable at QUORUM.
    On clusters at version <= 1.2, additionally exercise the legacy
    removeToken path and re-add a node just past the removed token.
    """
    cluster = self.cluster

    tokens = cluster.balanced_tokens(4)
    cluster.populate(4, tokens=tokens).start()
    node1, node2, node3, node4 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 2)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    insert_c1c2(session, n=10000, consistency=ConsistencyLevel.QUORUM)
    cluster.flush()

    # With balanced tokens the initial load should be even across nodes.
    sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
    init_size = sizes[0]
    assert_almost_equal(*sizes)

    time.sleep(.5)
    node4.decommission()
    node4.stop()
    cluster.cleanup()
    time.sleep(.5)

    # Check we can get all the keys
    for n in xrange(0, 10000):
        query_c1c2(session, n, ConsistencyLevel.QUORUM)

    # Remember the 3-node size distribution for the final comparison below.
    sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
    three_node_sizes = sizes
    assert_almost_equal(sizes[0], sizes[1])
    assert_almost_equal((2.0 / 3.0) * sizes[0], sizes[2])
    assert_almost_equal(sizes[2], init_size)

    if cluster.version() <= '1.2':
        # Legacy path: remove node3's token, verify the remaining two nodes
        # each carry double the initial load, then add a new node just past
        # the removed token to restore a 3-node layout.
        node3.stop(wait_other_notice=True)
        node1.removeToken(tokens[2])
        time.sleep(.5)
        cluster.cleanup()
        time.sleep(.5)

        # Check we can get all the keys
        for n in xrange(0, 10000):
            query_c1c2(session, n, ConsistencyLevel.QUORUM)

        sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
        assert_almost_equal(*sizes)
        assert_almost_equal(sizes[0], 2 * init_size)

        node5 = new_node(cluster, token=(tokens[2] + 1)).start()
        time.sleep(.5)
        cluster.cleanup()
        time.sleep(.5)
        cluster.compact()
        time.sleep(.5)

        # Check we can get all the keys
        for n in xrange(0, 10000):
            query_c1c2(session, n, ConsistencyLevel.QUORUM)

        sizes = [node.data_size() for node in cluster.nodelist() if node.is_running()]
        # We should be back to the earlier 3 nodes situation
        for i in xrange(0, len(sizes)):
            assert_almost_equal(sizes[i], three_node_sizes[i])
def simple_rebuild_test(self):
    """
    @jira_ticket CASSANDRA-9119

    Test rebuild from other dc works as expected: two concurrent
    `nodetool rebuild` invocations must not both succeed — exactly one is
    expected to fail with "Node is still rebuilding".
    """
    keys = 1000

    cluster = self.cluster
    cluster.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.PropertyFileSnitch'})
    node1 = cluster.create_node('node1', False,
                                ('127.0.0.1', 9160),
                                ('127.0.0.1', 7000),
                                '7100', '2000', None,
                                binary_interface=('127.0.0.1', 9042))
    cluster.add(node1, True, data_center='dc1')

    # start node in dc1
    node1.start(wait_for_binary_proto=True)

    # populate data in dc1
    session = self.patient_exclusive_cql_connection(node1)
    self.create_ks(session, 'ks', {'dc1': 1})
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=keys, consistency=ConsistencyLevel.LOCAL_ONE)

    # check data
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.LOCAL_ONE)
    session.shutdown()

    # Bootstrapping a new node in dc2 with auto_bootstrap: false
    node2 = cluster.create_node('node2', False,
                                ('127.0.0.2', 9160),
                                ('127.0.0.2', 7000),
                                '7200', '2001', None,
                                binary_interface=('127.0.0.2', 9042))
    cluster.add(node2, False, data_center='dc2')
    node2.start(wait_other_notice=True, wait_for_binary_proto=True)

    # wait for snitch to reload
    time.sleep(60)

    # alter keyspace to replicate to dc2
    session = self.patient_exclusive_cql_connection(node2)
    session.execute("ALTER KEYSPACE ks WITH REPLICATION = {'class':'NetworkTopologyStrategy', 'dc1':1, 'dc2':1};")
    # alter system_auth -- rebuilding it no longer possible after
    # CASSANDRA-11848 prevented local node from being considered a source
    session.execute("ALTER KEYSPACE system_auth WITH REPLICATION = {'class':'NetworkTopologyStrategy', 'dc1':1, 'dc2':1};")
    session.execute('USE ks')

    self.rebuild_errors = 0

    # rebuild dc2 from dc1
    def rebuild():
        # Count the expected "still rebuilding" failure; any other
        # nodetool failure is re-raised and fails the test.
        try:
            node2.nodetool('rebuild dc1')
        except ToolError as e:
            if 'Node is still rebuilding' in e.stdout:
                self.rebuild_errors += 1
            else:
                raise e

    class Runner(Thread):
        # Thread wrapper that records any exception raised by func so the
        # main thread can re-raise it after join().
        def __init__(self, func):
            Thread.__init__(self)
            self.func = func
            self.thread_exc_info = None

        def run(self):
            """
            Closes over self to catch any exceptions raised by func and
            register them at self.thread_exc_info
            Based on http://stackoverflow.com/a/1854263
            """
            try:
                self.func()
            except Exception:
                import sys
                self.thread_exc_info = sys.exc_info()

    cmd1 = Runner(rebuild)
    cmd1.start()

    # concurrent rebuild should not be allowed (CASSANDRA-9119)
    # (following sleep is needed to avoid conflict in 'nodetool()' method setting up env.)
    time.sleep(.1)

    # we don't need to manually raise exceptions here -- already handled
    rebuild()

    cmd1.join()

    # manually raise exception from cmd1 thread
    # see http://stackoverflow.com/a/1854263
    if cmd1.thread_exc_info is not None:
        raise cmd1.thread_exc_info[1], None, cmd1.thread_exc_info[2]

    # exactly 1 of the two nodetool calls should fail
    # usually it will be the one in the main thread,
    # but occasionally it wins the race with the one in the secondary thread,
    # so we check that one succeeded and the other failed
    self.assertEqual(self.rebuild_errors, 1,
                     msg='rebuild errors should be 1, but found {}. Concurrent rebuild should not be allowed, but one rebuild command should have succeeded.'.format(self.rebuild_errors))

    # check data
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.LOCAL_ONE)
def taketoken_test(self):
    """Move 8 of node1's 10 vnode tokens to node3 with `nodetool taketoken`
    and verify both the token counts and that no data was lost."""
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={'initial_token': None,
                                              'num_tokens': 10,
                                              'hinted_handoff_enabled': False,
                                              'write_request_timeout_in_ms': 60000,
                                              'read_request_timeout_in_ms': 60000,
                                              'dynamic_snitch_badness_threshold': 0.0},
                                      batch_commitlog=True)
    # populate(...).start() already starts every node; the second
    # cluster.start() call that used to follow was redundant and removed.
    cluster.populate(3).start()
    [node1, node2, node3] = cluster.nodelist()

    debug("Set to talk to node 2")
    n2cursor = self.patient_cql_connection(node2).cursor()
    self.create_ks(n2cursor, 'ks', 2)
    create_c1c2_table(self, n2cursor)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2cursor, n, 'ALL')
    node1.flush()

    debug("Writing data to node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2cursor, n, 'ONE')
    node2.flush()

    debug("Getting token from node 1")
    n1cursor = self.patient_cql_connection(node1).cursor()
    n1cursor.execute('SELECT tokens FROM system.local')
    n1tokens = n1cursor.fetchone()

    n3cursor = self.patient_cql_connection(node3).cursor()
    n3cursor.execute('SELECT tokens FROM system.local')
    n3tokens = n3cursor.fetchone()

    debug("Relocate tokens from node1 to node3")
    # Take the first 8 tokens, escaping each so negative tokens survive
    # nodetool argument parsing.
    tl = "".join(" \\%s" % t for t in list(n1tokens[0])[:8])
    cmd = "taketoken %s" % tl
    debug(cmd)
    node3.nodetool(cmd)
    time.sleep(1)

    debug("Check that the tokens were really moved")
    n3cursor.execute('SELECT tokens FROM system.local')
    n3tokens = n3cursor.fetchone()
    n1cursor.execute('SELECT tokens FROM system.local')
    n1tokens = n1cursor.fetchone()
    debug("n1 %s n3 %s" % (n1tokens, n3tokens))

    # node3 gained 8 tokens (10 + 8 = 18); node1 kept 2 (10 - 8 = 2)
    assert len(n3tokens[0]) == 18
    assert len(n1tokens[0]) == 2

    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n2cursor, n, 'ALL')
    for n in xrange(30, 1000):
        query_c1c2(n2cursor, n, 'ALL')
def _simple_repair(self, order_preserving_partitioner=False, sequential=True):
    """
    * Configure a three node cluster to not use hinted handoff, and to use batch commitlog
    * Launch the cluster
    * Create a keyspace at RF 3 and table
    * Insert one thousand rows at CL ALL
    * Flush on node3 and shut it down
    * Insert one row at CL TWO
    * Restart node3
    * Insert one thousand more rows at CL ALL
    * Flush all nodes
    * Check node3 only has 2000 keys
    * Check node1 and node2 have 2001 keys
    * Perform the repair type specified by the parent test
    * Assert the appropriate messages are logged
    * Assert node3 now has all data

    @jira_ticket CASSANDRA-4373

    Fix: the grep_log pattern is now a raw string so the regex escape
    ``\\(`` is passed through verbatim instead of depending on Python's
    lenient treatment of unknown string escapes.
    """
    cluster = self.cluster

    if order_preserving_partitioner:
        cluster.set_partitioner('org.apache.cassandra.dht.ByteOrderedPartitioner')

    # Disable hinted handoff and set batch commit log so this doesn't
    # interfere with the test (this must be after the populate)
    cluster.set_configuration_options(values={'hinted_handoff_enabled': False},
                                      batch_commitlog=True)
    debug("Starting cluster..")
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    self.create_cf(session, 'cf', read_repair=0.0,
                   columns={'c1': 'text', 'c2': 'text'})

    # Insert 1000 keys, kill node 3, insert 1 key, restart node 3, insert 1000 more keys
    debug("Inserting data...")
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)
    node3.flush()
    node3.stop(wait_other_notice=True)
    insert_c1c2(session, keys=(1000, ), consistency=ConsistencyLevel.TWO)
    node3.start(wait_other_notice=True, wait_for_binary_proto=True)
    insert_c1c2(session, keys=range(1001, 2001), consistency=ConsistencyLevel.ALL)
    cluster.flush()

    # Verify that node3 has only 2000 keys
    debug("Checking data on node3...")
    self.check_rows_on_node(node3, 2000, missings=[1000])
    # Verify that node1 has 2001 keys
    debug("Checking data on node1...")
    self.check_rows_on_node(node1, 2001, found=[1000])
    # Verify that node2 has 2001 keys
    debug("Checking data on node2...")
    self.check_rows_on_node(node2, 2001, found=[1000])

    time.sleep(10)  # see CASSANDRA-4373

    # Run repair
    start = time.time()
    debug("starting repair...")
    node1.repair(_repair_options(self.cluster.version(), ks='ks', sequential=sequential))
    debug("Repair time: {end}".format(end=time.time() - start))

    # Validate that only one range was transfered
    out_of_sync_logs = node1.grep_log(r"/([0-9.]+) and /([0-9.]+) have ([0-9]+) range\(s\) out of sync")
    self.assertEqual(len(out_of_sync_logs), 2,
                     "Lines matching: " + str([elt[0] for elt in out_of_sync_logs]))

    # Sets make the endpoint pairs order-insensitive.
    valid_out_of_sync_pairs = [{node1.address(), node3.address()},
                               {node2.address(), node3.address()}]

    for line, m in out_of_sync_logs:
        num_out_of_sync_ranges, out_of_sync_nodes = m.group(3), {m.group(1), m.group(2)}
        self.assertEqual(int(num_out_of_sync_ranges), 1,
                         "Expecting 1 range out of sync for {}, but saw {}".format(out_of_sync_nodes, line))
        self.assertIn(out_of_sync_nodes, valid_out_of_sync_pairs,
                      str(out_of_sync_nodes))

    # Check node3 now has the key
    self.check_rows_on_node(node3, 2001, found=[1000], restart=False)
def multiple_repair_test(self):
    """
    * Launch a three node cluster
    * Create a keyspace with RF 3 and a table
    * Insert 49 rows
    * Stop node3
    * Insert 50 more rows
    * Restart node3
    * Issue an incremental repair on node3
    * Stop node2
    * Insert a final 50 rows
    * Restart node2
    * Issue an incremental repair on node2
    * Replace node3 with a new node
    * Verify data integrity
    # TODO: Several more verifications of data need to be interspersed throughout the test. The final assertion is insufficient.
    @jira_ticket CASSANDRA-10644
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1, node2, node3 = cluster.nodelist()

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'ks', 3)
    self.create_cf(session, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})

    debug("insert data")
    insert_c1c2(session, keys=range(1, 50), consistency=ConsistencyLevel.ALL)
    node1.flush()

    debug("bringing down node 3")
    node3.flush()
    node3.stop(gently=False)

    debug("inserting additional data into node 1 and 2")
    insert_c1c2(session, keys=range(50, 100), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node2.flush()

    debug("restarting and repairing node 3")
    node3.start(wait_for_binary_proto=True)

    # 2.2+ exposes repair() directly; older versions need the explicit flags
    if cluster.version() >= "2.2":
        node3.repair()
    else:
        node3.nodetool("repair -par -inc")

    # wait stream handlers to be closed on windows
    # after session is finished (See CASSANDRA-10644)
    if is_win:
        time.sleep(2)

    debug("stopping node 2")
    node2.stop(gently=False)

    debug("inserting data in nodes 1 and 3")
    insert_c1c2(session, keys=range(100, 150), consistency=ConsistencyLevel.TWO)
    node1.flush()
    node3.flush()

    debug("start and repair node 2")
    node2.start(wait_for_binary_proto=True)
    if cluster.version() >= "2.2":
        node2.repair()
    else:
        node2.nodetool("repair -par -inc")

    debug("replace node and check data integrity")
    node3.stop(gently=False)
    node5 = Node('node5', cluster, True, ('127.0.0.5', 9160), ('127.0.0.5', 7000),
                 '7500', '0', None, ('127.0.0.5', 9042))
    cluster.add(node5, False)
    node5.start(replace_address='127.0.0.3', wait_other_notice=True)

    # keys 1..149 were written, so 149 rows are expected
    assert_one(session, "SELECT COUNT(*) FROM ks.cf LIMIT 200", [149])
def consistent_reads_after_relocate_test(self):
    """Relocate 8 of node1's vnode tokens to node3 with `nodetool taketoken`
    after a node3 outage, and verify no data was lost.

    Fixes: the debug messages claimed node1 was being stopped/restarted
    while the code acts on node3; and the final token-count assertion
    inspected the fetched row tuple (`len(tokens)`) instead of the token
    collection itself (`len(tokens[0])`), as the sibling taketoken test does."""
    debug("Creating a ring")
    cluster = self.cluster
    cluster.set_configuration_options(values={'initial_token': None,
                                              'num_tokens': 10,
                                              'hinted_handoff_enabled': False,
                                              'write_request_timeout_in_ms': 60000,
                                              'read_request_timeout_in_ms': 60000,
                                              'dynamic_snitch_badness_threshold': 0.0},
                                      batch_commitlog=True)
    # populate(...).start() already starts every node; the second
    # cluster.start() call that used to follow was redundant and removed.
    cluster.populate(3).start()
    [node1, node2, node3] = cluster.nodelist()

    debug("Set to talk to node 2")
    n2cursor = self.patient_cql_connection(node2).cursor()
    self.create_ks(n2cursor, 'ks', 2)
    create_c1c2_table(self, n2cursor)

    debug("Generating some data for all nodes")
    for n in xrange(10, 20):
        insert_c1c2(n2cursor, n, 'ALL')
    node1.flush()

    debug("Taking down node3")
    node3.stop(wait_other_notice=True)

    debug("Writing data to node2")
    for n in xrange(30, 1000):
        insert_c1c2(n2cursor, n, 'ONE')
    node2.flush()

    debug("Restart node3")
    node3.start(wait_other_notice=True)

    debug("Getting token from node 1")
    n1cursor = self.patient_cql_connection(node1).cursor()
    n1cursor.execute('SELECT tokens FROM system.local')
    tokens = n1cursor.fetchone()

    debug("Relocate tokens from node1 to node3")
    tl = " ".join(str(t) for t in list(tokens[0])[:8])
    cmd = "taketoken %s" % tl
    debug(cmd)
    node3.nodetool(cmd)

    n1cursor.execute('SELECT tokens FROM system.local')
    tokens = n1cursor.fetchone()
    debug("%s" % tokens)
    # node1 started with num_tokens=10 and gave away 8, so 2 remain
    assert len(tokens[0]) == 2

    debug("Checking that no data was lost")
    for n in xrange(10, 20):
        query_c1c2(n2cursor, n, 'ALL')
    for n in xrange(30, 1000):
        query_c1c2(n2cursor, n, 'ALL')
def simple_rebuild_test(self):
    """
    @jira_ticket CASSANDRA-9119

    Test rebuild from other dc works as expected: two concurrent
    `nodetool rebuild` invocations must not both succeed — exactly one is
    expected to fail with "Node is still rebuilding".

    Fixes: unexpected NodetoolErrors are no longer silently swallowed (they
    are re-raised), and the main thread now goes through the same checked
    rebuild() helper so only the expected error increments the counter.
    """
    keys = 1000

    cluster = self.cluster
    cluster.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.PropertyFileSnitch'})
    node1 = cluster.create_node('node1', False,
                                ('127.0.0.1', 9160),
                                ('127.0.0.1', 7000),
                                '7100', '2000', None,
                                binary_interface=('127.0.0.1', 9042))
    cluster.add(node1, True, data_center='dc1')

    # start node in dc1
    node1.start(wait_for_binary_proto=True)

    # populate data in dc1
    session = self.patient_exclusive_cql_connection(node1)
    self.create_ks(session, 'ks', {'dc1': 1})
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=keys, consistency=ConsistencyLevel.ALL)

    # check data
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.ALL)
    session.shutdown()

    # Bootstrapping a new node in dc2 with auto_bootstrap: false
    node2 = cluster.create_node('node2', False,
                                ('127.0.0.2', 9160),
                                ('127.0.0.2', 7000),
                                '7200', '2001', None,
                                binary_interface=('127.0.0.2', 9042))
    cluster.add(node2, False, data_center='dc2')
    node2.start(wait_other_notice=True, wait_for_binary_proto=True)

    # wait for snitch to reload
    time.sleep(60)

    # alter keyspace to replicate to dc2
    session = self.patient_exclusive_cql_connection(node2)
    session.execute("ALTER KEYSPACE ks WITH REPLICATION = {'class':'NetworkTopologyStrategy', 'dc1':1, 'dc2':1};")
    session.execute('USE ks')

    self.rebuild_errors = 0

    # rebuild dc2 from dc1
    def rebuild():
        # Count the expected "still rebuilding" failure; re-raise anything
        # else instead of silently swallowing it.
        try:
            node2.nodetool('rebuild dc1')
        except NodetoolError as e:
            if 'Node is still rebuilding' in e.message:
                self.rebuild_errors += 1
            else:
                raise

    cmd1 = Thread(target=rebuild)
    cmd1.start()

    # concurrent rebuild should not be allowed (CASSANDRA-9119)
    # (following sleep is needed to avoid conflict in 'nodetool()' method setting up env.)
    time.sleep(.1)
    rebuild()
    cmd1.join()

    # exactly 1 of the two nodetool calls should fail
    # usually it will be the one in the main thread,
    # but occasionally it wins the race with the one in the secondary thread,
    # so we check that one succeeded and the other failed
    self.assertEqual(self.rebuild_errors, 1,
                     msg='concurrent rebuild should not be allowed, but one rebuild command should have succeeded.')

    # check data
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.ALL)
def rebuild_ranges_test(self):
    """
    @jira_ticket CASSANDRA-10406

    Rebuild only a specific keyspace/token-range on the dc2 node and check
    that ks1 data streamed while ks2 data did not.
    """
    keys = 1000

    cluster = self.cluster
    tokens = cluster.balanced_tokens_across_dcs(['dc1', 'dc2'])
    cluster.set_configuration_options(values={'endpoint_snitch': 'org.apache.cassandra.locator.PropertyFileSnitch'})
    cluster.set_configuration_options(values={'num_tokens': 1})

    node1 = cluster.create_node('node1', False,
                                ('127.0.0.1', 9160), ('127.0.0.1', 7000),
                                '7100', '2000', tokens[0],
                                binary_interface=('127.0.0.1', 9042))
    node1.set_configuration_options(values={'initial_token': tokens[0]})
    cluster.add(node1, True, data_center='dc1')
    node1 = cluster.nodelist()[0]

    # start node in dc1
    node1.start(wait_for_binary_proto=True)

    # populate data in dc1
    session = self.patient_exclusive_cql_connection(node1)

    # ks1 will be rebuilt in node2
    self.create_ks(session, 'ks1', {'dc1': 1})
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=keys, consistency=ConsistencyLevel.ALL)

    # ks2 will not be rebuilt in node2
    self.create_ks(session, 'ks2', {'dc1': 1})
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=keys, consistency=ConsistencyLevel.ALL)
    session.shutdown()

    # Bootstraping a new node in dc2 with auto_bootstrap: false
    node2 = cluster.create_node('node2', False,
                                ('127.0.0.2', 9160), ('127.0.0.2', 7000),
                                '7200', '2001', tokens[1],
                                binary_interface=('127.0.0.2', 9042))
    node2.set_configuration_options(values={'initial_token': tokens[1]})
    cluster.add(node2, False, data_center='dc2')
    node2.start(wait_other_notice=True, wait_for_binary_proto=True)

    # wait for snitch to reload
    time.sleep(60)

    # alter both keyspaces to replicate to dc2
    session = self.patient_exclusive_cql_connection(node2)
    session.execute("ALTER KEYSPACE ks1 WITH REPLICATION = {'class':'NetworkTopologyStrategy', 'dc1':1, 'dc2':1};")
    session.execute("ALTER KEYSPACE ks2 WITH REPLICATION = {'class':'NetworkTopologyStrategy', 'dc1':1, 'dc2':1};")
    session.execute('USE ks1')

    # rebuild only ks1 with range that is node1's replica
    node2.nodetool('rebuild -ks ks1 -ts (%s,%s] dc1' % (tokens[1], str(pow(2, 63) - 1)))

    # check data is sent by stopping node1
    node1.stop()
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.ONE)

    # ks2 should not be streamed
    session.execute('USE ks2')
    for i in xrange(0, keys):
        query_c1c2(session, i, ConsistencyLevel.ONE,
                   tolerate_missing=True, must_be_missing=True)