Example #1
0
    def test_simultaneous_bootstrap(self):
        """
        Attempt to bootstrap two nodes at once, to assert the second bootstrapped node fails, and does not interfere.

        Start a one node cluster and run a stress write workload.
        Start up a second node, and wait for the first node to detect it has joined the cluster.
        While the second node is bootstrapping, start a third node. This should fail.

        @jira_ticket CASSANDRA-7069
        @jira_ticket CASSANDRA-9484
        """

        bootstrap_error = "Other bootstrapping/leaving/moving nodes detected," \
                          " cannot bootstrap while cassandra.consistent.rangemovement is true"

        cluster = self.cluster
        cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(1)
        cluster.start()

        node1, = cluster.nodelist()

        node1.stress([
            'write', 'n=500K', 'no-warmup', '-schema', 'replication(factor=1)',
            '-rate', 'threads=10'
        ])

        node2 = new_node(cluster)
        node2.start()

        # Poll node1's `nodetool status` output until it lists node2 with the
        # "UJ" state. The address is escaped so that its dots only match
        # literal dots. This wait is best effort: after 30 polls (0.1s apart)
        # the test carries on regardless.
        node2_joining = re.compile(r'UJ\s+' + re.escape(node2.ip_addr))
        for _ in range(30):
            ntout = node1.nodetool('status').stdout
            if node2_joining.search(ntout):
                break
            time.sleep(0.1)

        node3 = new_node(cluster, remote_debug_port='2003')
        try:
            node3.start(wait_other_notice=False, verbose=False)
        except NodeError:
            pass  # node doesn't start as expected

        time.sleep(.5)
        node2.watch_log_for("Starting listening for CQL clients")

        # The third node must have logged the refusal to bootstrap
        node3.watch_log_for(bootstrap_error)

        session = self.patient_exclusive_cql_connection(node2)

        # Repeat the select count(*) query, to help catch
        # bugs like 9484, where count(*) fails at higher
        # data loads.
        for _ in range(5):
            assert_one(session,
                       "SELECT count(*) from keyspace1.standard1", [500000],
                       cl=ConsistencyLevel.ONE)
    def simultaneous_bootstrap_test(self):
        """
        Attempt to bootstrap two nodes at once, to assert the second bootstrapped node fails, and does not interfere.

        Start a one node cluster and run a stress write workload.
        Start up a second node, and wait for the first node to detect it has joined the cluster.
        While the second node is bootstrapping, start a third node. This should fail.

        @jira_ticket CASSANDRA-7069
        @jira_ticket CASSANDRA-9484
        """

        bootstrap_error = (
            "Other bootstrapping/leaving/moving nodes detected,"
            " cannot bootstrap while cassandra.consistent.rangemovement is true"
        )

        # NOTE(review): presumably this keeps the harness from flagging the
        # expected error message as a test failure -- confirm against the base class.
        self.ignore_log_patterns.append(bootstrap_error)

        cluster = self.cluster
        cluster.populate(1)
        cluster.start(wait_for_binary_proto=True)

        node1, = cluster.nodelist()

        node1.stress([
            'write', 'n=500K', 'no-warmup', '-schema', 'replication(factor=1)',
            '-rate', 'threads=10'
        ])

        node2 = new_node(cluster)
        node2.start(wait_other_notice=True)

        node3 = new_node(cluster, remote_debug_port='2003')
        process = node3.start(wait_other_notice=False)
        # Only stderr is inspected; stdout is discarded.
        _, stderr = process.communicate()
        self.assertIn(bootstrap_error, stderr, msg=stderr)
        time.sleep(.5)
        self.assertFalse(node3.is_running(),
                         msg="Two nodes bootstrapped simultaneously")

        node2.watch_log_for("Starting listening for CQL clients")

        session = self.patient_exclusive_cql_connection(node2)

        # Repeat the select count(*) query, to help catch
        # bugs like 9484, where count(*) fails at higher
        # data loads.
        # `range` (not `xrange`) so the loop also runs on Python 3.
        for _ in range(5):
            assert_one(session,
                       "SELECT count(*) from keyspace1.standard1", [500000],
                       cl=ConsistencyLevel.ONE)
    def test_bootstrap_waits_for_streaming_to_finish(self):
        """
        Check that a bootstrap completes, and is recorded as completed, once streaming has finished.
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

        logger.debug("Create a cluster")
        cluster.populate(1)
        seed_node = cluster.nodelist()[0]

        logger.debug("Start node 1")
        seed_node.start(wait_for_binary_proto=True)

        logger.debug("Insert 10k rows")
        stress_args = [
            'write', 'n=10K', 'no-warmup',
            '-rate', 'threads=8',
            '-schema', 'replication(factor=2)',
        ]
        seed_node.stress(stress_args)

        logger.debug("Bootstrap node 2 with delay")
        joiner = new_node(cluster, byteman_port='4200')
        # The byteman script is installed before the node is started
        joiner.update_startup_byteman_script('./byteman/bootstrap_5s_sleep.btm')
        joiner.start(wait_for_binary_proto=True)

        assert_bootstrap_state(self, joiner, 'COMPLETED')
        assert joiner.grep_log('Bootstrap completed', filename='debug.log')
    def decommissioned_wiped_node_can_gossip_to_single_seed_test(self):
        """
        @jira_ticket CASSANDRA-8072
        @jira_ticket CASSANDRA-8422
        Decommission a node, stop it and wipe its data, then check that it is able to
        join a cluster that has a single seed node.
        """
        cluster = self.cluster
        cluster.populate(1)
        cluster.start(wait_for_binary_proto=True)

        # bootstrap=True ensures that the added node is not a seed
        wiped_node = new_node(cluster, bootstrap=True)
        wiped_node.start(wait_for_binary_proto=True, wait_other_notice=True)

        # Decommission the added node, then stop it
        debug("Decommissioning & stopping node2")
        wiped_node.decommission()
        wiped_node.stop(wait_other_notice=False)

        # Remove every data directory ...
        for path in wiped_node.data_directories():
            debug("Deleting {}".format(path))
            shutil.rmtree(path)

        # ... and the commit log directory
        commitlog_path = os.path.join(wiped_node.get_path(), 'commitlogs')
        debug("Deleting {}".format(commitlog_path))
        shutil.rmtree(commitlog_path)

        # Restart the wiped node; it should be allowed to join
        log_mark = wiped_node.mark_log()
        debug("Restarting wiped node2")
        wiped_node.start(wait_other_notice=False)
        wiped_node.watch_log_for("JOINING:", from_mark=log_mark)
    def _do_upgrade(self, login_keyspace=True):
        """
        Restart node1 on self.default_install_dir and return a CQL connection to it.

        node1 is flushed, stopped, pointed at the default install directory and
        started again. When self.bootstrap is set, the whole cluster is pointed at
        that install directory, a second (non-seed) node is bootstrapped, and the
        data directories are checked for leftover temporary files.

        If login_keyspace is true, 'USE ks' is executed on the connection before
        it is returned.
        """
        cluster = self.cluster
        node1 = cluster.nodelist()[0]

        node1.flush()
        time.sleep(.5)
        node1.stop(wait_other_notice=True)

        node1.set_install_dir(install_dir=self.default_install_dir)
        node1.start(wait_other_notice=True, wait_for_binary_proto=True)

        if self.bootstrap:
            cluster.set_install_dir(install_dir=self.default_install_dir)
            # Add a new node, bootstrap=True ensures that it is not a seed
            node2 = new_node(cluster, bootstrap=True)
            node2.start(wait_for_binary_proto=True, jvm_args=self.jvm_args)

            temp_files = self.glob_data_dirs(os.path.join('*', "tmp", "*.dat"))
            debug("temp files: " + str(temp_files))
            # assertEqual rather than the deprecated assertEquals alias
            self.assertEqual(0, len(temp_files),
                             "Temporary files were not cleaned up.")

        cursor = self.patient_cql_connection(node1)
        if login_keyspace:
            cursor.execute('USE ks')
        return cursor
 def test_cleanup(self):
     """
     @jira_ticket CASSANDRA-11179
     Make sure we remove processed files during cleanup

     Writes data to a single node in five flushed batches with compaction
     disabled, adds a second node, then runs `nodetool cleanup` on the first
     node while self._monitor_datadir runs in a background thread. The test
     fails if that monitor sets the `failed` event.
     """
     cluster = self.cluster
     cluster.set_configuration_options(values={'concurrent_compactors': 4})
     cluster.populate(1)
     cluster.start(wait_for_binary_proto=True)
     node1, = cluster.nodelist()
     # Five stress runs, each followed by a flush
     for _ in range(0, 5):
         node1.stress([
             'write', 'n=100k', 'no-warmup', '-schema',
             'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)',
             'replication(factor=1)', '-rate', 'threads=10'
         ])
         node1.flush()
     node2 = new_node(cluster)
     node2.start(wait_for_binary_proto=True, wait_other_notice=True)
     event = threading.Event()
     failed = threading.Event()
     jobs = 1
     thread = threading.Thread(
         target=self._monitor_datadir,
         args=(node1, event,
               len(node1.get_sstables("keyspace1",
                                      "standard1")), jobs, failed))
     # The `daemon` attribute replaces the deprecated Thread.setDaemon()
     thread.daemon = True
     thread.start()
     node1.nodetool("cleanup -j {} keyspace1 standard1".format(jobs))
     # Signal the monitor once cleanup has returned, then wait for it to exit
     event.set()
     thread.join()
     assert not failed.is_set()
    def _wiped_node_cannot_join_test(self, gently):
        """
        @jira_ticket CASSANDRA-9765
        Stop a non-seed node, wipe its data, and check that it is then refused when it
        tries to join. The gently parameter selects between a nice shutdown and a
        forced shutdown of the node.
        """
        cluster = self.cluster
        cluster.populate(3)
        cluster.start(wait_for_binary_proto=True)

        select_all = "SELECT * FROM {}".format('keyspace1.standard1')

        # write some data through the first node
        first_node = cluster.nodelist()[0]
        first_node.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8'])

        session = self.patient_cql_connection(first_node)
        rows_before = list(session.execute(select_all))

        # bootstrap=True ensures that the added node is not a seed
        joiner = new_node(cluster, bootstrap=True)
        joiner.start(wait_for_binary_proto=True)

        # the added node must return the same rows
        session = self.patient_cql_connection(joiner)
        rows_on_joiner = list(session.execute(select_all))
        assert rows_before == rows_on_joiner

        # Stop the added node and wipe its data
        joiner.stop(gently=gently)
        self._cleanup(joiner)

        # Start it again; the join is expected to be cancelled
        log_mark = joiner.mark_log()
        joiner.start(no_wait=True, wait_other_notice=False)
        rejection = "A node with address {} already exists, cancelling join".format(
            joiner.address_for_current_version_slashy())
        joiner.watch_log_for(rejection, from_mark=log_mark)
    def bootstrap_test(self):
        """ Check that repaired data is still reported as in sync after a node bootstraps """
        cluster = self.cluster
        config = {'hinted_handoff_enabled': False, 'commitlog_sync_period_in_ms': 500}
        cluster.set_configuration_options(values=config)
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_exclusive_cql_connection(node3)
        session.execute("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 2}")
        session.execute("CREATE TABLE ks.tbl (k INT PRIMARY KEY, v INT)")

        insert = SimpleStatement("INSERT INTO ks.tbl (k,v) VALUES (%s, %s)")

        # first batch of rows, written before the repair
        for key in range(1000):
            session.execute(insert, (key, key))

        node1.repair(options=['ks'])

        # second batch of rows (keys 1000..1999), written after the repair
        for key in range(1000, 2000):
            session.execute(insert, (key, key))

        # every original node should report the repaired data as in sync
        for member in (node1, node2, node3):
            validation = member.repair(options=['ks', '--validate'])
            self.assertIn("Repaired data is in sync", validation.stdout)

        newcomer = new_node(self.cluster)
        newcomer.start(wait_for_binary_proto=True)

        self.assertEqual(len(self.cluster.nodelist()), 4)
        # and the same must hold for all four nodes after the bootstrap
        for member in self.cluster.nodelist():
            validation = member.repair(options=['ks', '--validate'])
            self.assertIn("Repaired data is in sync", validation.stdout)
    def _do_upgrade(self, login_keyspace=True):
        """
        Restart node1 on the fixture's default install directory and return a CQL connection to it.

        When the fixture's `bootstrap` flag is set, a second (non-seed) node is also
        bootstrapped and the data directories are checked for leftover temporary files.
        If login_keyspace is true, 'USE ks' is executed on the connection before returning it.
        """
        fixture = self.fixture_dtest_setup
        cluster = self.cluster
        upgraded = cluster.nodelist()[0]

        # flush and stop the node before switching its install directory
        upgraded.flush()
        time.sleep(.5)
        upgraded.stop(wait_other_notice=True)

        upgraded.set_install_dir(install_dir=fixture.default_install_dir)
        upgraded.start(wait_other_notice=True, wait_for_binary_proto=True)

        if fixture.bootstrap:
            cluster.set_install_dir(install_dir=fixture.default_install_dir)
            # bootstrap=True ensures that the added node is not a seed
            joiner = new_node(cluster, bootstrap=True)
            joiner.start(wait_for_binary_proto=True, jvm_args=fixture.jvm_args)

            tmp_pattern = os.path.join('*', "tmp", "*.dat")
            temp_files = self.glob_data_dirs(tmp_pattern)
            logger.debug("temp files: " + str(temp_files))
            assert len(temp_files) == 0, "Temporary files were not cleaned up."

        cursor = self.patient_cql_connection(upgraded)
        if not login_keyspace:
            return cursor
        cursor.execute('USE ks')
        return cursor
    def test_manual_bootstrap(self):
        """
            Add a node with bootstrap disabled and bring it up to date manually (repair + cleanup).
            Also verifies that the data read back is unchanged after the node was added.
            @jira_ticket CASSANDRA-9022
        """
        cluster = self.cluster
        cluster.populate(2).start(wait_other_notice=True)
        node1, node2 = cluster.nodelist()

        stress_args = [
            'write', 'n=1K', 'no-warmup',
            '-schema', 'replication(factor=2)',
            '-rate', 'threads=1',
            '-pop', 'dist=UNIFORM(1..1000)',
        ]
        node1.stress(stress_args)

        session = self.patient_exclusive_cql_connection(node2)
        select_all = "SELECT * FROM %s" % 'keyspace1.standard1'

        rows_before = list(session.execute(select_all))

        # Add the new node without bootstrapping, then repair it and clean up node1
        node3 = new_node(cluster, bootstrap=False)
        node3.start(wait_for_binary_proto=True)
        node3.repair()
        node1.cleanup()

        rows_after = list(session.execute(select_all))
        assert rows_before == rows_after
Example #11
0
    def test_manual_bootstrap(self):
        """
            Add a node with bootstrap disabled and bring it up to date manually (repair + cleanup).
            Also verifies that the data read back is unchanged after the node was added.
            @jira_ticket CASSANDRA-9022
        """
        cluster = self.cluster
        cluster.populate(2).start(wait_other_notice=True)
        node1, node2 = cluster.nodelist()

        stress_args = [
            'write', 'n=1K', 'no-warmup',
            '-schema', 'replication(factor=2)',
            '-rate', 'threads=1',
            '-pop', 'dist=UNIFORM(1..1000)',
        ]
        node1.stress(stress_args)

        session = self.patient_exclusive_cql_connection(node2)
        select_all = "SELECT * FROM %s" % 'keyspace1.standard1'

        rows_before = list(session.execute(select_all))

        # Add the new node without bootstrapping, then repair it and clean up node1
        node3 = new_node(cluster, bootstrap=False)
        node3.start(wait_for_binary_proto=True)
        node3.repair()
        node1.cleanup()

        rows_after = list(session.execute(select_all))
        assert rows_before == rows_after
    def test_bootstrap_with_reset_bootstrap_state(self):
        """Bootstrap a node a second time with its bootstrap progress reset"""
        cluster = self.cluster
        throttle = {'stream_throughput_outbound_megabits_per_sec': 1}
        cluster.set_configuration_options(values=throttle)
        cluster.populate(2).start(wait_other_notice=True)

        source_node = cluster.nodes['node1']
        source_node.stress(['write', 'n=100K', '-schema', 'replication(factor=2)'])
        source_node.flush()

        # kill node1 in the middle of streaming so that the first bootstrap fails
        interrupter = InterruptBootstrap(source_node)
        interrupter.start()

        # start bootstrapping node3 and wait for streaming
        joiner = new_node(cluster)
        try:
            joiner.start()
        except NodeError:
            pass  # node doesn't start as expected
        interrupter.join()
        source_node.start()

        # kill node3 and bootstrap it again, this time resetting the bootstrap progress
        joiner.stop(signal_event=signal.SIGKILL)
        log_mark = joiner.mark_log()
        reset_flag = ["-Dcassandra.reset_bootstrap_progress=true"]
        joiner.start(jvm_args=reset_flag)
        # the reset must have been logged
        joiner.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=log_mark)
        # wait until node3 is ready to be queried
        joiner.wait_for_binary_interface(from_mark=log_mark)

        # the second bootstrap must have completed
        assert_bootstrap_state(self, joiner, 'COMPLETED')
    def test_decommissioned_wiped_node_can_join(self):
        """
        @jira_ticket CASSANDRA-9765
        Decommission a node, wipe its data, and check that it is able to join the cluster again.
        """
        cluster = self.cluster
        cluster.populate(3)
        cluster.start(wait_for_binary_proto=True)

        select_all = "SELECT * FROM {}".format('keyspace1.standard1')

        # write some data through the first node
        first_node = cluster.nodelist()[0]
        first_node.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8'])

        session = self.patient_cql_connection(first_node)
        rows_before = list(session.execute(select_all))

        # bootstrap=True ensures that the added node is not a seed
        joiner = new_node(cluster, bootstrap=True)
        joiner.start(wait_for_binary_proto=True, wait_other_notice=True)

        # the added node must return the same rows
        session = self.patient_cql_connection(joiner)
        rows_on_joiner = list(session.execute(select_all))
        assert rows_before == rows_on_joiner

        # Decommission the added node and wipe its data
        joiner.decommission()
        joiner.stop()
        self._cleanup(joiner)

        # Start it again; this time the join should be allowed
        log_mark = joiner.mark_log()
        joiner.start(wait_other_notice=True)
        joiner.watch_log_for("JOINING:", from_mark=log_mark)
    def test_read_from_bootstrapped_node(self):
        """
        Check that a freshly bootstrapped node returns the data that already existed
        @jira_ticket CASSANDRA-6648
        """
        cluster = self.cluster
        cluster.populate(3)
        cluster.start()

        writer = cluster.nodes['node1']
        stress_args = [
            'write', 'n=10K', 'no-warmup',
            '-rate', 'threads=8',
            '-schema', 'replication(factor=2)',
        ]
        writer.stress(stress_args)

        select_all = "SELECT * FROM %s" % ('keyspace1.standard1', )

        # rows as seen through node1, before the new node exists
        session = self.patient_cql_connection(writer)
        rows_before = list(session.execute(select_all))

        joiner = new_node(cluster)
        joiner.start(wait_for_binary_proto=True)

        # rows as seen through an exclusive connection to the new node
        session = self.patient_exclusive_cql_connection(joiner)
        rows_on_joiner = list(session.execute(select_all))
        assert rows_before == rows_on_joiner
    def test_local_quorum_bootstrap(self):
        """
        Test that CL local_quorum works while a node is bootstrapping.
        @jira_ticket CASSANDRA-8058

        A two-datacenter cluster (one node each) is loaded through a stress user
        profile. A third node is then started in dc2 with write survey enabled, and
        a LOCAL_QUORUM insert workload is run while `nodetool status` still shows
        that node as "UJ". The stress output must not contain insert errors.
        """
        cluster = self.cluster
        cluster.populate([1, 1])
        cluster.start()

        node1 = cluster.nodes['node1']
        yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
        # The profile file must stay open (and thus exist) for both stress runs
        with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
            stress_config.write(yaml_config)
            stress_config.flush()
            node1.stress([
                'user', 'profile=' + stress_config.name, 'n=200K', 'no-warmup',
                'ops(insert=1)', '-rate', 'threads=10'
            ])

            node3 = new_node(cluster, data_center='dc2')
            node3.start(jvm_args=["-Dcassandra.write_survey=true"],
                        no_wait=True)
            time.sleep(5)

            # Escape the address so that its dots only match literal dots
            node3_joining = re.compile(r'UJ\s+' + re.escape(node3.ip_addr))

            # node3 must be shown as "UJ" both before and after the workload
            ntout = node1.nodetool('status').stdout
            assert node3_joining.search(ntout), ntout
            out, err, _ = node1.stress([
                'user', 'profile=' + stress_config.name, 'ops(insert=1)',
                'n=10k', 'no-warmup', 'cl=LOCAL_QUORUM', '-rate', 'threads=10',
                '-errors', 'retries=2'
            ])
            ntout = node1.nodetool('status').stdout
            assert node3_joining.search(ntout), ntout

        logger.debug(out)
        assert_stderr_clean(err)
        regex = re.compile("Operation.+error inserting key.+Exception")
        failure = regex.search(str(out))
        assert failure is None, "Error during stress while bootstrapping"
    def bootstrap_test(self):
        """ Check that repaired data is still reported as in sync after a node bootstraps """
        cluster = self.cluster
        config = {'hinted_handoff_enabled': False, 'commitlog_sync_period_in_ms': 500}
        cluster.set_configuration_options(values=config)
        cluster.populate(3).start()
        node1, node2, node3 = cluster.nodelist()

        session = self.patient_exclusive_cql_connection(node3)
        session.execute("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 2}")
        session.execute("CREATE TABLE ks.tbl (k INT PRIMARY KEY, v INT)")

        insert = SimpleStatement("INSERT INTO ks.tbl (k,v) VALUES (%s, %s)")

        # first batch of rows, written before the repair
        for key in range(1000):
            session.execute(insert, (key, key))

        node1.repair(options=['ks'])

        # second batch of rows (keys 1000..1999), written after the repair
        for key in range(1000, 2000):
            session.execute(insert, (key, key))

        # every original node should report the repaired data as in sync
        for member in (node1, node2, node3):
            validation = member.repair(options=['ks', '--validate'])
            self.assertIn("Repaired data is in sync", validation.stdout)

        newcomer = new_node(self.cluster)
        newcomer.start(wait_for_binary_proto=True)

        self.assertEqual(len(self.cluster.nodelist()), 4)
        # and the same must hold for all four nodes after the bootstrap
        for member in self.cluster.nodelist():
            validation = member.repair(options=['ks', '--validate'])
            self.assertIn("Repaired data is in sync", validation.stdout)
Example #17
0
    def test_decommissioned_wiped_node_can_join(self):
        """
        @jira_ticket CASSANDRA-9765
        Decommission a node, wipe its data, and check that it is able to join the cluster again.
        """
        cluster = self.cluster
        cluster.populate(3)
        cluster.start(wait_for_binary_proto=True)

        select_all = "SELECT * FROM {}".format('keyspace1.standard1')

        # write some data through the first node
        first_node = cluster.nodelist()[0]
        first_node.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8'])

        session = self.patient_cql_connection(first_node)
        rows_before = list(session.execute(select_all))

        # bootstrap=True ensures that the added node is not a seed
        joiner = new_node(cluster, bootstrap=True)
        joiner.start(wait_for_binary_proto=True, wait_other_notice=True)

        # the added node must return the same rows
        session = self.patient_cql_connection(joiner)
        rows_on_joiner = list(session.execute(select_all))
        assert rows_before == rows_on_joiner

        # Decommission the added node and wipe its data
        joiner.decommission()
        joiner.stop()
        self._cleanup(joiner)

        # Start it again; this time the join should be allowed
        log_mark = joiner.mark_log()
        joiner.start(wait_other_notice=True)
        joiner.watch_log_for("JOINING:", from_mark=log_mark)
    def decommissioned_wiped_node_can_gossip_to_single_seed_test(self):
        """
        @jira_ticket CASSANDRA-8072
        @jira_ticket CASSANDRA-8422
        Decommission a node, stop it and wipe its data, then check that it is able to
        join a cluster that has a single seed node.
        """
        cluster = self.cluster
        cluster.populate(1)
        cluster.start(wait_for_binary_proto=True)

        # bootstrap=True ensures that the added node is not a seed
        wiped_node = new_node(cluster, bootstrap=True)
        wiped_node.start(wait_for_binary_proto=True, wait_other_notice=True)

        # Decommission the added node, then stop it
        debug("Decommissioning & stopping node2")
        wiped_node.decommission()
        wiped_node.stop(wait_other_notice=False)

        # Remove every data directory ...
        for path in wiped_node.data_directories():
            debug("Deleting {}".format(path))
            shutil.rmtree(path)

        # ... and the commit log directory
        commitlog_path = os.path.join(wiped_node.get_path(), 'commitlogs')
        debug("Deleting {}".format(commitlog_path))
        shutil.rmtree(commitlog_path)

        # Restart the wiped node; it should be allowed to join
        log_mark = wiped_node.mark_log()
        debug("Restarting wiped node2")
        wiped_node.start(wait_other_notice=False)
        wiped_node.watch_log_for("JOINING:", from_mark=log_mark)
Example #19
0
 def test_cleanup(self):
     """
     @jira_ticket CASSANDRA-11179
     Make sure we remove processed files during cleanup

     Writes data to a single node in five flushed batches with compaction
     disabled, adds a second node, then runs `nodetool cleanup` on the first
     node while self._monitor_datadir runs in a background thread. The test
     fails if that monitor sets the `failed` event.
     """
     cluster = self.cluster
     cluster.set_configuration_options(values={'concurrent_compactors': 4})
     cluster.populate(1)
     cluster.start(wait_for_binary_proto=True)
     node1, = cluster.nodelist()
     # Five stress runs, each followed by a flush
     for _ in range(0, 5):
         node1.stress(['write', 'n=100k', 'no-warmup', '-schema', 'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)', 'replication(factor=1)', '-rate', 'threads=10'])
         node1.flush()
     node2 = new_node(cluster)
     node2.start(wait_for_binary_proto=True, wait_other_notice=True)
     event = threading.Event()
     failed = threading.Event()
     jobs = 1
     thread = threading.Thread(target=self._monitor_datadir, args=(node1, event, len(node1.get_sstables("keyspace1", "standard1")), jobs, failed))
     # The `daemon` attribute replaces the deprecated Thread.setDaemon()
     thread.daemon = True
     thread.start()
     node1.nodetool("cleanup -j {} keyspace1 standard1".format(jobs))
     # Signal the monitor once cleanup has returned, then wait for it to exit
     event.set()
     thread.join()
     assert not failed.is_set()
    def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
        """
        Test to check consistent bootstrap will not succeed when there are insufficient replicas
        @jira_ticket CASSANDRA-11848

        A two node cluster is loaded with stress data at replication factor `rf`,
        node2 is stopped, and a third node is started with
        -Dcassandra.consistent.rangemovement set to `consistent_range_movement`.
        The bootstrap is expected to succeed only when consistent range movement
        is disabled.
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

        cluster.populate(2)
        node1, node2 = cluster.nodelist()

        node3_token = None
        # Make token assignment deterministic
        if not self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 1})
            tokens = cluster.balanced_tokens(3)
            logger.debug("non-vnode tokens: %r" % (tokens,))
            node1.set_configuration_options(values={'initial_token': tokens[0]})
            node2.set_configuration_options(values={'initial_token': tokens[2]})
            node3_token = tokens[1]  # Add node 3 between node1 and node2

        cluster.start()

        node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor={})'.format(rf)])

        # change system_auth keyspace to 2 (default is 1) to avoid
        # "Unable to find sufficient sources for streaming" warning
        if cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(node1)
            session.execute("""
                ALTER KEYSPACE system_auth
                    WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
            """)

        # Stop node2, so node3 will not be able to perform consistent range movement
        node2.stop(wait_other_notice=True)

        successful_bootstrap_expected = not consistent_range_movement

        node3 = new_node(cluster, token=node3_token)
        node3.start(wait_for_binary_proto=successful_bootstrap_expected, wait_other_notice=successful_bootstrap_expected,
                    jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

        if successful_bootstrap_expected:
            # Consistent range movement is off on this path, so only rf needs checking:
            # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
            if rf == 1:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert node3.is_running()
            assert_bootstrap_state(self, node3, 'COMPLETED')
        else:
            # Consistent range movement is on whenever this path is reached;
            # the expected log message depends on the cluster version.
            if cluster.version() < '4.0':
                node3.watch_log_for("A node required to move the data consistently is down")
            else:
                node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
            assert_not_running(node3)
    def _disk_balance_after_boundary_change_test(self, lcs):
        """
            @jira_ticket CASSANDRA-13948

            - Start a 1 node cluster with 5 data directories and write data with compaction disabled
            - Bootstrap node2, which changes the disk boundaries on node1
            - Check the disks of node1 are balanced (see _assert_balanced_after_boundary_change)
            - Decommission node1, which changes the disk boundaries on node2
            - Check the disks of node2 are balanced

            The lcs flag selects LeveledCompactionStrategy instead of SizeTieredCompactionStrategy.
        """

        cluster = self.cluster
        if self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 1024})
        num_disks = 5
        cluster.set_datadir_count(num_disks)
        cluster.set_configuration_options(values={'concurrent_compactors': num_disks})

        logger.debug("Starting node1 with {} data dirs and concurrent_compactors".format(num_disks))
        cluster.populate(1).start(wait_for_binary_proto=True)
        node1, = cluster.nodelist()

        session = self.patient_cql_connection(node1)
        # reduce the RF of these keyspaces to 1 so we don't require forceful decommission
        for keyspace_ddl in (
                "ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};",
                "ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"):
            session.execute(keyspace_ddl)

        num_flushes = 10
        keys_per_flush = 10000
        total_keys = num_flushes * keys_per_flush

        if lcs:
            compaction_opts = "LeveledCompactionStrategy,sstable_size_in_mb=1"
        else:
            compaction_opts = "SizeTieredCompactionStrategy"
        logger.debug("Writing {} keys in {} flushes (compaction_opts={})".format(total_keys, num_flushes, compaction_opts))

        compaction_arg = "compaction(strategy={},enabled=false)".format(compaction_opts)
        # one stress run plus flush per batch of keys_per_flush sequential keys
        for already_written in range(0, total_keys, keys_per_flush):
            first_key = already_written + 1
            last_key = already_written + keys_per_flush
            logger.debug("Writing keys {}..{} and flushing".format(first_key, last_key))
            node1.stress([
                'write', 'n={}'.format(keys_per_flush), "no-warmup", "cl=ALL",
                "-pop", "seq={}..{}".format(first_key, last_key),
                "-rate", "threads=1",
                "-schema", "replication(factor=1)", compaction_arg,
            ])
            node1.nodetool('flush keyspace1 standard1')

        # Add a new node, so disk boundaries will change
        logger.debug("Bootstrap node2 and flush")
        node2 = new_node(cluster, bootstrap=True)
        migration_wait = ["-Dcassandra.migration_task_wait_in_seconds=10"]
        node2.start(wait_for_binary_proto=True, jvm_args=migration_wait, set_migration_task=False)
        node2.flush()

        self._assert_balanced_after_boundary_change(node1, total_keys, lcs)

        logger.debug("Decommissioning node1")
        node1.decommission()
        node1.stop()

        self._assert_balanced_after_boundary_change(node2, total_keys, lcs)
Example #22
0
    def test_simultaneous_bootstrap(self):
        """
        Verify that a second, concurrent bootstrap is rejected and does not disturb the first one.

        Steps:
          1. Bring up a one node cluster and load it with a stress write workload.
          2. Start a second node, waiting until the first node has noticed it.
          3. Start a third node while the second is joining; this start is expected to fail.
          4. Check the rejection message in the third node's log and that the
             second node returns the full row count.

        @jira_ticket CASSANDRA-7069
        @jira_ticket CASSANDRA-9484
        """
        expected_rejection = "Other bootstrapping/leaving/moving nodes detected," \
                             " cannot bootstrap while cassandra.consistent.rangemovement is true"

        cluster = self.cluster
        cluster.populate(1)
        cluster.start(wait_for_binary_proto=True)

        (node1,) = cluster.nodelist()

        write_workload = [
            'write', 'n=500K', 'no-warmup',
            '-schema', 'replication(factor=1)',
            '-rate', 'threads=10',
        ]
        node1.stress(write_workload)

        joining_node = new_node(cluster)
        joining_node.start(wait_other_notice=True)

        rejected_node = new_node(cluster, remote_debug_port='2003')
        try:
            rejected_node.start(wait_other_notice=False, verbose=False)
        except NodeError:
            pass  # the third node is not expected to start

        time.sleep(.5)
        joining_node.watch_log_for("Starting listening for CQL clients")

        rejected_node.watch_log_for(expected_rejection)

        session = self.patient_exclusive_cql_connection(joining_node)

        # The count(*) query is repeated to help catch bugs like 9484,
        # where count(*) fails at higher data loads.
        count_query = "SELECT count(*) from keyspace1.standard1"
        for _attempt in range(5):
            assert_one(session, count_query, [500000], cl=ConsistencyLevel.ONE)
 def bootstrap_with_compatibility_flag_on(cluster, token):
     """Add a node at ``token``, start it with the 3.0 protocol compatibility flag, and return it."""
     # The cassandra.force_3_0_protocol_version parameter is needed to allow schema changes
     # during bootstrapping for upgrades from 3.0.14+ to anything upwards for 3.0.x or 3.x clusters.
     # @jira_ticket CASSANDRA-13004 has the detailed context on this flag.
     compat_jvm_args = ["-Dcassandra.force_3_0_protocol_version=true"]

     joining_node = new_node(cluster)
     joining_node.set_configuration_options(values={'initial_token': token})
     joining_node.start(jvm_args=compat_jvm_args, wait_for_binary_proto=True)
     return joining_node
Example #24
0
    def test_remove(self):
        """
        Test `nodetool removenode` across a mix of transient and repaired/unrepaired data.

        A fourth node is bootstrapped and stopped before any rows are written, rows 0..39 are
        inserted, the stopped node is removed through node1, and the expectations for the three
        surviving nodes are checked (check_expected / check_replication) before and after
        repair and cleanup.
        """
        node4 = new_node(self.cluster, bootstrap=True, token='00040')
        patch_start(node4)
        node4.start(wait_for_binary_proto=True, wait_other_notice=True)
        main_session = self.patient_cql_connection(self.node1)
        nodes = [self.node1, self.node2, self.node3]

        # We want the node being removed to have no data on it, so that nodetool removenode
        # always gets all the necessary data from the survivors.
        # NOTE(review): the host ID is read as a fixed character slice of the `nodetool info`
        # output; presumably this matches the leading "ID : <uuid>" line -- confirm the layout.
        node4_id = node4.nodetool('info').stdout[25:61]
        node4.stop(wait_other_notice=True)

        for i in range(0, 40):
            print("Inserting " + str(i))
            self.insert_row(i, i, i, main_session)

        sessions = [
            self.exclusive_cql_connection(node)
            for node in [self.node1, self.node2, self.node3]
        ]

        expected = [
            gen_expected(range(0, 11), range(21, 40)),
            gen_expected(range(0, 21), range(31, 40)),
            gen_expected(range(11, 31))
        ]

        # Every node should have some of its fully replicated data, and nodes one and two
        # should have some transient data
        self.check_expected(sessions, expected)

        nodes[0].nodetool('removenode ' + node4_id)

        # Give streaming time to occur; it is asynchronous from removenode completing at the other nodes
        import time
        time.sleep(15)

        # After the removal every surviving node is expected to have every row
        expected = [
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40))
        ]

        self.check_replication(sessions, exactly=3)
        self.check_expected(sessions, expected)
        repair_nodes(nodes)
        cleanup_nodes(nodes)

        self.check_replication(sessions, exactly=2)

        # After repair and cleanup the expectations are the same as before the removal
        expected = [
            gen_expected(range(0, 11), range(21, 40)),
            gen_expected(range(0, 21), range(31, 40)),
            gen_expected(range(11, 31))
        ]
        self.check_expected(sessions, expected)
 def _bootstrap_new_node(self):
     """Bootstrap one more node on the upgraded cluster, then run the write/increment/check helpers."""
     logger.debug("Adding a node to the cluster")

     # The remote debug port is derived from the current node count (2000 + count).
     debug_port = str(2000 + len(self.cluster.nodes))
     added_node = new_node(self.cluster, remote_debug_port=debug_port)
     added_node.start(use_jna=True, wait_other_notice=240, wait_for_binary_proto=True)

     # Write first, then verify, in the same order as before.
     self._write_values()
     self._increment_counters()
     self._check_values()
     self._check_counters()
 def _bootstrap_new_node(self):
     """Bootstrap one more node on the upgraded cluster, then run the write/increment/check helpers."""
     debug("Adding a node to the cluster")

     # The remote debug port is derived from the current node count (2000 + count).
     debug_port = str(2000 + len(self.cluster.nodes))
     added_node = new_node(self.cluster, remote_debug_port=debug_port)
     added_node.start(use_jna=True, wait_other_notice=240, wait_for_binary_proto=True)

     # Write first, then verify, in the same order as before.
     self._write_values()
     self._increment_counters()
     self._check_values()
     self._check_counters()
    def test_decommission(self):
        """
        Test decommissioning a node correctly streams out all the data.

        A fourth node is bootstrapped, even keys are inserted and repaired, odd keys are
        inserted while nodes[1] is stopped, and then the fourth node is decommissioned.
        Expectations for the remaining three nodes are checked after cleanup and again
        after repair.
        """
        node4 = new_node(self.cluster, bootstrap=True, token='00040')
        patch_start(node4)
        node4.start(wait_for_binary_proto=True, wait_other_notice=True)
        main_session = self.patient_cql_connection(self.node1)
        nodes = [self.node1, self.node2, self.node3, node4]

        # First batch: even keys, written while all four nodes are up
        for i in range(0, 40, 2):
            print("Inserting " + str(i))
            self.insert_row(i, i, i, main_session)

        # Make sure at least a little data is repaired
        repair_nodes(nodes)

        # Ensure that there is at least some transient data around; because of this, if it is missing
        # afterwards we know we failed to get it from the transient replica losing the range entirely
        nodes[1].stop(wait_other_notice=True)

        # Second batch: odd keys, written while nodes[1] is down
        for i in range(1, 40, 2):
            print("Inserting " + str(i))
            self.insert_row(i, i, i, main_session)

        nodes[1].start(wait_for_binary_proto=True, wait_other_notice=True)
        sessions = [self.exclusive_cql_connection(node) for node in [self.node1, self.node2, self.node3, node4]]

        # Expectations per node (same order as `sessions`) before the decommission
        expected = [gen_expected(range(0, 11), range(31, 40)),
                    gen_expected(range(0, 21, 2)),
                    gen_expected(range(1, 11, 2), range(11, 31)),
                    gen_expected(range(11, 20, 2), range(21, 40))]

        self.check_expected(sessions, expected)

        # Decommission the fourth node (nodes[3]).
        # NOTE(review): the previous comment here spoke of "node1" and "move", which does not
        # match this call -- presumably copied from the move test; confirm the intent.
        nodes[3].nodetool('decommission')

        # Drop the decommissioned node from the lists used for the remaining checks
        nodes = nodes[:-1]
        sessions = sessions[:-1]

        expected = [gen_expected(range(0, 11), range(11, 21, 2), range(21, 40)),
                    gen_expected(range(0, 21, 2), range(21, 30, 2), range(31, 40)),
                    gen_expected(range(1, 11, 2), range(11, 31), range(31, 40, 2))]

        cleanup_nodes(nodes)

        self.check_replication(sessions, gte=2, lte=3)
        self.check_expected(sessions, expected)

        repair_nodes(nodes)

        # There should be no transient data anywhere
        expected = [gen_expected(range(0, 11), range(21, 40)),
                    gen_expected(range(0, 21), range(31, 40)),
                    gen_expected(range(11, 31))]

        self.check_expected(sessions, expected, nodes, cleanup=True)
        self.check_replication(sessions, exactly=2)
    def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
        """
        Check how bootstrap behaves when a replica is down while a new node joins.

        A two node cluster is loaded with stress data at replication factor ``rf``,
        node2 is stopped, and node3 is started with the JVM property
        ``cassandra.consistent.rangemovement`` set to ``consistent_range_movement``.

        When ``consistent_range_movement`` is truthy the bootstrap is expected to fail:
        node3 must log the version-specific reason and must not be running. Otherwise
        the bootstrap is expected to succeed and reach the COMPLETED state.

        @jira_ticket CASSANDRA-11848
        """
        cluster = self.cluster

        cluster.populate(2)
        node1, node2 = cluster.nodelist()

        node3_token = None
        # Make token assignment deterministic
        if not self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 1})
            tokens = cluster.balanced_tokens(3)
            logger.debug("non-vnode tokens: %r" % (tokens,))
            node1.set_configuration_options(values={'initial_token': tokens[0]})
            node2.set_configuration_options(values={'initial_token': tokens[2]})
            node3_token = tokens[1]  # Add node 3 between node1 and node2

        cluster.start()

        node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor={})'.format(rf)])

        # change system_auth keyspace to 2 (default is 1) to avoid
        # "Unable to find sufficient sources for streaming" warning
        if cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(node1)
            session.execute("""
                ALTER KEYSPACE system_auth
                    WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
            """)

        # Stop node2, so node3 will not be able to perform consistent range movement
        node2.stop(wait_other_notice=True)

        # Success is expected exactly when consistent range movement is disabled, so the
        # branches below do not need to test consistent_range_movement again.
        successful_bootstrap_expected = not consistent_range_movement

        node3 = new_node(cluster, token=node3_token)
        node3.start(wait_for_binary_proto=successful_bootstrap_expected, wait_other_notice=successful_bootstrap_expected,
                    jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

        if successful_bootstrap_expected:
            # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
            if rf == 1:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert node3.is_running()
            assert_bootstrap_state(self, node3, 'COMPLETED')
        else:
            # Consistent range movement is enabled here; the logged reason differs by version.
            if cluster.version() < '4.0':
                node3.watch_log_for("A node required to move the data consistently is down")
            else:
                node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
            assert_not_running(node3)
 def bootstrap_with_compatibility_flag_on(cluster, token):
     """Add a node at ``token``, start it with the 3.0 protocol compatibility flag, and return it."""
     # The cassandra.force_3_0_protocol_version parameter is needed to allow schema changes
     # during bootstrapping for upgrades from 3.0.14+ to anything upwards for 3.0.x or 3.x clusters.
     # @jira_ticket CASSANDRA-13004 has the detailed context on this flag.
     compat_jvm_args = ["-Dcassandra.force_3_0_protocol_version=true"]

     joining_node = new_node(cluster)
     joining_node.set_configuration_options(values={'initial_token': token})
     joining_node.start(jvm_args=compat_jvm_args, wait_for_binary_proto=True)
     return joining_node
Example #30
0
    def test_resumable_bootstrap(self):
        """
        Test resuming bootstrap after data streaming failure

        A byteman script is submitted to node1 before node3 bootstraps; node3 is then expected
        to be left in the IN_PROGRESS bootstrap state until `nodetool bootstrap resume` is run,
        after which it must reach COMPLETED and a stress read must report no FAILURE.
        """
        cluster = self.cluster
        cluster.populate(2)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start(wait_other_notice=True)
        # kill stream to node3 in the middle of streaming to let it fail
        # (the byteman script location depends on the cluster version)
        if cluster.version() < '4.0':
            node1.byteman_submit(['./byteman/pre4.0/stream_failure.btm'])
        else:
            node1.byteman_submit(['./byteman/4.0/stream_failure.btm'])
        node1.stress([
            'write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
            'replication(factor=2)', '-rate', 'threads=50'
        ])
        cluster.flush()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        node3.start(wait_other_notice=False, wait_for_binary_proto=True)

        # wait for node3 ready to query
        node3.watch_log_for("Starting listening for CQL clients")
        # remember the log position so the later "Resume complete" wait ignores earlier lines
        mark = node3.mark_log()
        # check if node3 is still in bootstrap mode
        retry_till_success(assert_bootstrap_state,
                           tester=self,
                           node=node3,
                           expected_bootstrap_state='IN_PROGRESS',
                           timeout=120)

        # invoke nodetool bootstrap resume on node3 to resume bootstrapping
        # NOTE(review): the previous comment said "bring back node1", but node1 is never stopped here
        node3.nodetool('bootstrap resume')

        node3.watch_log_for("Resume complete", from_mark=mark)
        assert_bootstrap_state(self, node3, 'COMPLETED')

        # cleanup to guarantee each node will only have sstables of its ranges
        cluster.cleanup()

        logger.debug("Check data is present")
        # Let's check stream bootstrap completely transferred data
        stdout, stderr, _ = node3.stress([
            'read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)',
            '-rate', 'threads=8'
        ])

        # stress output is only inspected when it was captured
        if stdout is not None:
            assert "FAILURE" not in stdout
        def bootstrap_on_write_survey_and_join(cluster, token):
            """Start a node at ``token`` in write survey mode, ask it to join the ring, and return it."""
            survey_node = new_node(cluster)
            survey_node.set_configuration_options(values={'initial_token': token})
            survey_node.start(jvm_args=["-Dcassandra.write_survey=true"], wait_for_binary_proto=True)

            # The node must report it stayed out of the ring and is still bootstrapping.
            survey_lines = survey_node.grep_log(
                'Startup complete, but write survey mode is active, not becoming an active ring member.')
            assert len(survey_lines)
            assert_bootstrap_state(self, survey_node, 'IN_PROGRESS')

            # Joining on operator request must be logged as well.
            survey_node.nodetool("join")
            join_lines = survey_node.grep_log(
                'Leaving write survey mode and joining ring at operator request')
            assert len(join_lines)
            return survey_node
Example #32
0
    def local_quorum_bootstrap_test(self):
        """
        Test that CL local_quorum works while a node is bootstrapping.

        A two-datacenter cluster (one node each) is loaded through a stress user profile,
        a new node is started in dc2 without waiting for it, and a second stress run at
        cl=LOCAL_QUORUM is executed; its output must not contain insert errors.
        @jira_ticket CASSANDRA-8058
        """

        cluster = self.cluster
        cluster.populate([1, 1])
        cluster.start()

        node1 = cluster.nodes['node1']
        # Stress user profile; it is written to a temporary file and passed via profile=
        yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
        # Both stress runs happen inside the with-block because the profile file is
        # removed when the block exits.
        with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
            stress_config.write(yaml_config)
            # flush so the stress process sees the full profile on disk
            stress_config.flush()
            node1.stress(['user', 'profile=' + stress_config.name, 'n=2M', 'no-warmup',
                          'ops(insert=1)', '-rate', 'threads=50'])

            # Start the new dc2 node without waiting, then give it a few seconds
            # before the LOCAL_QUORUM workload begins
            node3 = new_node(cluster, data_center='dc2')
            node3.start(no_wait=True)
            time.sleep(3)

            out, err, _ = node1.stress(['user', 'profile=' + stress_config.name, 'ops(insert=1)',
                                        'n=500K', 'no-warmup', 'cl=LOCAL_QUORUM',
                                        '-rate', 'threads=5',
                                        '-errors', 'retries=2'])

        debug(out)
        assert_stderr_clean(err)
        # Any insert error reported in the stress output fails the test
        regex = re.compile("Operation.+error inserting key.+Exception")
        failure = regex.search(out)
        self.assertIsNone(failure, "Error during stress while bootstrapping")
Example #33
0
    def move_test(self, move_token, expected_after_move,
                  expected_after_repair):
        """
        Run one move test cycle.

        Bootstraps a fourth node, writes even keys (then repairs), writes odd keys
        while nodes[1] is down, moves nodes[0] to ``move_token`` and compares the
        sessions against ``expected_after_move`` and, after a repair,
        ``expected_after_repair``.
        """
        fourth_node = new_node(self.cluster, bootstrap=True, token='00040')
        patch_start(fourth_node)
        fourth_node.start(wait_for_binary_proto=True)
        main_session = self.patient_cql_connection(self.node1)
        nodes = [self.node1, self.node2, self.node3, fourth_node]

        def insert_keys(keys):
            # Insert one row per key through the main session, logging each key.
            for key in keys:
                print("Inserting " + str(key))
                self.insert_row(key, key, key, main_session)

        insert_keys(range(0, 40, 2))

        # Make sure at least a little data is repaired
        repair_nodes(nodes)

        # Keep some transient data around: if it is missing afterwards, we failed to
        # get it from the transient replica losing the range entirely
        nodes[1].stop(wait_other_notice=True)

        insert_keys(range(1, 40, 2))

        nodes[1].start(wait_for_binary_proto=True)
        sessions = [self.exclusive_cql_connection(node) for node in nodes]

        expected_before_move = [
            gen_expected(range(0, 11), range(31, 40)),
            gen_expected(range(0, 21, 2)),
            gen_expected(range(1, 11, 2), range(11, 31)),
            gen_expected(range(11, 20, 2), range(21, 40)),
        ]
        self.check_expected(sessions, expected_before_move)
        self.check_replication(sessions, exactly=2)

        nodes[0].nodetool('move %s' % move_token)
        cleanup_nodes(nodes)

        self.check_replication(sessions, gte=2, lte=3)
        self.check_expected(sessions, expected=expected_after_move)

        repair_nodes(nodes)

        self.check_expected(sessions, expected_after_repair, nodes, cleanup=True)
        self.check_replication(sessions, exactly=2)
    def test_consistent_reads_after_bootstrap(self):
        """
        Check that no data is lost when reading at CL.ALL through a freshly bootstrapped node.

        Keys 10..19 are written at CL.ALL with both nodes up, keys 30..999 are written
        at CL.ONE while node1 is down, and every key is then queried through node3.
        """
        logger.debug("Creating a ring")
        cluster = self.cluster
        ring_options = {
            'hinted_handoff_enabled': False,
            'write_request_timeout_in_ms': 60000,
            'read_request_timeout_in_ms': 60000,
            'dynamic_snitch_badness_threshold': 0.0,
        }
        cluster.set_configuration_options(values=ring_options)
        cluster.set_batch_commitlog(enabled=True)

        cluster.populate(2)
        node1, node2 = cluster.nodelist()
        cluster.start(wait_for_binary_proto=True, wait_other_notice=True)

        logger.debug("Set to talk to node 2")
        n2session = self.patient_cql_connection(node2)
        create_ks(n2session, 'ks', 2)
        create_c1c2_table(self, n2session)

        logger.debug("Generating some data for all nodes")
        keys_for_all_nodes = list(range(10, 20))
        insert_c1c2(n2session, keys=keys_for_all_nodes, consistency=ConsistencyLevel.ALL)

        node1.flush()
        logger.debug("Taking down node1")
        node1.stop(wait_other_notice=True)

        logger.debug("Writing data to only node2")
        keys_for_node2_only = list(range(30, 1000))
        insert_c1c2(n2session, keys=keys_for_node2_only, consistency=ConsistencyLevel.ONE)
        node2.flush()

        logger.debug("Restart node1")
        node1.start(wait_other_notice=True)

        logger.debug("Bootstraping node3")
        node3 = new_node(cluster)
        node3.start(wait_for_binary_proto=True)

        n3session = self.patient_cql_connection(node3)
        n3session.execute("USE ks")
        logger.debug("Checking that no data was lost")
        # Query both key ranges, in the order they were written.
        for key_range in (range(10, 20), range(30, 1000)):
            for key in key_range:
                query_c1c2(n3session, key, ConsistencyLevel.ALL)
    def disk_balance_bootstrap_test(self):
        """Bootstrap a fifth node into a loaded four node cluster and run assert_balanced on it."""
        cluster = self.cluster
        if not DISABLE_VNODES:
            vnode_options = {'num_tokens': 256}
            cluster.set_configuration_options(values=vnode_options)
        # bootstrapping apparently produces legitimate errors in the log (see bootstrap_test.py)
        self.allow_log_errors = True
        populated = cluster.populate(4)
        populated.start(wait_for_binary_proto=True)
        node1 = cluster.nodes['node1']

        write_workload = [
            'write', 'n=50k', 'no-warmup',
            '-rate', 'threads=100',
            '-schema', 'replication(factor=3)',
            'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)',
        ]
        node1.stress(write_workload)
        cluster.flush()

        joining_node = new_node(cluster)
        joining_node.start(wait_for_binary_proto=True)
        self.assert_balanced(joining_node)
Example #36
0
    def test_resumable_bootstrap(self):
        """
        Test resuming bootstrap after data streaming failure

        A byteman script is submitted to node1 before node3 bootstraps; once node3 logs that
        some streaming failed, `nodetool bootstrap resume` is run and node3 must reach the
        COMPLETED bootstrap state. A final stress read must report no FAILURE.
        """
        cluster = self.cluster
        cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(2)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start()
        # kill stream to node3 in the middle of streaming to let it fail
        # (the byteman script path depends on the cluster version)
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
        node1.stress([
            'write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
            'replication(factor=2)', '-rate', 'threads=50'
        ])
        cluster.flush()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        node3.start(wait_other_notice=False)

        # let streaming fail as we expect
        node3.watch_log_for('Some data streaming failed')

        # invoke nodetool bootstrap resume on node3, then wait for its binary interface
        node3.nodetool('bootstrap resume')
        node3.wait_for_binary_interface()
        assert_bootstrap_state(self, node3, 'COMPLETED')

        # cleanup to guarantee each node will only have sstables of its ranges
        cluster.cleanup()

        logger.debug("Check data is present")
        # Let's check stream bootstrap completely transferred data
        stdout, stderr, _ = node3.stress([
            'read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)',
            '-rate', 'threads=8'
        ])

        # stress output is only inspected when it was captured
        if stdout is not None:
            assert "FAILURE" not in stdout
Example #37
0
    def test_simple_bootstrap_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Bootstrap must complete even when receiving a single sstable takes longer than
        streaming_socket_timeout_in_ms or 2*streaming_keep_alive_period_in_secs.
        """
        cluster = self.cluster
        streaming_options = {'streaming_keep_alive_period_in_secs': 2}
        # the socket timeout option is only set for versions before 4.0
        if cluster.version() < '4.0':
            streaming_options['streaming_socket_timeout_in_ms'] = 1000
        cluster.set_configuration_options(values=streaming_options)

        # Single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]

        logger.debug("Setting up byteman on {}".format(node1.name))
        # byteman must be configured before the node starts
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start()

        # Create more than one sstable larger than 1MB
        write_workload = [
            'write', 'n=1K', '-rate', 'threads=8', '-schema',
            'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)',
        ]
        node1.stress(write_workload)
        cluster.flush()

        logger.debug("Submitting byteman script to {} to".format(node1.name))
        # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
        sleep_script = mk_bman_path('stream_5s_sleep.btm')
        node1.byteman_submit([sleep_script])

        # Bootstrap a new node with a very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Must not fail due to the streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        if cluster.version() < '4.0':
            keepalive_patterns = (
                'Scheduling keep-alive task with 2s period.',
                'Sending keep-alive',
                'Received keep-alive',
            )
            # every node's debug.log must contain each keep-alive message
            for node in cluster.nodelist():
                for pattern in keepalive_patterns:
                    assert node.grep_log(pattern, filename='debug.log')
Example #38
0
    def test_decommissioned_wiped_node_can_gossip_to_single_seed(self):
        """
        @jira_ticket CASSANDRA-8072
        @jira_ticket CASSANDRA-8422
        Test that if we decommission a node, kill it and wipe its data, it can join a cluster with a single
        seed node.
        """
        cluster = self.cluster
        cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(1)
        cluster.start()

        node1 = cluster.nodelist()[0]
        # Add a new node, bootstrap=True ensures that it is not a seed
        node2 = new_node(cluster, bootstrap=True)
        node2.start(wait_for_binary_proto=True)

        session = self.patient_cql_connection(node1)

        if cluster.version() >= '2.2':
            # reduce system_distributed RF to 1 so we don't require forceful decommission
            session.execute(
                "ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
            )

        # system_traces is reduced to RF 1 as well, regardless of version
        session.execute(
            "ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
        )

        # Decommission the new node and kill it
        logger.debug("Decommissioning & stopping node2")
        node2.decommission()
        node2.stop(wait_other_notice=False)

        # Wipe its data
        for data_dir in node2.data_directories():
            logger.debug("Deleting {}".format(data_dir))
            shutil.rmtree(data_dir)

        # ... and its commit logs
        commitlog_dir = os.path.join(node2.get_path(), 'commitlogs')
        logger.debug("Deleting {}".format(commitlog_dir))
        shutil.rmtree(commitlog_dir)

        # Now start it, it should be allowed to join
        # (the log mark keeps the JOINING check from matching lines of the first start)
        mark = node2.mark_log()
        logger.debug("Restarting wiped node2")
        node2.start(wait_other_notice=False)
        node2.watch_log_for("JOINING:", from_mark=mark)
    def test_wait_for_schema(self):
        """
        Bootstrap a second node while a byteman script is active on node1, then check the row count.

        1000 rows are written to ks.cf (keyspace created with create_ks(..., 2)) on a one node
        cluster. node2 is started with cassandra.migration_task_wait_in_seconds=20 after
        'migration_request_sleep.btm' has been submitted to node1, and must log
        'Prepare completed. Receiving'. After node1 is stopped, count(*) must still return 1000.
        """

        # start a one node cluster
        cluster = self.cluster
        cluster.populate(1, install_byteman=True)
        node1 = cluster.nodes['node1']
        cluster.start()

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 2)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # NOTE(review): empty_size is not used later in this test
        empty_size = data_size(node1, 'ks', 'cf')

        keys = 1000
        insert_statement = session.prepare(
            "INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement,
                                     [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        # NOTE(review): initial_size is only referenced by the commented-out debug line below
        initial_size = data_size(node1, 'ks', 'cf')
        #logger.debug("node1 size for ks.cf before bootstrapping node2: %s" % float(initial_size))

        node2 = new_node(cluster)
        node2.set_configuration_options(
            values={'request_timeout_in_ms': 10000})

        # mark node2's log so the wait below only looks at lines written after this point
        mark = node2.mark_log()

        # presumably delays schema migration requests on node1 (going by the script name) -- verify
        node1.byteman_submit(['./byteman/migration_request_sleep.btm'])

        node2.start(jvm_args=["-Dcassandra.migration_task_wait_in_seconds=20"],
                    set_migration_task=False,
                    wait_for_binary_proto=True)

        node2.watch_log_for('Prepare completed. Receiving',
                            from_mark=mark,
                            timeout=6)

        node2.flush()
        node2.compact()
        #logger.debug("node2 joined with size for ks.cf : %s" % float(data_size(node2, 'ks','cf')))

        # With node1 stopped, the count has to be answered without it
        # NOTE(review): presumably the session fails over to node2 -- confirm
        node1.stop()
        rows = session.execute('SELECT count(*) from ks.cf')
        assert rows[0][0] == 1000
        cluster.stop()
    def test_disk_balance_bootstrap(self):
        """Bootstrap a fifth node into a loaded four node cluster and run assert_balanced on it."""
        cluster = self.cluster
        if self.dtest_config.use_vnodes:
            vnode_options = {'num_tokens': 256}
            cluster.set_configuration_options(values=vnode_options)
        # bootstrapping apparently produces legitimate errors in the log (see bootstrap_test.py)
        self.fixture_dtest_setup.allow_log_errors = True
        populated = cluster.populate(4)
        populated.start(wait_for_binary_proto=True)
        node1 = cluster.nodes['node1']

        write_workload = [
            'write', 'n=50k', 'no-warmup',
            '-rate', 'threads=100',
            '-schema', 'replication(factor=3)',
            'compaction(strategy=SizeTieredCompactionStrategy,enabled=false)',
        ]
        node1.stress(write_workload)
        cluster.flush()

        joining_node = new_node(cluster)
        joining_node.start(wait_for_binary_proto=True)
        self.assert_balanced(joining_node)
    def test_simple_bootstrap_nodata(self):
        """
        Bootstrap must complete when streaming from nodes that hold no data.

        @jira_ticket CASSANDRA-11010
        """
        cluster = self.cluster

        # Two-node cluster; nothing is written to it
        cluster.populate(2)
        cluster.start(wait_other_notice=True)

        # Bring a third node in through bootstrap
        joining_node = new_node(cluster)
        joining_node.start(wait_for_binary_proto=True, wait_other_notice=True)

        assert_bootstrap_state(self, joining_node, 'COMPLETED')
Example #42
0
    def test_simple_bootstrap_nodata(self):
        """
        Bootstrap must complete when streaming from nodes that hold no data.

        @jira_ticket CASSANDRA-11010
        """
        cluster = self.cluster

        # Two-node cluster; nothing is written to it
        cluster.populate(2)
        cluster.start(wait_other_notice=True)

        # Bring a third node in through bootstrap
        joining_node = new_node(cluster)
        joining_node.start(wait_for_binary_proto=True, wait_other_notice=True)

        assert_bootstrap_state(self, joining_node, 'COMPLETED')
    def test_remove(self):
        """
        Remove a stopped, data-less fourth node and check the surviving nodes.

        Covers a mix of transient and repaired/unrepaired data: expectations are
        checked before the removal, right after it, and after repair plus cleanup.
        """
        import time

        removed_node = new_node(self.cluster, bootstrap=True, token='00040')
        patch_start(removed_node)
        removed_node.start(wait_for_binary_proto=True, wait_other_notice=True)
        main_session = self.patient_cql_connection(self.node1)
        survivors = [self.node1, self.node2, self.node3]

        # The node being removed should hold no data, so that removenode always
        # gets all the necessary data from the survivors: grab its ID and stop it
        # before any row is written.
        info_output = removed_node.nodetool('info').stdout
        removed_id = info_output[25:61]
        removed_node.stop(wait_other_notice=True)

        for key in range(0, 40):
            print("Inserting " + str(key))
            self.insert_row(key, key, key, main_session)

        sessions = [self.exclusive_cql_connection(node) for node in survivors]

        # Every node should have some of its fully replicated data, and nodes one
        # and two should have some transient data
        expected_before_removal = [
            gen_expected(range(0, 11), range(21, 40)),
            gen_expected(range(0, 21), range(31, 40)),
            gen_expected(range(11, 31)),
        ]
        self.check_expected(sessions, expected_before_removal)

        survivors[0].nodetool('removenode ' + removed_id)

        # Streaming is asynchronous from removenode completing at the other
        # nodes, so give it time to occur
        time.sleep(15)

        # After the removal every survivor is expected to have every row
        expected_after_removal = [
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40)),
            gen_expected(range(0, 40)),
        ]

        self.check_replication(sessions, exactly=3)
        self.check_expected(sessions, expected_after_removal)
        repair_nodes(survivors)
        cleanup_nodes(survivors)

        self.check_replication(sessions, exactly=2)

        expected_after_cleanup = [
            gen_expected(range(0, 11), range(21, 40)),
            gen_expected(range(0, 21), range(31, 40)),
            gen_expected(range(11, 31)),
        ]
        self.check_expected(sessions, expected_after_cleanup)
    def test_simple_bootstrap_nodata(self):
        """
        Verify that a bootstrap finishes when the streaming sources hold no data.

        @jira_ticket CASSANDRA-11010
        """
        empty_cluster = self.cluster
        empty_cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

        # Two nodes, no data written to either of them.
        empty_cluster.populate(2)
        empty_cluster.start()

        # Add a third node and wait until it serves native-protocol clients.
        joiner = new_node(empty_cluster)
        joiner.start(wait_for_binary_proto=True)

        assert_bootstrap_state(self, joiner, 'COMPLETED')
    def simple_bootstrap_test_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
        2*streaming_keep_alive_period_in_secs to receive a single sstable
        """
        cluster = self.cluster
        # Throttle outbound streaming and use a very short socket timeout and
        # keep-alive period, so receiving one sstable outlasts both of them.
        cluster.set_configuration_options(
            values={
                'stream_throughput_outbound_megabits_per_sec': 1,
                'streaming_socket_timeout_in_ms': 1000,
                'streaming_keep_alive_period_in_secs': 1
            })

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        cluster.start(wait_other_notice=True)

        # Two stress runs, each followed by a flush and with compaction
        # disabled, so that more than one sstable is left on disk.
        stress_args = [
            'write', 'n=50K', '-rate', 'threads=8', '-schema',
            'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'
        ]
        for _ in range(2):
            node1.stress(list(stress_args))
            cluster.flush()

        # Compare the NUMBER of sstables, not the returned collection itself:
        # ordering a list against an int raises TypeError on Python 3 and is
        # a meaningless comparison on Python 2.
        self.assertGreater(len(node1.get_sstables("keyspace1", "standard1")), 1)

        # Bootstrapping a new node with very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Shouldn't fail due to streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        # Every node must have logged the keep-alive scheduling and traffic.
        for node in cluster.nodelist():
            self.assertTrue(
                node.grep_log('Scheduling keep-alive task with 1s period.',
                              filename='debug.log'))
            self.assertTrue(
                node.grep_log('Sending keep-alive', filename='debug.log'))
            self.assertTrue(
                node.grep_log('Received keep-alive', filename='debug.log'))
    def resumable_bootstrap_test(self):
        """
        Check that a bootstrap interrupted by a streaming failure can be resumed
        """
        cluster = self.cluster
        cluster.populate(2)

        # Set up byteman on node1 ahead of cluster start.
        streamer = cluster.nodes['node1']
        streamer.byteman_port = '8100'
        streamer.import_config_files()

        cluster.start(wait_other_notice=True)

        # Submit the version-specific script that makes the stream towards the
        # joining node fail part-way through.
        if cluster.version() < '4.0':
            failure_script = './byteman/pre4.0/stream_failure.btm'
        else:
            failure_script = './byteman/4.0/stream_failure.btm'
        streamer.byteman_submit([failure_script])

        streamer.stress(['write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
                         'replication(factor=2)', '-rate', 'threads=50'])
        cluster.flush()

        # Bootstrap a third node and wait until it accepts CQL clients.
        joiner = new_node(cluster)
        joiner.start(wait_other_notice=False, wait_for_binary_proto=True)
        joiner.watch_log_for("Starting listening for CQL clients")
        resume_mark = joiner.mark_log()

        # The failed stream should have left the node mid-bootstrap.
        assert_bootstrap_state(self, joiner, 'IN_PROGRESS')

        # Ask the joining node to resume the interrupted bootstrap.
        joiner.nodetool('bootstrap resume')

        joiner.watch_log_for("Resume complete", from_mark=resume_mark)
        assert_bootstrap_state(self, joiner, 'COMPLETED')

        # Cleanup guarantees each node only keeps sstables for its own ranges.
        cluster.cleanup()

        debug("Check data is present")
        # Read everything back through the new node to confirm the resumed
        # bootstrap transferred the data completely.
        stress_out, _, _ = joiner.stress(['read', 'n=1k', 'no-warmup', '-schema',
                                          'replication(factor=2)', '-rate', 'threads=8'])

        if stress_out is not None:
            self.assertNotIn("FAILURE", stress_out)
    def test_consistent_reads_after_bootstrap(self):
        """Check that no row is lost when a node bootstraps into a ring whose
        replicas diverged while one of them was down.

        Keys 10-19 are written at ALL to both nodes; keys 30-999 are written at
        ONE while node1 is stopped (hinted handoff is disabled). A third node
        is then bootstrapped and every key is read back through it at ALL.
        """
        logger.debug("Creating a ring")
        cluster = self.cluster
        ring_options = {'hinted_handoff_enabled': False,
                        'write_request_timeout_in_ms': 60000,
                        'read_request_timeout_in_ms': 60000,
                        'dynamic_snitch_badness_threshold': 0.0}
        cluster.set_configuration_options(values=ring_options)
        cluster.set_batch_commitlog(enabled=True)

        cluster.populate(2)
        node1, node2 = cluster.nodelist()
        cluster.start(wait_for_binary_proto=True, wait_other_notice=True)

        logger.debug("Set to talk to node 2")
        writer = self.patient_cql_connection(node2)
        create_ks(writer, 'ks', 2)
        create_c1c2_table(self, writer)

        logger.debug("Generating some data for all nodes")
        keys_on_both = list(range(10, 20))
        insert_c1c2(writer, keys=keys_on_both, consistency=ConsistencyLevel.ALL)

        node1.flush()
        logger.debug("Taking down node1")
        node1.stop(wait_other_notice=True)

        logger.debug("Writing data to only node2")
        keys_on_node2 = list(range(30, 1000))
        insert_c1c2(writer, keys=keys_on_node2, consistency=ConsistencyLevel.ONE)
        node2.flush()

        logger.debug("Restart node1")
        node1.start(wait_other_notice=True)

        logger.debug("Bootstraping node3")
        joiner = new_node(cluster)
        joiner.start(wait_for_binary_proto=True)

        reader = self.patient_cql_connection(joiner)
        reader.execute("USE ks")
        logger.debug("Checking that no data was lost")
        # Same key order as the writes: first the ALL batch, then the ONE batch.
        for key_range in (range(10, 20), range(30, 1000)):
            for key in key_range:
                query_c1c2(reader, key, ConsistencyLevel.ALL)
Example #48
0
    def test_decommissioned_wiped_node_can_gossip_to_single_seed(self):
        """
        @jira_ticket CASSANDRA-8072
        @jira_ticket CASSANDRA-8422
        Check that a node which was decommissioned, stopped and wiped is able to join
        a cluster that has a single seed node.
        """
        cluster = self.cluster
        cluster.populate(1)
        cluster.start(wait_for_binary_proto=True)

        seed = cluster.nodelist()[0]
        # bootstrap=True ensures that the added node is not a seed
        rejoiner = new_node(cluster, bootstrap=True)
        rejoiner.start(wait_for_binary_proto=True, wait_other_notice=True)

        session = self.patient_cql_connection(seed)

        # Drop the replication factor of the system keyspaces to 1 so that no
        # forceful decommission is required; system_distributed is only
        # altered from version 2.2 onwards.
        alter_distributed = "ALTER KEYSPACE system_distributed WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
        alter_traces = "ALTER KEYSPACE system_traces WITH REPLICATION = {'class':'SimpleStrategy', 'replication_factor':'1'};"
        if cluster.version() >= '2.2':
            session.execute(alter_distributed)
        session.execute(alter_traces)

        # Decommission the new node, then stop it
        logger.debug("Decommissioning & stopping node2")
        rejoiner.decommission()
        rejoiner.stop(wait_other_notice=False)

        # Wipe its data directories ...
        for doomed_dir in rejoiner.data_directories():
            logger.debug("Deleting {}".format(doomed_dir))
            shutil.rmtree(doomed_dir)

        # ... and its commit logs.
        commitlog_dir = os.path.join(rejoiner.get_path(), 'commitlogs')
        logger.debug("Deleting {}".format(commitlog_dir))
        shutil.rmtree(commitlog_dir)

        # Starting it again must be allowed: wait for it to report JOINING.
        restart_mark = rejoiner.mark_log()
        logger.debug("Restarting wiped node2")
        rejoiner.start(wait_other_notice=False)
        rejoiner.watch_log_for("JOINING:", from_mark=restart_mark)
Example #49
0
    def test_simple_bootstrap_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Check that bootstrap still completes when receiving a single sstable takes longer than
        streaming_socket_timeout_in_ms or 2*streaming_keep_alive_period_in_secs
        """
        cluster = self.cluster
        # The socket timeout option is only set for versions before 4.0.
        options = {'streaming_keep_alive_period_in_secs': 2}
        if cluster.version() < '4.0':
            options['streaming_socket_timeout_in_ms'] = 1000
        cluster.set_configuration_options(values=options)

        # Single node cluster
        cluster.populate(1)
        source = cluster.nodelist()[0]

        logger.debug("Setting up byteman on {}".format(source.name))
        source.byteman_port = '8100'
        source.import_config_files()

        cluster.start(wait_other_notice=True)

        # Write some data with compaction disabled, then flush it to disk.
        source.stress(['write', 'n=1K', '-rate', 'threads=8', '-schema',
                       'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'])
        cluster.flush()

        logger.debug("Submitting byteman script to {} to".format(source.name))
        # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
        source.byteman_submit(['./byteman/stream_5s_sleep.btm'])

        # Bootstrap a new node under the short keep-alive / timeout settings
        joiner = new_node(cluster)
        joiner.start(wait_for_binary_proto=True)

        # The streaming socket timeout must not make the bootstrap fail
        assert_bootstrap_state(self, joiner, 'COMPLETED')

        # Every node must have logged the keep-alive scheduling and traffic.
        expected_log_lines = ('Scheduling keep-alive task with 2s period.',
                              'Sending keep-alive',
                              'Received keep-alive')
        for member in cluster.nodelist():
            for needle in expected_log_lines:
                assert member.grep_log(needle, filename='debug.log')
    def move_test(self, move_token, expected_after_move, expected_after_repair):
        """Run one move cycle and verify the data held by every node.

        :param move_token: token passed to ``nodetool move`` on the first node
        :param expected_after_move: per-node expected data right after the move and cleanup
        :param expected_after_repair: per-node expected data once repair has run after the move
        """
        fourth = new_node(self.cluster, bootstrap=True, token='00040')
        patch_start(fourth)
        fourth.start(wait_for_binary_proto=True, wait_other_notice=True)
        main_session = self.patient_cql_connection(self.node1)
        ring = [self.node1, self.node2, self.node3, fourth]

        def insert_keys(keys):
            # Each row uses the key for all three values handed to insert_row.
            for key in keys:
                print("Inserting " + str(key))
                self.insert_row(key, key, key, main_session)

        # Even keys go in first and get repaired, so at least a little data is repaired.
        insert_keys(range(0, 40, 2))
        repair_nodes(ring)

        # Odd keys are written while the second node is down, so that some
        # transient data exists. If it is missing after the move, we failed to
        # fetch it from the transient replica that lost the range entirely.
        ring[1].stop(wait_other_notice=True)
        insert_keys(range(1, 40, 2))
        ring[1].start(wait_for_binary_proto=True, wait_other_notice=True)

        sessions = [self.exclusive_cql_connection(member) for member in ring]

        # Data layout expected on each of the four nodes before the move.
        before_move = [gen_expected(range(0, 11), range(31, 40)),
                       gen_expected(range(0, 21, 2)),
                       gen_expected(range(1, 11, 2), range(11, 31)),
                       gen_expected(range(11, 20, 2), range(21, 40))]
        self.check_expected(sessions, before_move)
        self.check_replication(sessions, exactly=2)

        ring[0].nodetool('move %s' % move_token)
        cleanup_nodes(ring)

        self.check_replication(sessions, gte=2, lte=3)
        self.check_expected(sessions, expected=expected_after_move)

        repair_nodes(ring)

        self.check_expected(sessions, expected_after_repair, ring, cleanup=True)
        self.check_replication(sessions, exactly=2)
    def test_resumable_bootstrap(self):
        """
        Check that a bootstrap interrupted by a streaming failure can be resumed
        """
        cluster = self.cluster
        cluster.populate(2)

        # Set up byteman on node1 ahead of cluster start.
        streamer = cluster.nodes['node1']
        streamer.byteman_port = '8100'
        streamer.import_config_files()

        cluster.start(wait_other_notice=True)

        # Submit the version-specific script that makes the stream towards the
        # joining node fail part-way through.
        if cluster.version() < '4.0':
            failure_script = self.byteman_submit_path_pre_4_0
        else:
            failure_script = self.byteman_submit_path_4_0
        streamer.byteman_submit([failure_script])

        streamer.stress(['write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
                         'replication(factor=2)', '-rate', 'threads=50'])
        cluster.flush()

        # Begin bootstrapping a third node; streaming is expected to fail.
        joiner = new_node(cluster)
        joiner.start(wait_other_notice=False)
        joiner.watch_log_for('Some data streaming failed')

        # Resume the bootstrap on the joining node and wait for it to serve clients.
        joiner.nodetool('bootstrap resume')
        joiner.wait_for_binary_interface()
        assert_bootstrap_state(self, joiner, 'COMPLETED')

        # Cleanup guarantees each node only keeps sstables for its own ranges.
        cluster.cleanup()

        logger.debug("Check data is present")
        # Read everything back through the new node to confirm the resumed
        # bootstrap transferred the data completely.
        stress_out, _, _ = joiner.stress(['read', 'n=1k', 'no-warmup', '-schema',
                                          'replication(factor=2)', '-rate', 'threads=8'])

        if stress_out is not None:
            assert "FAILURE" not in stress_out
Example #52
0
    def test_read_from_bootstrapped_node(self):
        """
        Check that a freshly bootstrapped node returns the rows that already existed
        @jira_ticket CASSANDRA-6648
        """
        cluster = self.cluster
        cluster.populate(3)
        cluster.start()

        writer_node = cluster.nodes['node1']
        writer_node.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8',
                            '-schema', 'replication(factor=2)'])

        stress_table = 'keyspace1.standard1'
        select_all = "SELECT * FROM %s" % (stress_table,)

        # Snapshot the rows as seen through node1 before the ring changes.
        session = self.patient_cql_connection(writer_node)
        rows_before = list(session.execute(select_all))

        joiner = new_node(cluster)
        joiner.start(wait_for_binary_proto=True)

        # Run the same query over a connection exclusive to the new node.
        session = self.patient_exclusive_cql_connection(joiner)
        rows_after = list(session.execute(select_all))
        assert rows_before == rows_after
Example #53
0
    def test_failed_bootstrap_wiped_node_can_join(self):
        """
        @jira_ticket CASSANDRA-9765
        Check that a node whose bootstrap failed can still join the cluster after its data was wiped.
        """
        cluster = self.cluster
        cluster.populate(1)
        cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
        cluster.start(wait_for_binary_proto=True)

        stress_table = 'keyspace1.standard1'

        # Write enough data for the bootstrap to fail later on.
        seed = cluster.nodelist()[0]
        seed.stress(['write', 'n=100K', 'no-warmup', '-rate', 'threads=8'])
        seed.flush()

        # Read the table back through the first node; the rows are not used
        # further in this test.
        session = self.patient_cql_connection(seed)
        original_rows = list(session.execute("SELECT * FROM {}".format(stress_table,)))

        # bootstrap=True ensures that the added node is not a seed
        joiner = new_node(cluster, bootstrap=True)

        # A helper thread kills the joining node in the middle of its bootstrap.
        killer = KillOnBootstrap(joiner)
        killer.start()

        joiner.start()
        killer.join()
        assert not joiner.is_running()

        # Wipe whatever the aborted bootstrap left behind.
        self._cleanup(joiner)

        # Starting it again must be allowed: wait for it to report JOINING.
        restart_mark = joiner.mark_log()
        joiner.start(wait_other_notice=True)
        joiner.watch_log_for("JOINING:", from_mark=restart_mark)
Example #54
0
    def simple_bootstrap_test_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
        2*streaming_keep_alive_period_in_secs to receive a single sstable
        """
        cluster = self.cluster
        # Throttle outbound streaming and use a very short socket timeout and
        # keep-alive period, so receiving one sstable outlasts both of them.
        cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1,
                                                  'streaming_socket_timeout_in_ms': 1000,
                                                  'streaming_keep_alive_period_in_secs': 1})

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        cluster.start(wait_other_notice=True)

        # Two stress runs, each followed by a flush and with compaction
        # disabled, so that more than one sstable is left on disk.
        stress_args = ['write', 'n=50K', '-rate', 'threads=8', '-schema',
                       'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)']
        for _ in range(2):
            node1.stress(list(stress_args))
            cluster.flush()

        # Compare the NUMBER of sstables, not the returned collection itself:
        # ordering a list against an int raises TypeError on Python 3 and is
        # a meaningless comparison on Python 2.
        self.assertGreater(len(node1.get_sstables("keyspace1", "standard1")), 1)

        # Bootstrapping a new node with very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Shouldn't fail due to streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        # Every node must have logged the keep-alive scheduling and traffic.
        for node in cluster.nodelist():
            self.assertTrue(node.grep_log('Scheduling keep-alive task with 1s period.', filename='debug.log'))
            self.assertTrue(node.grep_log('Sending keep-alive', filename='debug.log'))
            self.assertTrue(node.grep_log('Received keep-alive', filename='debug.log'))
    def test_bootstrap(self):
        """ Check that repaired data stays in sync once a new node has bootstrapped """
        overrides = ImmutableMapping({'hinted_handoff_enabled': 'false',
                                      'commitlog_sync_period_in_ms': 500})
        self.fixture_dtest_setup.setup_overrides.cluster_options = overrides
        self.init_default_config()
        self.cluster.populate(3).start()
        node1, node2, node3 = self.cluster.nodelist()

        def assert_repaired_in_sync(targets):
            # Run a validating repair of `ks` on each node and check its report.
            for target in targets:
                outcome = target.repair(options=['ks', '--validate'])
                assert "Repaired data is in sync" in outcome.stdout

        session = self.patient_exclusive_cql_connection(node3)
        session.execute("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 2}")
        session.execute("CREATE TABLE ks.tbl (k INT PRIMARY KEY, v INT)")

        insert = SimpleStatement("INSERT INTO ks.tbl (k,v) VALUES (%s, %s)")

        # First batch of rows, written before the repair.
        for key in range(1000):
            session.execute(insert, (key, key))

        node1.repair(options=['ks'])

        # Second batch of rows, written after the repair.
        for key in range(1000, 2000):
            session.execute(insert, (key, key))

        # everything should be in sync
        assert_repaired_in_sync([node1, node2, node3])

        joiner = new_node(self.cluster)
        joiner.start(wait_for_binary_proto=True)

        assert len(self.cluster.nodelist()) == 4
        # everything should still be in sync, the new node included
        assert_repaired_in_sync(self.cluster.nodelist())
Example #56
0
 def default_bootstrap(cluster, token):
     """Add a node with the given initial token and start it.

     The node is created with ``new_node``, gets ``initial_token`` set to
     *token*, and is returned once ``start(wait_for_binary_proto=True)``
     has returned.
     """
     joiner = new_node(cluster)
     token_options = {'initial_token': token}
     joiner.set_configuration_options(values=token_options)
     joiner.start(wait_for_binary_proto=True)
     return joiner