Code example #1
    def test_bootstrap_waits_for_streaming_to_finish(self):
        """
        Test that bootstrap completes and is marked as such after streaming finishes.
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

        logger.debug("Create a cluster")
        cluster.populate(1)
        node1 = cluster.nodelist()[0]

        logger.debug("Start node 1")
        node1.start(wait_for_binary_proto=True)

        logger.debug("Insert 10k rows")
        node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor=2)'])

        logger.debug("Bootstrap node 2 with delay")
        node2 = new_node(cluster, byteman_port='4200')
        node2.update_startup_byteman_script('./byteman/bootstrap_5s_sleep.btm')
        node2.start(wait_for_binary_proto=True)

        assert_bootstrap_state(self, node2, 'COMPLETED')
        assert node2.grep_log('Bootstrap completed', filename='debug.log')
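
Nearly every example in this collection ends with assert_bootstrap_state. As a rough guide to what it checks, here is a minimal sketch of such a helper, assuming the node reports bootstrap progress in the 'bootstrapped' column of system.local and that the tester exposes a patient_exclusive_cql_connection method; this is an illustrative reimplementation, not the dtest source.

    def assert_bootstrap_state(tester, node, expected_bootstrap_state, user=None, password=None):
        # Sketch (assumed): a node records its bootstrap progress in the
        # 'bootstrapped' column of system.local, e.g. 'IN_PROGRESS' or 'COMPLETED'.
        session = tester.patient_exclusive_cql_connection(node, user=user, password=password)
        rows = list(session.execute("SELECT bootstrapped FROM system.local WHERE key='local'"))
        assert rows[0].bootstrapped == expected_bootstrap_state
        session.shutdown()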
Code example #2
    def test_bootstrap_with_reset_bootstrap_state(self):
        """Test bootstrap with resetting bootstrap progress"""
        cluster = self.cluster
        cluster.set_configuration_options(values={'stream_throughput_outbound_megabits_per_sec': 1})
        cluster.populate(2).start(wait_other_notice=True)

        node1 = cluster.nodes['node1']
        node1.stress(['write', 'n=100K', '-schema', 'replication(factor=2)'])
        node1.flush()

        # kill node1 in the middle of streaming to let it fail
        t = InterruptBootstrap(node1)
        t.start()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        try:
            node3.start()
        except NodeError:
            pass  # the node is expected to fail to start
        t.join()
        node1.start()

        # restart node3 bootstrap with resetting bootstrap progress
        node3.stop(signal_event=signal.SIGKILL)
        mark = node3.mark_log()
        node3.start(jvm_args=["-Dcassandra.reset_bootstrap_progress=true"])
        # check if we reset bootstrap state
        node3.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
        # wait for node3 ready to query
        node3.wait_for_binary_interface(from_mark=mark)

        # check if 2nd bootstrap succeeded
        assert_bootstrap_state(self, node3, 'COMPLETED')
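
InterruptBootstrap is a helper thread defined elsewhere in the suite. A plausible sketch, assuming "Prepare completed" is the log line that marks the start of streaming on the source node:

    from threading import Thread

    class InterruptBootstrap(Thread):
        """Kill the streaming source once streaming starts, failing the bootstrap."""

        def __init__(self, node):
            Thread.__init__(self)
            self.node = node

        def run(self):
            # wait until the source has set up its streaming session, then
            # kill it without draining so the joining node's stream fails
            self.node.watch_log_for("Prepare completed")
            self.node.stop(gently=False)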
Code example #3
    def test_multi_dc_replace_with_rf1(self):
        """
        Test that multi-dc replace works when rf=1 on each dc
        """
        self._setup(n=[1, 1])

        yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
        with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
            stress_config.write(yaml_config)
            stress_config.flush()
            self.query_node.stress([
                'user', 'profile=' + stress_config.name, 'n=10k', 'no-warmup',
                'ops(insert=1)', '-rate', 'threads=5'
            ])
            # Sleep for a bit to let things catch up: we frequently do a lot of
            # GC after the stress invocation above, which can cause the next
            # step of the test to time out.
            time.sleep(30)

        # Save initial data
        table_name = 'keyspace1.users'
        initial_data = self._fetch_initial_data(table=table_name,
                                                cl=ConsistencyLevel.TWO)

        self._stop_node_to_replace(table=table_name)

        self._do_replace(data_center='dc2')

        assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

        # Check that keyspace was replicated from dc1 to dc2
        assert not self.replacement_node.grep_log(
            "Unable to find sufficient sources for streaming range")

        self._verify_data(initial_data,
                          table=table_name,
                          cl=ConsistencyLevel.LOCAL_ONE)
Code example #4
    def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
        """
        Test that consistent bootstrap will not succeed when there are insufficient replicas
        @jira_ticket CASSANDRA-11848
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

        cluster.populate(2)
        node1, node2 = cluster.nodelist()

        node3_token = None
        # Make token assignment deterministic
        if not self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 1})
            tokens = cluster.balanced_tokens(3)
            logger.debug("non-vnode tokens: %r" % (tokens,))
            node1.set_configuration_options(values={'initial_token': tokens[0]})
            node2.set_configuration_options(values={'initial_token': tokens[2]})
            node3_token = tokens[1]  # Add node 3 between node1 and node2

        cluster.start()

        node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor={})'.format(rf)])

        # raise the system_auth keyspace replication factor to 2 (default is 1)
        # to avoid an "Unable to find sufficient sources for streaming" warning
        if cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(node1)
            session.execute("""
                ALTER KEYSPACE system_auth
                    WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
            """)

        # Stop node2, so node3 will not be able to perform consistent range movement
        node2.stop(wait_other_notice=True)

        successful_bootstrap_expected = not consistent_range_movement

        node3 = new_node(cluster, token=node3_token)
        node3.start(wait_for_binary_proto=successful_bootstrap_expected, wait_other_notice=successful_bootstrap_expected,
                    jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

        if successful_bootstrap_expected:
            # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
            if not consistent_range_movement and rf == 1:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert node3.is_running()
            assert_bootstrap_state(self, node3, 'COMPLETED')
        else:
            if consistent_range_movement:
                if cluster.version() < '4.0':
                    node3.watch_log_for("A node required to move the data consistently is down")
                else:
                    node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
            else:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert_not_running(node3)
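
assert_not_running is another suite helper; a minimal sketch (assumed implementation) simply polls ccm's process state:

    import time

    def assert_not_running(node, timeout=10):
        # Sketch (assumed): poll until the node's process has exited, failing
        # if it is still alive once the timeout (in seconds) elapses.
        for _ in range(timeout):
            if not node.is_running():
                return
            time.sleep(1)
        raise AssertionError("node {} is still running".format(node.name))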
Code example #5
    def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
        """
        Test that consistent bootstrap will not succeed when there are insufficient replicas
        @jira_ticket CASSANDRA-11848
        """
        cluster = self.cluster

        cluster.populate(2)
        node1, node2 = cluster.nodelist()

        node3_token = None
        # Make token assignment deterministic
        if not self.dtest_config.use_vnodes:
            cluster.set_configuration_options(values={'num_tokens': 1})
            tokens = cluster.balanced_tokens(3)
            logger.debug("non-vnode tokens: %r" % (tokens,))
            node1.set_configuration_options(values={'initial_token': tokens[0]})
            node2.set_configuration_options(values={'initial_token': tokens[2]})
            node3_token = tokens[1]  # Add node 3 between node1 and node2

        cluster.start()

        node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8', '-schema', 'replication(factor={})'.format(rf)])

        # raise the system_auth keyspace replication factor to 2 (default is 1)
        # to avoid an "Unable to find sufficient sources for streaming" warning
        if cluster.cassandra_version() >= '2.2.0':
            session = self.patient_cql_connection(node1)
            session.execute("""
                ALTER KEYSPACE system_auth
                    WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
            """)

        # Stop node2, so node3 will not be able to perform consistent range movement
        node2.stop(wait_other_notice=True)

        successful_bootstrap_expected = not consistent_range_movement

        node3 = new_node(cluster, token=node3_token)
        node3.start(wait_for_binary_proto=successful_bootstrap_expected, wait_other_notice=successful_bootstrap_expected,
                    jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

        if successful_bootstrap_expected:
            # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
            if not consistent_range_movement and rf == 1:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert node3.is_running()
            assert_bootstrap_state(self, node3, 'COMPLETED')
        else:
            if consistent_range_movement:
                if cluster.version() < '4.0':
                    node3.watch_log_for("A node required to move the data consistently is down")
                else:
                    node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
            else:
                node3.watch_log_for("Unable to find sufficient sources for streaming range")
            assert_not_running(node3)
Code example #6
    def _test_restart_failed_replace(self, mode):
        self.ignore_log_patterns = list(self.ignore_log_patterns) + [r'Error while waiting on bootstrap to complete']
        self._setup(n=3, enable_byteman=True)
        self._insert_data(n="1k")

        initial_data = self._fetch_initial_data()

        self._stop_node_to_replace()

        debug("Submitting byteman script to make stream fail")
        self.query_node.byteman_submit(['./byteman/stream_failure.btm'])

        self._do_replace(jvm_option='replace_address_first_boot',
                         opts={'streaming_socket_timeout_in_ms': 1000})

        # Make sure bootstrap did not complete successfully
        assert_bootstrap_state(self, self.replacement_node, 'IN_PROGRESS')

        if mode == 'reset_resume_state':
            mark = self.replacement_node.mark_log()
            debug("Restarting replacement node with -Dcassandra.reset_bootstrap_progress=true")
            # restart replacement node with resetting bootstrap state
            self.replacement_node.stop()
            self.replacement_node.start(jvm_args=[
                                        "-Dcassandra.replace_address_first_boot={}".format(self.replaced_node.address()),
                                        "-Dcassandra.reset_bootstrap_progress=true"
                                        ],
                                        wait_for_binary_proto=True)
            # check if we reset bootstrap state
            self.replacement_node.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
        elif mode == 'resume':
            debug("Resuming failed bootstrap")
            self.replacement_node.nodetool('bootstrap resume')
            # check if we skipped already retrieved ranges
            self.replacement_node.watch_log_for("already available. Skipping streaming.")
            self.replacement_node.watch_log_for("Resume complete")
        elif mode == 'wipe':
            self.replacement_node.stop()

            debug("Waiting other nodes to detect node stopped")
            self.query_node.watch_log_for("FatClient /{} has been silent for 30000ms, removing from gossip".format(self.replacement_node.address()), timeout=60)
            self.query_node.watch_log_for("Node /{} failed during replace.".format(self.replacement_node.address()), timeout=60, filename='debug.log')

            debug("Restarting node after wiping data")
            self._cleanup(self.replacement_node)
            self.replacement_node.start(jvm_args=["-Dcassandra.replace_address_first_boot={}".format(self.replaced_node.address())],
                                        wait_for_binary_proto=True)
        else:
            raise RuntimeError('invalid mode value {mode}'.format(mode=mode))

        # check if bootstrap succeeded
        assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

        debug("Bootstrap finished successully, verifying data.")

        self._verify_data(initial_data)
Code example #7
    def test_resumable_bootstrap(self):
        """
        Test resuming bootstrap after data streaming failure
        """
        cluster = self.cluster
        cluster.populate(2)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start(wait_other_notice=True)
        # kill stream to node3 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit(['./byteman/pre4.0/stream_failure.btm'])
        else:
            node1.byteman_submit(['./byteman/4.0/stream_failure.btm'])
        node1.stress([
            'write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
            'replication(factor=2)', '-rate', 'threads=50'
        ])
        cluster.flush()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        node3.start(wait_other_notice=False, wait_for_binary_proto=True)

        # wait for node3 ready to query
        node3.watch_log_for("Starting listening for CQL clients")
        mark = node3.mark_log()
        # check if node3 is still in bootstrap mode
        retry_till_success(assert_bootstrap_state,
                           tester=self,
                           node=node3,
                           expected_bootstrap_state='IN_PROGRESS',
                           timeout=120)

        # streaming failed, so invoke 'nodetool bootstrap resume' on node3 to finish bootstrapping
        node3.nodetool('bootstrap resume')

        node3.watch_log_for("Resume complete", from_mark=mark)
        assert_bootstrap_state(self, node3, 'COMPLETED')

        # cleanup to guarantee each node will only have sstables of its ranges
        cluster.cleanup()

        logger.debug("Check data is present")
        # Let's check stream bootstrap completely transferred data
        stdout, stderr, _ = node3.stress([
            'read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)',
            '-rate', 'threads=8'
        ])

        if stdout is not None:
            assert "FAILURE" not in stdout
Code example #8
        def bootstrap_on_write_survey_and_join(cluster, token):
            node2 = new_node(cluster)
            node2.set_configuration_options(values={'initial_token': token})
            node2.start(jvm_args=["-Dcassandra.write_survey=true"], wait_for_binary_proto=True)

            assert len(node2.grep_log('Startup complete, but write survey mode is active, not becoming an active ring member.'))
            assert_bootstrap_state(self, node2, 'IN_PROGRESS')

            node2.nodetool("join")
            assert len(node2.grep_log('Leaving write survey mode and joining ring at operator request'))
            return node2
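
This nested helper matches the bootstrap(cluster, token) callable accepted by _base_bootstrap_test (examples #17 and #19), so the enclosing test presumably wires it in along these lines (hypothetical caller, shown for illustration):

    def test_write_survey(self):
        # pass the helper as the pluggable bootstrap step
        self._base_bootstrap_test(bootstrap=bootstrap_on_write_survey_and_join)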
Code example #9
    def test_multi_dc_replace_with_rf1(self):
        """
        Test that multi-dc replace works when rf=1 on each dc
        """
        self._setup(n=[1, 1])

        yaml_config = """
        # Create the keyspace and table
        keyspace: keyspace1
        keyspace_definition: |
          CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
        table: users
        table_definition:
          CREATE TABLE users (
            username text,
            first_name text,
            last_name text,
            email text,
            PRIMARY KEY(username)
          ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
        insert:
          partitions: fixed(1)
          batchtype: UNLOGGED
        queries:
          read:
            cql: select * from users where username = ?
            fields: samerow
        """
        with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
            stress_config.write(yaml_config)
            stress_config.flush()
            self.query_node.stress(['user', 'profile=' + stress_config.name, 'n=10k', 'no-warmup',
                                    'ops(insert=1)', '-rate', 'threads=5'])
            # Sleep for a bit to let things catch up: we frequently do a lot of
            # GC after the stress invocation above, which can cause the next
            # step of the test to time out.
            time.sleep(30)

        # Save initial data
        table_name = 'keyspace1.users'
        initial_data = self._fetch_initial_data(table=table_name, cl=ConsistencyLevel.TWO)

        self._stop_node_to_replace(table=table_name)

        self._do_replace(data_center='dc2')

        assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

        # Check that keyspace was replicated from dc1 to dc2
        assert not self.replacement_node.grep_log("Unable to find sufficient sources for streaming range")

        self._verify_data(initial_data, table=table_name, cl=ConsistencyLevel.LOCAL_ONE)
Code example #10
    def test_resumable_bootstrap(self):
        """
        Test resuming bootstrap after data streaming failure
        """
        cluster = self.cluster
        cluster.set_environment_variable(
            'CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(2)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start()
        # kill stream to node3 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
        node1.stress([
            'write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema',
            'replication(factor=2)', '-rate', 'threads=50'
        ])
        cluster.flush()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        node3.start(wait_other_notice=False)

        # let streaming fail as we expect
        node3.watch_log_for('Some data streaming failed')

        # streaming failed, so invoke 'nodetool bootstrap resume' to finish bootstrapping
        node3.nodetool('bootstrap resume')
        node3.wait_for_binary_interface()
        assert_bootstrap_state(self, node3, 'COMPLETED')

        # cleanup to guarantee each node will only have sstables of its ranges
        cluster.cleanup()

        logger.debug("Check data is present")
        # Let's check stream bootstrap completely transferred data
        stdout, stderr, _ = node3.stress([
            'read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)',
            '-rate', 'threads=8'
        ])

        if stdout is not None:
            assert "FAILURE" not in stdout
Code example #11
    def test_simple_bootstrap_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
        2*streaming_keep_alive_period_in_secs to receive a single sstable
        """
        cluster = self.cluster
        yaml_opts = {'streaming_keep_alive_period_in_secs': 2}
        if cluster.version() < '4.0':
            yaml_opts['streaming_socket_timeout_in_ms'] = 1000
        cluster.set_configuration_options(values=yaml_opts)

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]

        logger.debug("Setting up byteman on {}".format(node1.name))
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start()

        # Create more than one sstable larger than 1MB
        node1.stress([
            'write', 'n=1K', '-rate', 'threads=8', '-schema',
            'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'
        ])
        cluster.flush()

        logger.debug("Submitting byteman script to {} to".format(node1.name))
        # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
        node1.byteman_submit([mk_bman_path('stream_5s_sleep.btm')])

        # Bootstrapping a new node with very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Shouldn't fail due to streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        if cluster.version() < '4.0':
            for node in cluster.nodelist():
                assert node.grep_log(
                    'Scheduling keep-alive task with 2s period.',
                    filename='debug.log')
                assert node.grep_log('Sending keep-alive',
                                     filename='debug.log')
                assert node.grep_log('Received keep-alive',
                                     filename='debug.log')
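
mk_bman_path resolves a script name against the repository's byteman directory. A one-line sketch, assuming it mirrors the literal './byteman/...' paths used in the other examples:

    import os

    def mk_bman_path(path):
        # e.g. mk_bman_path('stream_5s_sleep.btm') -> './byteman/stream_5s_sleep.btm'
        return os.path.join('.', 'byteman', path)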
Code example #12
    def test_simple_bootstrap_nodata(self):
        """
        @jira_ticket CASSANDRA-11010
        Test that bootstrap completes if streaming from nodes with no data
        """
        cluster = self.cluster
        # Create a two-node cluster
        cluster.populate(2)
        cluster.start(wait_other_notice=True)

        # Bootstrapping a new node
        node3 = new_node(cluster)
        node3.start(wait_for_binary_proto=True, wait_other_notice=True)

        assert_bootstrap_state(self, node3, 'COMPLETED')
Code example #13
    def test_simple_bootstrap_nodata(self):
        """
        @jira_ticket CASSANDRA-11010
        Test that bootstrap completes if streaming from nodes with no data
        """
        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        # Create a two-node cluster
        cluster.populate(2)
        cluster.start()

        # Bootstrapping a new node
        node3 = new_node(cluster)
        node3.start(wait_for_binary_proto=True)

        assert_bootstrap_state(self, node3, 'COMPLETED')
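
new_node is the suite's node factory. A rough sketch, assuming it allocates the next loopback address and ports and registers the node with ccm; the exact Node constructor arguments used by the real helper may differ:

    from ccmlib.node import Node

    def new_node(cluster, bootstrap=True, token=None, data_center=None, byteman_port='0'):
        i = len(cluster.nodes) + 1
        node = Node('node%d' % i, cluster, bootstrap,
                    ('127.0.0.%d' % i, 9160),    # thrift interface (assumed scheme)
                    ('127.0.0.%d' % i, 7000),    # storage interface (assumed scheme)
                    str(7000 + i * 100),         # jmx port (assumed scheme)
                    '0',                         # remote debug port disabled
                    token,
                    binary_interface=('127.0.0.%d' % i, 9042),
                    byteman_port=byteman_port)
        cluster.add(node, not bootstrap, data_center=data_center)
        return node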
Code example #14
    def resumable_bootstrap_test(self):
        """
        Test resuming bootstrap after data streaming failure
        """

        cluster = self.cluster
        cluster.populate(2)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start(wait_other_notice=True)
        # kill stream to node3 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit(['./byteman/pre4.0/stream_failure.btm'])
        else:
            node1.byteman_submit(['./byteman/4.0/stream_failure.btm'])
        node1.stress(['write', 'n=1K', 'no-warmup', 'cl=TWO', '-schema', 'replication(factor=2)', '-rate', 'threads=50'])
        cluster.flush()

        # start bootstrapping node3 and wait for streaming
        node3 = new_node(cluster)
        node3.start(wait_other_notice=False, wait_for_binary_proto=True)

        # wait for node3 ready to query
        node3.watch_log_for("Starting listening for CQL clients")
        mark = node3.mark_log()
        # check if node3 is still in bootstrap mode
        assert_bootstrap_state(self, node3, 'IN_PROGRESS')

        # streaming failed, so invoke 'nodetool bootstrap resume' to finish bootstrapping
        node3.nodetool('bootstrap resume')

        node3.watch_log_for("Resume complete", from_mark=mark)
        assert_bootstrap_state(self, node3, 'COMPLETED')

        # cleanup to guarantee each node will only have sstables of its ranges
        cluster.cleanup()

        debug("Check data is present")
        # Let's check stream bootstrap completely transferred data
        stdout, stderr, _ = node3.stress(['read', 'n=1k', 'no-warmup', '-schema', 'replication(factor=2)', '-rate', 'threads=8'])

        if stdout is not None:
            self.assertNotIn("FAILURE", stdout)
Code example #15
    def simple_bootstrap_test_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
        2*streaming_keep_alive_period_in_secs to receive a single sstable
        """
        cluster = self.cluster
        cluster.set_configuration_options(
            values={
                'stream_throughput_outbound_megabits_per_sec': 1,
                'streaming_socket_timeout_in_ms': 1000,
                'streaming_keep_alive_period_in_secs': 1
            })

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        cluster.start(wait_other_notice=True)

        # Create more than one sstable larger than 1MB
        node1.stress([
            'write', 'n=50K', '-rate', 'threads=8', '-schema',
            'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'
        ])
        cluster.flush()
        node1.stress([
            'write', 'n=50K', '-rate', 'threads=8', '-schema',
            'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'
        ])
        cluster.flush()
        self.assertGreater(len(node1.get_sstables("keyspace1", "standard1")), 1)

        # Bootstrapping a new node with very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Shouldn't fail due to streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        for node in cluster.nodelist():
            self.assertTrue(
                node.grep_log('Scheduling keep-alive task with 1s period.',
                              filename='debug.log'))
            self.assertTrue(
                node.grep_log('Sending keep-alive', filename='debug.log'))
            self.assertTrue(
                node.grep_log('Received keep-alive', filename='debug.log'))
Code example #16
    def test_simple_bootstrap_small_keepalive_period(self):
        """
        @jira_ticket CASSANDRA-11841
        Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
        2*streaming_keep_alive_period_in_secs to receive a single sstable
        """
        cluster = self.cluster
        yaml_opts = {'streaming_keep_alive_period_in_secs': 2}
        if cluster.version() < '4.0':
            yaml_opts['streaming_socket_timeout_in_ms'] = 1000
        cluster.set_configuration_options(values=yaml_opts)

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]

        logger.debug("Setting up byteman on {}".format(node1.name))
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start(wait_other_notice=True)

        # Create more than one sstable larger than 1MB
        node1.stress(['write', 'n=1K', '-rate', 'threads=8', '-schema',
                      'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'])
        cluster.flush()

        logger.debug("Submitting byteman script to {} to".format(node1.name))
        # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
        node1.byteman_submit(['./byteman/stream_5s_sleep.btm'])

        # Bootstrapping a new node with very small streaming_socket_timeout_in_ms
        node2 = new_node(cluster)
        node2.start(wait_for_binary_proto=True)

        # Shouldn't fail due to streaming socket timeout
        assert_bootstrap_state(self, node2, 'COMPLETED')

        for node in cluster.nodelist():
            assert node.grep_log('Scheduling keep-alive task with 2s period.', filename='debug.log')
            assert node.grep_log('Sending keep-alive', filename='debug.log')
            assert node.grep_log('Received keep-alive', filename='debug.log')
Code example #17
    def _base_bootstrap_test(self, bootstrap=None, bootstrap_from_version=None):
        def default_bootstrap(cluster, token):
            node2 = new_node(cluster)
            node2.set_configuration_options(values={'initial_token': token})
            node2.start(wait_for_binary_proto=True)
            return node2

        if bootstrap is None:
            bootstrap = default_bootstrap

        cluster = self.cluster
        tokens = cluster.balanced_tokens(2)
        cluster.set_configuration_options(values={'num_tokens': 1})

        debug("[node1, node2] tokens: %r" % (tokens,))

        keys = 10000

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        if bootstrap_from_version:
            debug("starting source node on version {}".format(bootstrap_from_version))
            node1.set_install_dir(version=bootstrap_from_version)
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        cluster.start(wait_other_notice=True)

        session = self.patient_cql_connection(node1)
        self.create_ks(session, 'ks', 1)
        self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # record the size before inserting any of our own data
        empty_size = node1.data_size()
        debug("node1 empty size : %s" % float(empty_size))

        insert_statement = session.prepare("INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement, [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        initial_size = node1.data_size()
        debug("node1 size before bootstrapping node2: %s" % float(initial_size))

        # Read the inserted data continuously during the bootstrap process; we
        # shouldn't get any errors
        reader = self.go(lambda _: query_c1c2(session, random.randint(0, keys - 1), ConsistencyLevel.ONE))

        # Bootstrapping a new node in the current version
        node2 = bootstrap(cluster, tokens[1])
        node2.compact()

        reader.check()
        node1.cleanup()
        debug("node1 size after cleanup: %s" % float(node1.data_size()))
        node1.compact()
        debug("node1 size after compacting: %s" % float(node1.data_size()))
        time.sleep(.5)
        reader.check()

        debug("node2 size after compacting: %s" % float(node2.data_size()))

        size1 = float(node1.data_size())
        size2 = float(node2.data_size())
        assert_almost_equal(size1, size2, error=0.3)
        assert_almost_equal(float(initial_size - empty_size), 2 * (size1 - float(empty_size)))

        assert_bootstrap_state(self, node2, 'COMPLETED')
        if bootstrap_from_version:
            self.assertTrue(node2.grep_log('does not support keep-alive', filename='debug.log'))
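
assert_almost_equal compares its arguments within a relative error band; a minimal sketch (the default tolerance here is an assumption):

    def assert_almost_equal(*args, **kwargs):
        error = kwargs.get('error', 0.16)  # relative tolerance; default assumed
        vmax, vmin = max(args), min(args)
        assert vmin > vmax * (1.0 - error), \
            "values not within {:.0%} of the max: {}".format(error, args)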
Code example #18
    def _test_restart_failed_replace(self, mode):
        self.fixture_dtest_setup.ignore_log_patterns = list(
            self.fixture_dtest_setup.ignore_log_patterns) + [
                r'Error while waiting on bootstrap to complete'
            ]

        self._setup(n=3, enable_byteman=True)
        self._insert_data(n="1k")

        initial_data = self._fetch_initial_data()

        self._stop_node_to_replace()

        logger.debug("Submitting byteman script to make stream fail")
        btmmark = self.query_node.mark_log()

        if self.cluster.version() < '4.0':
            self.query_node.byteman_submit(
                [mk_bman_path('pre4.0/stream_failure.btm')])
            self._do_replace(jvm_option='replace_address_first_boot',
                             opts={'streaming_socket_timeout_in_ms': 1000},
                             wait_for_binary_proto=False,
                             wait_other_notice=True)
        else:
            self.query_node.byteman_submit(
                [mk_bman_path('4.0/stream_failure.btm')])
            self._do_replace(jvm_option='replace_address_first_boot',
                             wait_for_binary_proto=False,
                             wait_other_notice=True)

        # Make sure bootstrap did not complete successfully
        self.query_node.watch_log_for("Triggering network failure",
                                      from_mark=btmmark)
        self.query_node.watch_log_for("Stream failed", from_mark=btmmark)
        self.replacement_node.watch_log_for("Stream failed")
        self.replacement_node.watch_log_for(
            "Some data streaming failed.*IN_PROGRESS$")

        if mode == 'reset_resume_state':
            mark = self.replacement_node.mark_log()
            logger.debug(
                "Restarting replacement node with -Dcassandra.reset_bootstrap_progress=true"
            )
            # restart replacement node with resetting bootstrap state (with 180s timeout)
            self.replacement_node.stop()
            self.replacement_node.start(jvm_args=[
                "-Dcassandra.replace_address_first_boot={}".format(
                    self.replaced_node.address()),
                "-Dcassandra.reset_bootstrap_progress=true"
            ],
                                        wait_for_binary_proto=180)
            # check if we reset bootstrap state
            self.replacement_node.watch_log_for(
                "Resetting bootstrap progress to start fresh", from_mark=mark)
        elif mode == 'resume':
            logger.debug("Resuming failed bootstrap")
            self.replacement_node.nodetool('bootstrap resume')
            # check if we skipped already retrieved ranges
            self.replacement_node.watch_log_for(
                "already available. Skipping streaming.")
            self.replacement_node.watch_log_for("Resume complete")
        elif mode == 'wipe':
            self.replacement_node.stop()

            logger.debug("Waiting other nodes to detect node stopped")
            node_log_str = self.replacement_node.address_for_current_version_slashy(
            )
            self.query_node.watch_log_for(
                "FatClient {} has been silent for 30000ms, removing from gossip"
                .format(node_log_str),
                timeout=120)
            self.query_node.watch_log_for(
                "Node {} failed during replace.".format(node_log_str),
                timeout=120,
                filename='debug.log')

            logger.debug("Restarting node after wiping data")
            self._cleanup(self.replacement_node)
            self.replacement_node.start(jvm_args=[
                "-Dcassandra.replace_address_first_boot={}".format(
                    self.replaced_node.address())
            ],
                                        wait_for_binary_proto=120)
        else:
            raise RuntimeError('invalid mode value {mode}'.format(mode=mode))

        # check if bootstrap succeeded
        assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

        logger.debug("Bootstrap finished successfully, verifying data.")

        self._verify_data(initial_data)
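
The 'wipe' branch relies on self._cleanup to erase the replacement node's on-disk state before restarting it. A plausible sketch of that method, assuming it recreates the data and commitlog directories via ccm's path helpers:

    import os
    import shutil

    def _cleanup(self, node):
        commitlog_dir = os.path.join(node.get_path(), 'commitlogs')
        for data_dir in node.data_directories():
            logger.debug("Deleting {}".format(data_dir))
            shutil.rmtree(data_dir)
            os.mkdir(data_dir)
        shutil.rmtree(commitlog_dir)
        os.mkdir(commitlog_dir)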
Code example #19
    def _base_bootstrap_test(self,
                             bootstrap=None,
                             bootstrap_from_version=None,
                             enable_ssl=None):
        def default_bootstrap(cluster, token):
            node2 = new_node(cluster)
            node2.set_configuration_options(values={'initial_token': token})
            node2.start(wait_for_binary_proto=True)
            return node2

        if bootstrap is None:
            bootstrap = default_bootstrap

        cluster = self.cluster

        if enable_ssl:
            logger.debug("***using internode ssl***")
            generate_ssl_stores(self.fixture_dtest_setup.test_path)
            cluster.enable_internode_ssl(self.fixture_dtest_setup.test_path)

        tokens = cluster.balanced_tokens(2)
        cluster.set_configuration_options(values={'num_tokens': 1})

        logger.debug("[node1, node2] tokens: %r" % (tokens, ))

        keys = 10000

        # Create a single node cluster
        cluster.populate(1)
        node1 = cluster.nodelist()[0]
        if bootstrap_from_version:
            logger.debug("starting source node on version {}".format(
                bootstrap_from_version))
            node1.set_install_dir(version=bootstrap_from_version)
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        cluster.start(wait_other_notice=True)

        session = self.patient_cql_connection(node1)
        create_ks(session, 'ks', 1)
        create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

        # record the size before inserting any of our own data
        empty_size = data_size(node1, 'ks', 'cf')
        logger.debug("node1 empty size for ks.cf: %s" % float(empty_size))

        insert_statement = session.prepare(
            "INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
        execute_concurrent_with_args(session, insert_statement,
                                     [['k%d' % k] for k in range(keys)])

        node1.flush()
        node1.compact()
        initial_size = data_size(node1, 'ks', 'cf')
        logger.debug("node1 size for ks.cf before bootstrapping node2: %s" %
                     float(initial_size))

        # Read some of the inserted data before bootstrapping; we shouldn't
        # get any error
        query_c1c2(session, random.randint(0, keys - 1), ConsistencyLevel.ONE)
        session.shutdown()

        # Bootstrapping a new node in the current version
        node2 = bootstrap(cluster, tokens[1])
        node2.compact()

        node1.cleanup()
        logger.debug("node1 size for ks.cf after cleanup: %s" %
                     float(data_size(node1, 'ks', 'cf')))
        node1.compact()
        logger.debug("node1 size for ks.cf after compacting: %s" %
                     float(data_size(node1, 'ks', 'cf')))

        logger.debug("node2 size for ks.cf after compacting: %s" %
                     float(data_size(node2, 'ks', 'cf')))

        size1 = float(data_size(node1, 'ks', 'cf'))
        size2 = float(data_size(node2, 'ks', 'cf'))
        assert_almost_equal(size1, size2, error=0.3)
        assert_almost_equal(float(initial_size - empty_size),
                            2 * (size1 - float(empty_size)))

        assert_bootstrap_state(self, node2, 'COMPLETED')
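
Unlike example #17, this variant measures per-table size through a data_size helper. A hedged sketch, assuming it parses 'nodetool tablestats' output (the exact field label can differ across Cassandra versions):

    import re

    def data_size(node, ks, table):
        out, _, _ = node.nodetool('tablestats {}.{}'.format(ks, table))
        match = re.search(r'Space used \(total\):\s*(\d+)', out)
        return int(match.group(1))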
Code example #20
    def test_bootstrap_binary_disabled(self):
        """
        Test that the binary transport stays disabled while bootstrap is incomplete and streaming fails
        @jira_ticket CASSANDRA-14526, CASSANDRA-14525
        """
        config = {
            'authenticator': 'org.apache.cassandra.auth.PasswordAuthenticator',
            'authorizer': 'org.apache.cassandra.auth.CassandraAuthorizer',
            'role_manager': 'org.apache.cassandra.auth.CassandraRoleManager',
            'permissions_validity_in_ms': 0,
            'roles_validity_in_ms': 0
        }

        cluster = self.cluster
        cluster.populate(1)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start(wait_other_notice=True)
        # kill stream to node2 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
        node1.stress([
            'write', 'n=1K', 'no-warmup', 'cl=ONE', '-schema',
            'replication(factor=3)', '-rate', 'threads=50', '-mode', 'native',
            'cql3', 'user=cassandra', 'password=cassandra'
        ])
        cluster.flush()

        # start bootstrapping node2 and wait for streaming
        node2 = new_node(cluster)
        node2.set_configuration_options(values=config)
        node2.byteman_port = '8101'  # set for when we add node3
        node2.import_config_files()
        node2.start(jvm_args=["-Dcassandra.ring_delay_ms=5000"],
                    wait_other_notice=True)
        self.assert_log_had_msg(node2,
                                'Some data streaming failed',
                                timeout=30)
        self.assert_log_had_msg(
            node2,
            'Not starting client transports as bootstrap has not completed',
            timeout=30)

        try:
            node2.nodetool('join')
            pytest.fail('nodetool should have errored and failed to join ring')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout

        node2.nodetool('bootstrap resume')
        node2.wait_for_binary_interface()
        assert_bootstrap_state(self,
                               node2,
                               'COMPLETED',
                               user='cassandra',
                               password='cassandra')

        # Test write survey behaviour
        node3 = new_node(cluster)
        node3.set_configuration_options(values=config)

        # kill stream to node3 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
            node2.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
            node2.byteman_submit([self.byteman_submit_path_4_0])
        node3.start(jvm_args=[
            "-Dcassandra.write_survey=true", "-Dcassandra.ring_delay_ms=5000"
        ],
                    wait_other_notice=True)
        self.assert_log_had_msg(node3,
                                'Some data streaming failed',
                                timeout=30)
        self.assert_log_had_msg(
            node3,
            "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled",
            timeout=30)

        try:
            node3.nodetool('join')
            pytest.fail('nodetool should have errored and failed to join ring')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout

        node3.nodetool('bootstrap resume')
        self.assert_log_had_msg(
            node3,
            "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled",
            timeout=30)

        # Should succeed in joining
        node3.nodetool('join')
        self.assert_log_had_msg(
            node3,
            "Leaving write survey mode and joining ring at operator request",
            timeout=30)
        assert_bootstrap_state(self,
                               node3,
                               'COMPLETED',
                               user='cassandra',
                               password='cassandra')
        node3.wait_for_binary_interface(timeout=30)
Code example #21
    def test_bootstrap_binary_disabled(self):
        """
        Test that the binary transport stays disabled while bootstrap is incomplete and streaming fails.

        This test was ported to jvm-dtest as org.apache.cassandra.distributed.test.BootstrapBinaryDisabledTest.
        As of this writing, a few jvm-dtest limitations require this test to stay,
        namely vnode support (CI also tests under different configs). Once
        jvm-dtest supports vnodes, this test can go away in favor of that class.

        @jira_ticket CASSANDRA-14526, CASSANDRA-14525, CASSANDRA-16127
        """
        config = {'authenticator': 'org.apache.cassandra.auth.PasswordAuthenticator',
                  'authorizer': 'org.apache.cassandra.auth.CassandraAuthorizer',
                  'role_manager': 'org.apache.cassandra.auth.CassandraRoleManager',
                  'permissions_validity_in_ms': 0,
                  'roles_validity_in_ms': 0}

        cluster = self.cluster
        cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
        cluster.populate(1)

        node1 = cluster.nodes['node1']
        # set up byteman
        node1.byteman_port = '8100'
        node1.import_config_files()

        cluster.start()
        # kill stream to node2 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
        node1.stress(['write', 'n=1K', 'no-warmup', 'cl=ONE', '-schema', 'replication(factor=3)', '-rate', 'threads=50', '-mode', 'native', 'cql3', 'user=cassandra', 'password=cassandra'])
        cluster.flush()

        # start bootstrapping node2 and wait for streaming
        node2 = new_node(cluster)
        node2.set_configuration_options(values=config)
        node2.byteman_port = '8101' # set for when we add node3
        node2.import_config_files()
        node2.start(jvm_args=["-Dcassandra.ring_delay_ms=5000"])
        self.assert_log_had_msg(node2, 'Some data streaming failed')

        try:
            node2.nodetool('join')
            pytest.fail('nodetool should have errored and failed to join ring')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout

        node2.nodetool('bootstrap resume')
        node2.wait_for_binary_interface()
        assert_bootstrap_state(self, node2, 'COMPLETED', user='cassandra', password='cassandra')

        # Test write survey behaviour
        node3 = new_node(cluster)
        node3.set_configuration_options(values=config)

        # kill stream to node3 in the middle of streaming to let it fail
        if cluster.version() < '4.0':
            node1.byteman_submit([self.byteman_submit_path_pre_4_0])
            node2.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            node1.byteman_submit([self.byteman_submit_path_4_0])
            node2.byteman_submit([self.byteman_submit_path_4_0])
        node3.start(jvm_args=["-Dcassandra.write_survey=true", "-Dcassandra.ring_delay_ms=5000"])
        self.assert_log_had_msg(node3, 'Some data streaming failed')
        self.assert_log_had_msg(node3, "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled")

        try:
            node3.nodetool('join')
            pytest.fail('nodetool should have errored and failed to join ring')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout

        node3.nodetool('bootstrap resume')
        self.assert_log_had_msg(node3, "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled")

        # Should succeed in joining
        node3.nodetool('join')
        self.assert_log_had_msg(node3, "Leaving write survey mode and joining ring at operator request")
        assert_bootstrap_state(self, node3, 'COMPLETED', user='cassandra', password='cassandra')
        node3.wait_for_binary_interface()
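
self.assert_log_had_msg wraps ccm's log watching with a clearer test failure; a minimal sketch of such a method, assuming ccm raises its own TimeoutError from watch_log_for:

    import pytest
    from ccmlib.node import TimeoutError

    def assert_log_had_msg(self, node, msg, timeout=600, **kwargs):
        try:
            node.watch_log_for(msg, timeout=timeout, **kwargs)
        except TimeoutError:
            pytest.fail("Log message was not seen within {}s:\n{}".format(timeout, msg))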