Example 1
    def _produce_consumed_space(self, orig_free: list[float]) -> bool:
        """ Test helper: produce about 10MiB of data and return true if any of
        the nodes saw a reduction in free space.
        """
        num_records = 10240
        record_size = 1024

        # Produce data and confirm metrics update
        ktools = KafkaCliTools(self.redpanda)
        ktools.produce(self.topic, num_records, record_size, acks=-1)

        new_free = self._node_disk_free_bytes()
        return self._count_greater(orig_free, new_free) > 0
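The helper above relies on two methods that are not shown in this snippet, `_node_disk_free_bytes` and `_count_greater`. A minimal sketch of what they might look like on the same test class follows; the df-over-ssh approach is an assumption for illustration, not the repository's actual implementation.

    def _node_disk_free_bytes(self) -> list[float]:
        # Hypothetical sketch: ask each node how much space is left on the
        # filesystem holding the redpanda data directory.
        free = []
        for node in self.redpanda.nodes:
            out = node.account.ssh_output(
                f"df --output=avail -B1 {self.redpanda.DATA_DIR} | tail -1")
            free.append(float(out.decode().strip()))
        return free

    def _count_greater(self, orig: list[float], new: list[float]) -> int:
        # Count the nodes whose free space shrank between the two samples.
        return sum(1 for before, after in zip(orig, new) if before > after)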
Example 2
class TopicDeleteTest(RedpandaTest):
    """
    Verify that topic deletion cleans up storage.
    """
    topics = (TopicSpec(partition_count=3,
                        cleanup_policy=TopicSpec.CLEANUP_COMPACT), )

    def __init__(self, test_context):
        extra_rp_conf = dict(log_segment_size=262144, )

        super(TopicDeleteTest, self).__init__(test_context=test_context,
                                              num_brokers=3,
                                              extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)

    @cluster(num_nodes=3)
    def topic_delete_test(self):
        def produce_until_partitions():
            self.kafka_tools.produce(self.topic, 1024, 1024)
            storage = self.redpanda.storage()
            return len(list(storage.partitions("kafka", self.topic))) == 9

        wait_until(lambda: produce_until_partitions(),
                   timeout_sec=30,
                   backoff_sec=2,
                   err_msg="Expected partition did not materialize")

        self.kafka_tools.delete_topic(self.topic)

        def topic_storage_purged():
            storage = self.redpanda.storage()
            return all(
                map(lambda n: self.topic not in n.ns["kafka"].topics,
                    storage.nodes))

        try:
            wait_until(lambda: topic_storage_purged(),
                       timeout_sec=30,
                       backoff_sec=2,
                       err_msg="Topic storage was not removed")

        except:
            # On errors, dump listing of the storage location
            for node in self.redpanda.nodes:
                self.logger.error(f"Storage listing on {node.name}:")
                for line in node.account.ssh_capture(
                        f"find {self.redpanda.DATA_DIR}"):
                    self.logger.error(line.strip())

            raise
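Both polling loops above rely on ducktape's `wait_until`. Conceptually it just retries a predicate with a fixed backoff until it returns a truthy value or the timeout expires; a simplified stand-in (not ducktape's actual implementation) behaves roughly like this:

import time

def wait_until(condition, timeout_sec, backoff_sec=1, err_msg=""):
    # Simplified stand-in: poll the predicate until it succeeds or the
    # deadline passes, then fail with the supplied message.
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if condition():
            return
        time.sleep(backoff_sec)
    raise TimeoutError(err_msg or "condition not met within timeout")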
Example 3
class TopicAutocreateTest(RedpandaTest):
    """
    Verify that autocreation works, and that the settings of an autocreated
    topic match those for a topic created by hand with rpk.
    """
    def __init__(self, test_context):
        super(TopicAutocreateTest, self).__init__(
            test_context=test_context,
            num_brokers=1,
            extra_rp_conf={'auto_create_topics_enabled': False})

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.rpk = RpkTool(self.redpanda)

    @cluster(num_nodes=1)
    def topic_autocreate_test(self):
        auto_topic = 'autocreated'
        manual_topic = "manuallycreated"

        # With autocreation disabled, producing to a nonexistent topic should not work.
        try:
            # Use rpk rather than kafka CLI because rpk errors out promptly
            self.rpk.produce(auto_topic, "foo", "bar")
        except Exception:
            # The write failed, and shouldn't have created a topic
            assert auto_topic not in self.kafka_tools.list_topics()
        else:
            assert False, "Producing to a nonexistent topic should fail"

        # Enable autocreation
        self.redpanda.restart_nodes(self.redpanda.nodes,
                                    {'auto_create_topics_enabled': True})

        # Auto create topic
        assert auto_topic not in self.kafka_tools.list_topics()
        self.kafka_tools.produce(auto_topic, 1, 4096)
        assert auto_topic in self.kafka_tools.list_topics()
        auto_topic_spec = self.kafka_tools.describe_topic(auto_topic)
        assert auto_topic_spec.retention_ms is None
        assert auto_topic_spec.retention_bytes is None

        # Create topic by hand, compare its properties to the autocreated one
        self.rpk.create_topic(manual_topic)
        manual_topic_spec = self.kafka_tools.describe_topic(manual_topic)
        assert manual_topic_spec.retention_ms == auto_topic_spec.retention_ms
        assert manual_topic_spec.retention_bytes == auto_topic_spec.retention_bytes

        # Clear name and compare the rest of the attributes
        manual_topic_spec.name = auto_topic_spec.name = None
        assert manual_topic_spec == auto_topic_spec
Example 4
    def test_bytes_sent(self):
        num_records = 10240
        records_size = 512

        # Produce some data (10240 records * 512 bytes = 5MB of data)
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_tools.produce(self.topic, num_records, records_size, acks=-1)

        # Capture the current counter; it won't be zero, but we
        # don't really care what the actual value is
        start_bytes = self._bytes_sent()
        self.logger.debug(f"Start bytes: {start_bytes}")

        # Consume and count the bytes; the count is stored
        # in self._bytes_received
        self._consume_and_count_bytes()
        self.logger.debug(f"Bytes received: {self._bytes_received}")

        # Get the number of bytes sent up to this point
        # and calculate the total
        end_bytes = self._bytes_sent()
        total_sent = end_bytes - start_bytes
        self.logger.debug(f"End bytes: {end_bytes}")
        self.logger.debug(f"Total sent: {total_sent}")

        def in_percent_threshold(n1, n2, threshold):
            percent_increase = (abs(n2 - n1) / n2) * 100
            self.logger.debug(
                f"Percent increase: {percent_increase}, Threshold: {threshold}"
            )
            return threshold >= percent_increase

        # Expect total to be larger than bytes_received
        # because bytes_received doesn't account for other
        # responses between the client and brokers
        assert total_sent > self._bytes_received

        # Total bytes sent should be within a 5% threshold of the bytes
        # received; 5% is a reasonable place to start
        assert in_percent_threshold(total_sent,
                                    self._bytes_received,
                                    threshold=5.0)
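The test assumes helpers that are not part of this snippet: `_bytes_sent()` reads a broker-side counter, while `_consume_and_count_bytes()` drains the topic and stores the payload byte count in `self._bytes_received`. A rough sketch of the consuming half is below; the client choice, its settings, and the `brokers()` accessor are assumptions for illustration.

    def _consume_and_count_bytes(self):
        from kafka import KafkaConsumer

        # Hypothetical sketch: read the whole topic with a plain kafka-python
        # consumer and remember how many payload bytes came back.
        consumer = KafkaConsumer(self.topic,
                                 bootstrap_servers=self.redpanda.brokers(),
                                 auto_offset_reset="earliest",
                                 consumer_timeout_ms=10000)
        self._bytes_received = 0
        for record in consumer:
            self._bytes_received += len(record.value)
        consumer.close()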
Example 5
class TopicDeleteTest(RedpandaTest):
    """
    Verify that topic deletion cleans up storage.
    """
    topics = (TopicSpec(partition_count=3,
                        cleanup_policy=TopicSpec.CLEANUP_COMPACT), )

    def __init__(self, test_context):
        extra_rp_conf = dict(log_segment_size=262144, )

        super(TopicDeleteTest, self).__init__(test_context=test_context,
                                              num_brokers=3,
                                              extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)

    @cluster(num_nodes=3)
    def topic_delete_test(self):
        def produce_until_partitions():
            self.kafka_tools.produce(self.topic, 1024, 1024)
            storage = self.redpanda.storage()
            return len(list(storage.partitions("kafka", self.topic))) == 9

        wait_until(lambda: produce_until_partitions(),
                   timeout_sec=30,
                   backoff_sec=2,
                   err_msg="Expected partition did not materialize")

        self.kafka_tools.delete_topic(self.topic)

        def topic_storage_purged():
            storage = self.redpanda.storage()
            return all(
                map(lambda n: self.topic not in n.ns["kafka"].topics,
                    storage.nodes))

        wait_until(lambda: topic_storage_purged(),
                   timeout_sec=30,
                   backoff_sec=2,
                   err_msg="Topic storage was not removed")
Example 6
    def test_partition_metrics(self):
        num_records = 10240
        records_size = 512

        # initially all metrics have to be equal to 0
        assert self._bytes_produced() == 0
        assert self._records_produced() == 0

        assert self._bytes_fetched() == 0
        assert self._records_fetched() == 0

        # Produce some data (10240 records * 512 bytes = 5MB of data)
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_tools.produce(self.topic, num_records, records_size, acks=-1)

        rec_produced = self._records_produced()
        self.redpanda.logger.info(f"records produced: {rec_produced}")
        assert rec_produced == num_records
        bytes_produced = self._bytes_produced()
        self.redpanda.logger.info(f"bytes produced: {bytes_produced}")
        # bytes produced should be bigger than the raw payload size because
        # of batch header overhead
        assert bytes_produced >= num_records * records_size

        # fetch metrics shouldn't change
        assert self._bytes_fetched() == 0
        assert self._records_fetched() == 0

        # read all messages
        rpk = RpkTool(self.redpanda)
        rpk.consume(self.topic, n=num_records)

        rec_fetched = self._records_fetched()
        self.redpanda.logger.info(f"records fetched: {rec_fetched}")

        bytes_fetched = self._bytes_fetched()
        self.redpanda.logger.info(f"bytes fetched: {bytes_fetched}")

        assert bytes_fetched == bytes_produced
        assert rec_fetched == rec_produced
Example 7
    def _run(self):
        producer = KafkaCliTools(self._redpanda)
        producer.produce(self._topic, self._num_records, self._records_size)
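Example 7 shows only the worker body of a background producer. One way such a worker could be driven off the main test thread, sketched with the standard library rather than the repository's own service classes (the wrapper below is hypothetical; KafkaCliTools is the same helper used throughout these tests):

import threading

class BackgroundProducer:
    """Illustrative wrapper: run the CLI produce loop on its own thread."""
    def __init__(self, redpanda, topic, num_records, records_size):
        self._redpanda = redpanda
        self._topic = topic
        self._num_records = num_records
        self._records_size = records_size
        self._thread = threading.Thread(target=self._run, daemon=True)

    def _run(self):
        producer = KafkaCliTools(self._redpanda)
        producer.produce(self._topic, self._num_records, self._records_size)

    def start(self):
        self._thread.start()

    def wait(self, timeout_sec=120):
        self._thread.join(timeout=timeout_sec)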
Example 8
class ArchivalTest(RedpandaTest):
    log_segment_size = 1048576  # 1MB
    log_compaction_interval_ms = 10000

    s3_host_name = "minio-s3"
    s3_access_key = "panda-user"
    s3_secret_key = "panda-secret"
    s3_region = "panda-region"
    s3_topic_name = "panda-topic"
    topics = (TopicSpec(name='panda-topic',
                        partition_count=1,
                        replication_factor=3), )

    def __init__(self, test_context):
        self.s3_bucket_name = f"panda-bucket-{uuid.uuid1()}"
        self._extra_rp_conf = dict(
            cloud_storage_enabled=True,
            cloud_storage_access_key=ArchivalTest.s3_access_key,
            cloud_storage_secret_key=ArchivalTest.s3_secret_key,
            cloud_storage_region=ArchivalTest.s3_region,
            cloud_storage_bucket=self.s3_bucket_name,
            cloud_storage_disable_tls=True,
            cloud_storage_api_endpoint=ArchivalTest.s3_host_name,
            cloud_storage_api_endpoint_port=9000,
            cloud_storage_reconciliation_interval_ms=500,
            cloud_storage_max_connections=5,
            log_compaction_interval_ms=self.log_compaction_interval_ms,
            log_segment_size=self.log_segment_size,
        )
        if test_context.function_name == "test_timeboxed_uploads":
            self._extra_rp_conf.update(
                log_segment_size=1024 * 1024 * 1024,
                cloud_storage_segment_max_upload_interval_sec=1)

        super(ArchivalTest, self).__init__(test_context=test_context,
                                           extra_rp_conf=self._extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.rpk = RpkTool(self.redpanda)
        self.s3_client = S3Client(
            region='panda-region',
            access_key=u"panda-user",
            secret_key=u"panda-secret",
            endpoint=f'http://{ArchivalTest.s3_host_name}:9000',
            logger=self.logger)

    def setUp(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)
        self.s3_client.create_bucket(self.s3_bucket_name)
        # Deletes in S3 are eventually consistent so we might still
        # see previously removed objects for a while.
        validate(self._check_bucket_is_empty, self.logger, 300)
        super().setUp()  # topic is created here
        # enable archival for topic
        for topic in self.topics:
            self.rpk.alter_topic_config(topic.name, 'redpanda.remote.write',
                                        'true')

    def tearDown(self):
        self.s3_client.empty_bucket(self.s3_bucket_name)
        super().tearDown()

    @cluster(num_nodes=3)
    def test_write(self):
        """Simpe smoke test, write data to redpanda and check if the
        data hit the S3 storage bucket"""
        self.kafka_tools.produce(self.topic, 10000, 1024)
        validate(self._quick_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_isolate(self):
        """Verify that our isolate/rejoin facilities actually work"""
        with firewall_blocked(self.redpanda.nodes, self._get_s3_endpoint_ip()):
            self.kafka_tools.produce(self.topic, 10000, 1024)
            time.sleep(10)  # can't busy wait here

            # The topic manifest can be present in the bucket because the topic
            # is created before the firewall is blocked. No segments or partition
            # manifest should be present.
            topic_manifest_id = "d0000000/meta/kafka/panda-topic/topic_manifest.json"
            objects = self.s3_client.list_objects(self.s3_bucket_name)
            keys = [x.Key for x in objects]

            assert len(keys) < 2, \
                f"Bucket should be empty or contain only {topic_manifest_id}, but contains {keys}"

            if len(keys) == 1:
                assert topic_manifest_id == keys[0], \
                    f"Bucket should be empty or contain only {topic_manifest_id}, but contains {keys[0]}"

    @cluster(num_nodes=3)
    def test_reconnect(self):
        """Disconnect redpanda from S3, write data, connect redpanda to S3
        and check that the data is uploaded"""
        with firewall_blocked(self.redpanda.nodes, self._get_s3_endpoint_ip()):
            self.kafka_tools.produce(self.topic, 10000, 1024)
            time.sleep(10)  # sleep is needed to make sure that the
            # reconciliation loop has kicked in and started uploading
            # data; otherwise we could rejoin before archival storage
            # even tries to upload new segments
        validate(self._quick_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_one_node_reconnect(self):
        """Disconnect one redpanda node from S3, write data, connect redpanda to S3
        and check that the data is uploaded"""
        self.kafka_tools.produce(self.topic, 1000, 1024)
        leaders = list(self._get_partition_leaders().values())
        with firewall_blocked(leaders[0:1], self._get_s3_endpoint_ip()):
            self.kafka_tools.produce(self.topic, 9000, 1024)
            time.sleep(10)  # sleep is needed to make sure that the
            # reconciliation loop has kicked in and started uploading
            # data, otherwise we could rejoin before archival storage
            # even tries to upload new segments
        validate(self._quick_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_connection_drop(self):
        """Disconnect redpanda from S3 during the active upload, restore connection
        and check that everything is uploaded"""
        self.kafka_tools.produce(self.topic, 10000, 1024)
        with firewall_blocked(self.redpanda.nodes, self._get_s3_endpoint_ip()):
            time.sleep(10)  # sleep is needed to make sure that the
            # reconciliation loop has kicked in and started uploading
            # data, otherwise we could rejoin before archival storage
            # even tries to upload new segments
        validate(self._quick_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_connection_flicker(self):
        """Disconnect redpanda from S3 during the active upload for short period of time
        during upload and check that everything is uploaded"""
        con_enabled = True
        for _ in range(0, 20):
            # upload data in batches
            if con_enabled:
                with firewall_blocked(self.redpanda.nodes,
                                      self._get_s3_endpoint_ip()):
                    self.kafka_tools.produce(self.topic, 500, 1024)
            else:
                self.kafka_tools.produce(self.topic, 500, 1024)
            con_enabled = not con_enabled
            time.sleep(1)
        time.sleep(10)
        validate(self._quick_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_single_partition_leadership_transfer(self):
        """Start uploading data, restart leader node of the partition 0 to trigger the
        leadership transfer, continue upload, verify S3 bucket content"""
        self.kafka_tools.produce(self.topic, 5000, 1024)
        time.sleep(5)
        leaders = self._get_partition_leaders()
        node = leaders[0]
        self.redpanda.stop_node(node)
        time.sleep(1)
        self.redpanda.start_node(node)
        time.sleep(5)
        self.kafka_tools.produce(self.topic, 5000, 1024)
        validate(self._cross_node_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_all_partitions_leadership_transfer(self):
        """Start uploading data, restart leader nodes of all partitions to trigger the
        leadership transfer, continue upload, verify S3 bucket content"""
        self.kafka_tools.produce(self.topic, 5000, 1024)
        time.sleep(5)
        leaders = self._get_partition_leaders()
        for partition_id, node in leaders.items():
            self.logger.debug(f"going to restart leader of partition {partition_id}")
            self.redpanda.stop_node(node)
            time.sleep(1)
            self.redpanda.start_node(node)
        time.sleep(5)
        self.kafka_tools.produce(self.topic, 5000, 1024)
        validate(self._cross_node_verify, self.logger, 90)

    @cluster(num_nodes=3)
    def test_timeboxed_uploads(self):
        """This test checks segment upload time limit. The feature is enabled in the
        configuration. The configuration defines maximum time interval between uploads.
        If the option is set then redpanda will start uploading a segment partially if
        configured amount of time passed since previous upload and the segment has some
        new data.
        The test sets the timeout value to 1s. Then it uploads data in batches with delays
        between the batches. The segment size is set to 1GiB. We upload 10MiB total. So
        normally, there won't be any data uploaded to Minio. But since the time limit for
        a segment is set to 1s we will see a bunch of segments in the bucket. The offsets
        of the segments won't align with the segment in the redpanda data directory. But
        their respective offset ranges should align and the sizes should make sense.
        """

        # The offsets of the segments in the Minio bucket won't necessarily
        # correlate with the write bursts here. The upload depends on the
        # timeout but also on raft and current high_watermark. So we can
        # expect that the bucket won't have 9 segments with 1000 offsets.
        # The actual segments will be larger.
        for i in range(0, 10):
            self.kafka_tools.produce(self.topic, 1000, 1024)
            time.sleep(1)
        time.sleep(5)

        def check_upload():
            # check that the upload happened
            ntps = set()
            sizes = {}

            for node in self.redpanda.nodes:
                checksums = self._get_redpanda_log_segment_checksums(node)
                self.logger.info(
                    f"Node: {node.account.hostname} checksums: {checksums}")
                lst = [
                    _parse_normalized_segment_path(path, md5, size)
                    for path, (md5, size) in checksums.items()
                ]
                lst = sorted(lst, key=lambda x: x.base_offset)
                segments = defaultdict(int)
                sz = defaultdict(int)
                for it in lst:
                    ntps.add(it.ntp)
                    sz[it.ntp] += it.size
                    segments[it.ntp] += 1
                for ntp, s in segments.items():
                    assert s != 0, f"expected to have at least one segment per partition, got {s}"
                for ntp, s in sz.items():
                    if ntp not in sizes:
                        sizes[ntp] = s

            # Download manifest for partitions
            for ntp in ntps:
                manifest = self._download_partition_manifest(ntp)
                self.logger.info(f"downloaded manifest {manifest}")
                segments = []
                for _, segment in manifest['segments'].items():
                    segments.append(segment)

                segments = sorted(segments, key=lambda s: s['base_offset'])
                self.logger.info(f"sorted segments {segments}")

                prev_committed_offset = -1
                size = 0
                for segment in segments:
                    self.logger.info(
                        f"checking {segment} prev: {prev_committed_offset}")
                    base_offset = segment['base_offset']
                    assert prev_committed_offset + 1 == base_offset, (
                        f"inconsistent segments, expected base_offset: "
                        f"{prev_committed_offset + 1}, actual: {base_offset}")
                    prev_committed_offset = segment['committed_offset']
                    size += segment['size_bytes']
                assert sizes[ntp] >= size
                assert size > 0

        validate(check_upload, self.logger, 90)

    @cluster(num_nodes=3)
    def test_retention_archival_coordination(self):
        """
        Test that only archived segments can be evicted and that eviction
        restarts once the segments have been archived.
        """
        self.kafka_tools.alter_topic_config(
            self.topic,
            {
                TopicSpec.PROPERTY_RETENTION_BYTES: 5 * self.log_segment_size,
            },
        )

        with firewall_blocked(self.redpanda.nodes, self._get_s3_endpoint_ip()):
            produce_until_segments(redpanda=self.redpanda,
                                   topic=self.topic,
                                   partition_idx=0,
                                   count=10)

            # Sleep some time sufficient for log eviction under normal conditions
            # and check that no segment has been evicted (because we can't upload
            # segments to the cloud storage).
            time.sleep(3 * self.log_compaction_interval_ms / 1000.0)
            counts = list(
                segments_count(self.redpanda, self.topic, partition_idx=0))
            self.logger.info(f"node segment counts: {counts}")
            assert len(counts) == len(self.redpanda.nodes)
            assert all(c >= 10 for c in counts)

        # Check that eviction restarts after we restored the connection to cloud
        # storage.
        wait_for_segments_removal(redpanda=self.redpanda,
                                  topic=self.topic,
                                  partition_idx=0,
                                  count=6)

    def _check_bucket_is_empty(self):
        allobj = self._list_objects()
        for obj in allobj:
            self.logger.debug(
                f"found object {obj} in bucket {self.s3_bucket_name}")
        assert len(allobj) == 0

    def _get_partition_leaders(self):
        kcat = KafkaCat(self.redpanda)
        m = kcat.metadata()
        self.logger.info(f"kcat.metadata() == {m}")
        brokers = {}
        for b in m['brokers']:
            id = b['id']
            ip = b['name']
            ip = ip[:ip.index(':')]
            for n in self.redpanda.nodes:
                n_ip = n.account.hostname
                self.logger.debug(f"matching {n_ip} over {ip}")
                if n_ip == ip:
                    brokers[id] = n
                    break
        self.logger.debug(f"found brokers {brokers}")
        assert len(brokers) == 3
        leaders = {}
        for topic in m['topics']:
            if topic['topic'] == ArchivalTest.s3_topic_name:
                for part in topic['partitions']:
                    leader_id = part['leader']
                    partition_id = part['partition']
                    leader = brokers[leader_id]
                    leaders[partition_id] = leader
        return leaders

    def _download_partition_manifest(self, ntp):
        """Find and download individual partition manifest"""
        expected = f"{ntp.ns}/{ntp.topic}/{ntp.partition}_{ntp.revision}/manifest.json"
        id = None
        objects = []
        for loc in self._list_objects():
            objects.append(loc)
            if expected in loc:
                id = loc
                break
        if id is None:
            objlist = "\n".join(objects)
            self.logger.debug(
                f"expected path {expected} is not found in the bucket, bucket content: \n{objlist}"
            )
            assert id is not None
        manifest = self.s3_client.get_object_data(self.s3_bucket_name, id)
        self.logger.info(f"manifest found: {manifest}")
        return json.loads(manifest)

    def _verify_manifest(self, ntp, manifest, remote):
        """Check that all segments that present in manifest are available
        in remote storage"""
        for sname, _ in manifest['segments'].items():
            spath = f"{ntp.ns}/{ntp.topic}/{ntp.partition}_{ntp.revision}/{sname}"
            self.logger.info(f"validating manifest path {spath}")
            assert spath in remote
        ranges = [(int(m['base_offset']), int(m['committed_offset']))
                  for _, m in manifest['segments'].items()]
        ranges = sorted(ranges, key=lambda x: x[0])
        last_offset = -1
        num_gaps = 0
        for base, committed in ranges:
            if last_offset + 1 != base:
                self.logger.debug(
                    f"gap between {last_offset} and {base} detected")
                num_gaps += 1
            last_offset = committed
        assert num_gaps == 0

    def _cross_node_verify(self):
        """Verify data on all nodes taking into account possible alignment issues
        caused by leadership transitions.
        The verification algorithm is as follows:
        - Download and verify partition manifest;
        - Partition manifest has all segments and metadata like committed offset
          and base offset. We can also retrieve MD5 hash of every segment;
        - Load segment metadata for every redpanda node.
        - Scan every node's metadata and match segments with manifest, on success
          remove matched segment from the partition manifest.
        Goal #1 is to remove all segments from the manifest. Goal #2 is to
        find the last segment that's supposed to be uploaded from the leader node;
        its base offset should be equal to the manifest's last offset + 1.
        The segments match if:
        - The base offset and md5 hashes are the same;
        - The committed offset of both segments are the same, md5 hashes are different,
          and base offset of the segment from manifest is larger than base offset of the
          segment from redpanda node. In this case we should also compare the data
          directly by scanning both segments.
        """
        nodes = {}
        ntps = set()

        for node in self.redpanda.nodes:
            checksums = self._get_redpanda_log_segment_checksums(node)
            self.logger.info(
                f"Node: {node.account.hostname} checksums: {checksums}")
            lst = [
                _parse_normalized_segment_path(path, md5, size)
                for path, (md5, size) in checksums.items()
            ]
            lst = sorted(lst, key=lambda x: x.base_offset)
            nodes[node.account.hostname] = lst
            for it in lst:
                ntps.add(it.ntp)

        # Download metadata from S3
        remote = self._get_redpanda_s3_checksums()

        # Download manifest for partitions
        manifests = {}
        for ntp in ntps:
            manifest = self._download_partition_manifest(ntp)
            manifests[ntp] = manifest
            self._verify_manifest(ntp, manifest, remote)

        for ntp in ntps:
            self.logger.debug(f"verifying {ntp}")
            manifest = manifests[ntp]
            segments = manifest['segments']
            manifest_segments = [
                _parse_manifest_segment(manifest, sname, meta, remote,
                                        self.logger)
                for sname, meta in segments.items()
            ]
            manifest_segments = sorted(manifest_segments,
                                       key=lambda x: x.base_offset)

            for node_key, node_segments in nodes.items():
                self.logger.debug(f"checking {ntp} on {node_key}")
                for mix, msegm in enumerate(manifest_segments):
                    if msegm is not None:
                        segments = sorted([
                            segment
                            for segment in node_segments if segment.ntp == ntp
                        ],
                                          key=lambda x: x.base_offset)
                        self.logger.debug(
                            f"checking manifest segment {msegm} over {node_key} segments {segments}"
                        )
                        found = False
                        for ix in range(0, len(segments)):
                            nsegm = segments[ix]
                            if nsegm.ntp != ntp:
                                continue
                            nsegm_co = -1 if (ix + 1) == len(segments) else (
                                segments[ix + 1].base_offset - 1)
                            self.logger.debug(
                                f"comparing {msegm.base_offset}:{msegm.committed_offset}:{msegm.md5} to {nsegm.base_offset}:{nsegm_co}:{nsegm.md5}"
                            )
                            if msegm.base_offset == nsegm.base_offset and msegm.md5 == nsegm.md5:
                                # Success
                                self.logger.info(
                                    f"found match for segment {msegm.ntp} {msegm.base_offset} on {node_key}"
                                )
                                manifest_segments[mix] = None
                                found = True
                                break
                            if msegm.committed_offset == nsegm_co and msegm.base_offset > nsegm.base_offset:
                                # Found segment with truncated head (due to leadership transition)
                                actual_hash = self._get_partial_checksum(
                                    node_key, nsegm.normalized_path,
                                    msegm.size)
                                self.logger.info(
                                    f"partial hash {actual_hash} retreived, s3 hash {msegm.md5}"
                                )
                                if actual_hash == msegm.md5:
                                    manifest_segments[mix] = None
                                    self.logger.info(
                                        f"partial match for segment {msegm.ntp} {msegm.base_offset}-"
                                        +
                                        f"{msegm.committed_offset} on {node_key}"
                                    )
                                    found = True
                                    break
                        if not found:
                            self.logger.debug(
                                f"failed to match {msegm.base_offset}:{msegm.committed_offset}"
                            )
                        else:
                            self.logger.debug(
                                f"matched {msegm.base_offset}:{msegm.committed_offset} successfully"
                            )

            # All segments should be matched and set to None
            if any(manifest_segments):
                self.logger.debug(
                    f"manifest segments that fail to validate: {manifest_segments}"
                )
            assert not any(manifest_segments)
            # Verify goal #2, the last segment on a leader node is manifest.last_offset + 1
            ntp_offsets = []
            for node_key, node_segments in nodes.items():
                offsets = [
                    segm.base_offset for segm in node_segments
                    if segm.ntp == ntp
                ]
                if offsets:
                    max_offset = max([
                        segm.base_offset for segm in node_segments
                        if segm.ntp == ntp
                    ])
                    ntp_offsets.append(max_offset)
                    self.logger.debug(
                        f"NTP {ntp} has the largest offset {max_offset} on node {node_key}"
                    )
                else:
                    self.logger.debug(
                        f"NTP {ntp} has no offsets on node {node_key}")

            last_offset = int(manifest['last_offset'])
            self.logger.debug(
                f"last offset: {last_offset}, ntp offsets: {ntp_offsets}")
            assert (last_offset + 1) in ntp_offsets

    def _list_objects(self):
        """Emulate ListObjects call by fetching the topic manifests and
        iterating through its content"""
        try:
            topic_manifest_id = "d0000000/meta/kafka/panda-topic/topic_manifest.json"
            partition_manifest_id = "d0000000/meta/kafka/panda-topic/0_9/manifest.json"
            manifest = self.s3_client.get_object_data(self.s3_bucket_name,
                                                      partition_manifest_id)
            results = [topic_manifest_id, partition_manifest_id]
            for id in manifest['segments'].keys():
                results.append(id)
            self.logger.debug(f"ListObjects(source: manifest): {results}")
        except Exception:
            results = [
                loc.Key
                for loc in self.s3_client.list_objects(self.s3_bucket_name)
            ]
            self.logger.debug(f"ListObjects: {results}")
        return results

    def _quick_verify(self):
        """Verification algorithm that works only if no leadership
        transfer happened during the run. It works by looking up all
        segments from the remote storage in the local redpanda storage
        of each node, using md5 hashes.
        """
        local = {}
        for node in self.redpanda.nodes:
            checksums = self._get_redpanda_log_segment_checksums(node)
            self.logger.info(
                f"Node: {node.account.hostname} checksums: {checksums}")
            for k, v in checksums.items():
                local.setdefault(k, set()).add(v)
        remote = self._get_redpanda_s3_checksums()
        self.logger.info(f"S3 checksums: {remote}")
        self.logger.info(f"Local checksums: {local}")
        assert len(local) != 0
        assert len(remote) != 0
        md5fails = 0
        lookup_fails = 0
        for path, csum in remote.items():
            self.logger.info(f"checking remote path: {path} csum: {csum}")
            if path not in local:
                self.logger.debug(
                    f"remote path {path} can't be found in any of the local storages"
                )
                lookup_fails += 1
            else:
                if len(local[path]) != 1:
                    self.logger.info(
                        f"remote segment {path} have more than one variant {local[path]}"
                    )
                if csum not in local[path]:
                    self.logger.debug(
                        f"remote md5 {csum} doesn't match any local {local[path]}"
                    )
                    md5fails += 1
        if md5fails != 0:
            self.logger.debug(
                f"Validation failed, {md5fails} remote segments doesn't match")
        if lookup_fails != 0:
            self.logger.debug(
                f"Validation failed, remote {lookup_fails} remote locations doesn't match local"
            )
        assert md5fails == 0 and lookup_fails == 0

        # Validate partitions
        # for every partition the segment with largest base offset shouldn't be
        # available in remote storage
        local_partitions = {}
        remote_partitions = {}
        for path, items in local.items():
            meta = _parse_normalized_segment_path(path, '', 0)
            local_partitions.setdefault(meta.ntp, []).append((meta, items))
        for path, items in remote.items():
            meta = _parse_normalized_segment_path(path, '', 0)
            remote_partitions.setdefault(meta.ntp, []).append((meta, items))
        self.logger.info(
            f"generated local partitions {local_partitions.keys()}")
        self.logger.info(
            f"generated remote partitions {remote_partitions.keys()}")

        # Download manifest for partitions
        manifests = {}
        for ntp in local_partitions.keys():
            manifest = self._download_partition_manifest(ntp)
            manifests[ntp] = manifest
            self._verify_manifest(ntp, manifest, remote)

        # Check that all local partitions are archived
        assert len(local_partitions) == 1
        assert len(remote_partitions) == 1
        missing_partitions = 0
        for key in local_partitions.keys():
            if key not in remote_partitions:
                self.logger.debug(f"partition {key} not found in remote set")
                missing_partitions += 1
        assert missing_partitions == 0

    def _get_redpanda_log_segment_checksums(self, node):
        """Get MD5 checksums of log segments that match the topic. The paths are
        normalized (<namespace>/<topic>/<partition>_<rev>/...)."""
        checksums = self.redpanda.data_checksum(node)

        # Filter out all unwanted paths
        def included(path):
            controller_log_prefix = os.path.join(RedpandaService.DATA_DIR,
                                                 "redpanda")
            log_segment_extension = ".log"
            return not path.startswith(
                controller_log_prefix) and path.endswith(log_segment_extension)

        # Remove data dir from path
        def normalize_path(path):
            return os.path.relpath(path, RedpandaService.DATA_DIR)

        return {
            normalize_path(path): value
            for path, value in checksums.items() if included(path)
        }

    def _get_redpanda_s3_checksums(self):
        """Get MD5 checksums of log segments stored in S3 (minio). The paths are
        normalized (<namespace>/<topic>/<partition>_<rev>/...)."""
        def normalize(path):
            return path[9:]  # 8-character hash + /

        def included(path):
            manifest_extension = ".json"
            return not path.endswith(manifest_extension)

        return {
            normalize(it.Key): (it.ETag, it.ContentLength)
            for it in self.s3_client.list_objects(self.s3_bucket_name)
            if included(it.Key)
        }

    def _get_partial_checksum(self, hostname, normalized_path, tail_bytes):
        """Compute md5 checksum of the last 'tail_bytes' of the file located
        on a node."""
        node = None
        for n in self.redpanda.nodes:
            if n.account.hostname == hostname:
                node = n
        full_path = os.path.join(RedpandaService.DATA_DIR, normalized_path)
        cmd = f"tail -c {tail_bytes} {full_path} | md5sum"
        line = node.account.ssh_output(cmd)
        tokens = line.split()
        return tokens[0].decode()

    def _isolate(self, nodes, ips):
        """Isolate certain ips from the nodes using firewall rules"""
        cmd = []
        for ip in ips:
            cmd.append(f"iptables -A INPUT -s {ip} -j DROP")
            cmd.append(f"iptables -A OUTPUT -d {ip} -j DROP")
        cmd = " && ".join(cmd)
        for node in nodes:
            node.account.ssh_output(cmd, allow_fail=False)

    def _rejoin(self, nodes, ips):
        """Remove firewall rules that isolate ips from the nodes"""
        cmd = []
        for ip in ips:
            cmd.append(f"iptables -D INPUT -s {ip} -j DROP")
            cmd.append(f"iptables -D OUTPUT -d {ip} -j DROP")
        cmd = " && ".join(cmd)
        for node in nodes:
            node.account.ssh_output(cmd, allow_fail=False)

    def _host_name_to_ip_address(self, hostname):
        ip_host = self.redpanda.nodes[0].account.ssh_output(
            f'getent hosts {hostname}')
        return ip_host.split()[0].decode()

    def _get_s3_endpoint_ip(self):
        return self._host_name_to_ip_address(ArchivalTest.s3_host_name)

    def _get_rp_cluster_ips(self, nhosts=4):
        lst = []
        for ix in range(1, nhosts + 1):
            h = f"rp_n{ix}_1"
            lst.append(self._host_name_to_ip_address(h))
        return lst
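The archival tests above rely on two utilities that are not reproduced in this listing: `validate(fn, logger, timeout)` and the `firewall_blocked` context manager. Hedged sketches of how such helpers could be built are below; the real ones live in the shared test utilities and may differ. `validate` retries a verification callback until it stops raising, and `firewall_blocked` installs and removes iptables rules much like the `_isolate`/`_rejoin` methods above.

import time
from contextlib import contextmanager

def validate(fn, logger, timeout_sec, backoff_sec=5):
    # Sketch: retry a verification callback until it stops raising
    # AssertionError or the timeout expires.
    deadline = time.time() + timeout_sec
    while True:
        try:
            fn()
            return
        except AssertionError as e:
            if time.time() > deadline:
                raise
            logger.debug(f"validation attempt failed, retrying: {e}")
            time.sleep(backoff_sec)

@contextmanager
def firewall_blocked(nodes, blocked_ip):
    # Sketch: drop traffic to/from the given IP for the duration of the
    # block, mirroring the _isolate/_rejoin helpers defined above.
    block = (f"iptables -A INPUT -s {blocked_ip} -j DROP && "
             f"iptables -A OUTPUT -d {blocked_ip} -j DROP")
    unblock = (f"iptables -D INPUT -s {blocked_ip} -j DROP && "
               f"iptables -D OUTPUT -d {blocked_ip} -j DROP")
    for node in nodes:
        node.account.ssh_output(block, allow_fail=False)
    try:
        yield
    finally:
        for node in nodes:
            node.account.ssh_output(unblock, allow_fail=False)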
class ArchivalTest(RedpandaTest):
    topics = tuple([
        TopicSpec(name=f'panda-topic-{ix}',
                  partition_count=10,
                  replication_factor=3) for ix in range(0, 10)
    ])

    GLOBAL_S3_BUCKET = "s3_bucket"
    GLOBAL_S3_REGION = "s3_region"
    GLOBAL_S3_ACCESS_KEY = "s3_access_key"
    GLOBAL_S3_SECRET_KEY = "s3_secret_key"

    MINIO_HOST_NAME = "minio-s3"
    MINIO_BUCKET_NAME = "panda-bucket"
    MINIO_ACCESS_KEY = "panda-user"
    MINIO_SECRET_KEY = "panda-secret"
    MINIO_REGION = "panda-region"
    MINIO_TOPIC_NAME = "panda-topic"

    def __init__(self, test_context):
        self.s3_bucket = test_context.globals.get(self.GLOBAL_S3_BUCKET, None)
        self.s3_region = test_context.globals.get(self.GLOBAL_S3_REGION, None)
        self.s3_access_key = test_context.globals.get(
            self.GLOBAL_S3_ACCESS_KEY, None)
        self.s3_secret_key = test_context.globals.get(
            self.GLOBAL_S3_SECRET_KEY, None)
        self.s3_endpoint = None
        self.real_thing = self.s3_bucket and self.s3_region and self.s3_access_key and self.s3_secret_key
        if self.real_thing:
            extra_rp_conf = dict(
                developer_mode=True,
                disable_metrics=False,
                cloud_storage_enabled=True,
                cloud_storage_access_key=self.s3_access_key,
                cloud_storage_secret_key=self.s3_secret_key,
                cloud_storage_region=self.s3_region,
                cloud_storage_bucket=self.s3_bucket,
                cloud_storage_reconciliation_interval_ms=10000,
                cloud_storage_max_connections=10,
                cloud_storage_trust_file="/etc/ssl/certs/ca-certificates.crt",
                log_segment_size=32 * 1048576  # 32MB
            )
        else:
            bucket_name = f"{ArchivalTest.MINIO_BUCKET_NAME}-{uuid.uuid1()}"
            self.s3_bucket = bucket_name
            self.s3_region = ArchivalTest.MINIO_REGION
            self.s3_access_key = ArchivalTest.MINIO_ACCESS_KEY
            self.s3_secret_key = ArchivalTest.MINIO_SECRET_KEY
            extra_rp_conf = dict(
                developer_mode=True,
                disable_metrics=False,
                cloud_storage_enabled=True,
                cloud_storage_access_key=ArchivalTest.MINIO_ACCESS_KEY,
                cloud_storage_secret_key=ArchivalTest.MINIO_SECRET_KEY,
                cloud_storage_region=ArchivalTest.MINIO_REGION,
                cloud_storage_bucket=bucket_name,
                cloud_storage_disable_tls=True,
                cloud_storage_api_endpoint=ArchivalTest.MINIO_HOST_NAME,
                cloud_storage_api_endpoint_port=9000,
                cloud_storage_reconciliation_interval_ms=10000,
                cloud_storage_max_connections=5,
                log_segment_size=32 * 1048576  # 32MB
            )
            self.s3_endpoint = f'http://{ArchivalTest.MINIO_HOST_NAME}:9000'

        super(ArchivalTest, self).__init__(test_context=test_context,
                                           extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.s3_client = S3Client(region=self.s3_region,
                                  access_key=self.s3_access_key,
                                  secret_key=self.s3_secret_key,
                                  endpoint=self.s3_endpoint,
                                  logger=self.logger)

    def setUp(self):
        if not self.real_thing:
            self.s3_client.empty_bucket(self.s3_bucket)
            self.s3_client.create_bucket(self.s3_bucket)
        super().setUp()

    def tearDown(self):
        if not self.real_thing:
            self.s3_client.empty_bucket(self.s3_bucket)
        super().tearDown()

    @cluster(num_nodes=3)
    def test_write(self):
        """Simpe smoke test, write data to redpanda and check if the
        data hit the S3 storage bucket"""
        for topic in ArchivalTest.topics:
            time.sleep(5)
            self.kafka_tools.produce(topic.name, 10000, 64 * 1024)
        time.sleep(30)
        self._verify()

    def verify_remote(self):
        results = [
            loc.Key for loc in self.s3_client.list_objects(self.s3_bucket)
        ]
        self.logger.debug(f"ListObjects: {results}")

    def _get_redpanda_log_segment_checksums(self, node):
        """Get MD5 checksums of log segments that match the topic. The paths are
        normalized (<namespace>/<topic>/<partition>_<rev>/...)."""
        checksums = self.redpanda.data_checksum(node)

        # Filter out all unwanted paths
        def included(path):
            controller_log_prefix = os.path.join(RedpandaService.DATA_DIR,
                                                 "redpanda")
            log_segment_extension = ".log"
            return not path.startswith(
                controller_log_prefix) and path.endswith(log_segment_extension)

        # Remove data dir from path
        def normalize_path(path):
            return os.path.relpath(path, RedpandaService.DATA_DIR)

        return {
            normalize_path(path): value
            for path, value in checksums.items() if included(path)
        }

    def _get_redpanda_s3_checksums(self):
        """Get MD5 checksums of log segments stored in S3 (minio). The paths are
        normalized (<namespace>/<topic>/<partition>_<rev>/...)."""
        def normalize(path):
            return path[9:]  # 8-character hash + /

        def included(path):
            manifest_extension = ".json"
            return not path.endswith(manifest_extension)

        return {
            normalize(it.Key): (it.ETag, it.ContentLength)
            for it in self.s3_client.list_objects(self.s3_bucket)
            if included(it.Key)
        }

    def _download_partition_manifest(self, ntp):
        """Find and download individual partition manifest"""
        expected = f"{ntp.ns}/{ntp.topic}/{ntp.partition}_{ntp.revision}/manifest.json"
        id = None
        objects = []
        for loc in self._list_objects():
            objects.append(loc)
            if expected in loc:
                id = loc
                break
        if id is None:
            objlist = "\n".join(objects)
            self.logger.debug(
                f"expected path {expected} is not found in the bucket, bucket content: \n{objlist}"
            )
            assert id is not None
        manifest = self.s3_client.get_object_data(self.s3_bucket, id)
        self.logger.info(f"manifest found: {manifest}")
        return json.loads(manifest)

    def _verify_manifest(self, ntp, manifest, remote):
        """Check that all segments that present in manifest are available
        in remote storage"""
        for sname, _ in manifest['segments'].items():
            spath = f"{ntp.ns}/{ntp.topic}/{ntp.partition}_{ntp.revision}/{sname}"
            self.logger.info(f"validating manifest path {spath}")
            assert spath in remote
        ranges = [(int(m['base_offset']), int(m['committed_offset']))
                  for _, m in manifest['segments'].items()]
        ranges = sorted(ranges, key=lambda x: x[0])
        last_offset = -1
        num_gaps = 0
        for base, committed in ranges:
            if last_offset + 1 != base:
                self.logger.debug(
                    f"gap between {last_offset} and {base} detected")
                num_gaps += 1
            last_offset = committed
        assert num_gaps == 0

    def _get_partial_checksum(self, hostname, normalized_path, tail_bytes):
        """Compute md5 checksum of the last 'tail_bytes' of the file located
        on a node."""
        node = None
        for n in self.redpanda.nodes:
            if n.account.hostname == hostname:
                node = n
        full_path = os.path.join(RedpandaService.DATA_DIR, normalized_path)
        cmd = f"tail -c {tail_bytes} {full_path} | md5sum"
        line = node.account.ssh_output(cmd)
        tokens = line.split()
        return tokens[0].decode()

    def _verify(self):
        """Verify data on all nodes taking into account possible alignment issues
        caused by leadership transitions.
        The verification algorithm is as follows:
        - Download and verify partition manifest;
        - Partition manifest has all segments and metadata like committed offset 
          and base offset. We can also retrieve MD5 hash of every segment;
        - Load segment metadata for every redpanda node.
        - Scan every node's metadata and match segments with manifest, on success
          remove matched segment from the partition manifest.
        Goal #1 is to remove all segments from the manifest. Goal #2 is to
        find the last segment that's supposed to be uploaded from the leader node;
        its base offset should be equal to the manifest's last offset + 1.
        The segments match if:
        - The base offset and md5 hashes are the same;
        - The committed offset of both segments are the same, md5 hashes are different,
          and base offset of the segment from manifest is larger than base offset of the
          segment from redpanda node. In this case we should also compare the data 
          directly by scanning both segments.
        """
        nodes = {}
        ntps = set()
        for node in self.redpanda.nodes:
            checksums = self._get_redpanda_log_segment_checksums(node)
            self.logger.info(
                f"Node: {node.account.hostname} checksums: {checksums}")
            lst = [
                _parse_normalized_segment_path(path, md5, size)
                for path, (md5, size) in checksums.items()
            ]
            lst = sorted(lst, key=lambda x: x.base_offset)
            nodes[node.account.hostname] = lst
            for it in lst:
                ntps.add(it.ntp)

        # Download metadata from S3
        remote = self._get_redpanda_s3_checksums()

        # Download manifest for partitions
        manifests = {}
        for ntp in ntps:
            manifest = self._download_partition_manifest(ntp)
            manifests[ntp] = manifest
            self._verify_manifest(ntp, manifest, remote)

        for ntp in ntps:
            self.logger.debug(f"verifying {ntp}")
            manifest = manifests[ntp]
            segments = manifest['segments']
            manifest_segments = [
                _parse_manifest_segment(manifest, sname, meta, remote,
                                        self.logger)
                for sname, meta in segments.items()
            ]
            manifest_segments = sorted(manifest_segments,
                                       key=lambda x: x.base_offset)

            for node_key, node_segments in nodes.items():
                self.logger.debug(f"checking {ntp} on {node_key}")
                for mix, msegm in enumerate(manifest_segments):
                    if msegm is not None:
                        segments = sorted([
                            segment
                            for segment in node_segments if segment.ntp == ntp
                        ],
                                          key=lambda x: x.base_offset)
                        self.logger.debug(
                            f"checking manifest segment {msegm} over {node_key} segments {segments}"
                        )
                        found = False
                        for ix in range(0, len(segments)):
                            nsegm = segments[ix]
                            if nsegm.ntp != ntp:
                                continue
                            nsegm_co = -1 if (ix + 1) == len(segments) else (
                                segments[ix + 1].base_offset - 1)
                            self.logger.debug(
                                f"comparing {msegm.base_offset}:{msegm.committed_offset}:{msegm.md5} to {nsegm.base_offset}:{nsegm_co}:{nsegm.md5}"
                            )
                            if msegm.base_offset == nsegm.base_offset and msegm.md5 == nsegm.md5:
                                # Success
                                self.logger.info(
                                    f"found match for segment {msegm.ntp} {msegm.base_offset} on {node_key}"
                                )
                                manifest_segments[mix] = None
                                found = True
                                break
                            if msegm.committed_offset == nsegm_co and msegm.base_offset > nsegm.base_offset:
                                # Found segment with truncated head (due to leadership transition)
                                actual_hash = self._get_partial_checksum(
                                    node_key, nsegm.normalized_path,
                                    msegm.size)
                                self.logger.info(
                                    f"partial hash {actual_hash} retreived, s3 hash {msegm.md5}"
                                )
                                if actual_hash == msegm.md5:
                                    manifest_segments[mix] = None
                                    self.logger.info(f"partial match for segment {msegm.ntp} {msegm.base_offset}-" +\
                                                    f"{msegm.committed_offset} on {node_key}")
                                    found = True
                                    break
                        if not found:
                            self.logger.debug(
                                f"failed to match {msegm.base_offset}:{msegm.committed_offset}"
                            )
                        else:
                            self.logger.debug(
                                f"matched {msegm.base_offset}:{msegm.committed_offset} successfully"
                            )

            # All segments should be matched and set to None
            if any(manifest_segments):
                self.logger.debug(
                    f"manifest segments that fail to validate: {manifest_segments}"
                )
            assert not any(manifest_segments)
            # Verify goal #2, the last segment on a leader node is manifest.last_offset + 1
            ntp_offsets = []
            for node_key, node_segments in nodes.items():
                offsets = [
                    segm.base_offset for segm in node_segments
                    if segm.ntp == ntp
                ]
                if offsets:
                    max_offset = max([
                        segm.base_offset for segm in node_segments
                        if segm.ntp == ntp
                    ])
                    ntp_offsets.append(max_offset)
                    self.logger.debug(
                        f"NTP {ntp} has the largest offset {max_offset} on node {node_key}"
                    )
                else:
                    self.logger.debug(
                        f"NTP {ntp} has no offsets on node {node_key}")

            last_offset = int(manifest['last_offset'])
            self.logger.debug(
                f"last offset: {last_offset}, ntp offsets: {ntp_offsets}")
            assert (last_offset + 1) in ntp_offsets

    def _list_objects(self):
        results = [
            loc.Key for loc in self.s3_client.list_objects(self.s3_bucket)
        ]
        self.logger.debug(f"ListObjects: {results}")
        return results
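Both ArchivalTest variants call `_parse_normalized_segment_path`, which is defined elsewhere. Assuming the normalized layout documented in the checksum helpers (`<namespace>/<topic>/<partition>_<revision>/...`) and segment file names of the form `<base_offset>-<term>-v<version>.log` (an assumption), a hypothetical parser could look like the following; the field names and the NTP tuple are illustrative.

import collections

NTP = collections.namedtuple("NTP", ["ns", "topic", "partition", "revision"])
SegmentSummary = collections.namedtuple(
    "SegmentSummary",
    ["ntp", "base_offset", "term", "normalized_path", "md5", "size"])

def _parse_normalized_segment_path(path, md5, size):
    # Sketch: split "<ns>/<topic>/<partition>_<rev>/<base>-<term>-v1.log"
    # into an NTP plus per-segment metadata.
    ns, topic, part_rev, fname = path.split("/")
    partition, revision = part_rev.split("_")
    base_offset, term, _version = fname.split("-", 2)
    ntp = NTP(ns=ns, topic=topic, partition=int(partition), revision=int(revision))
    return SegmentSummary(ntp=ntp,
                          base_offset=int(base_offset),
                          term=int(term),
                          normalized_path=path,
                          md5=md5,
                          size=size)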
class PrefixTruncateRecoveryTest(RedpandaTest):
    """
    Verify that a kafka log that's been prefix truncated due to retention policy
    eventually converges with other raft group nodes.
    """
    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=5242880,
            log_compaction_interval_ms=2000,
        )

        topics = dict(topic=dict(cleanup_policy="delete"))

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf,
                             topics=topics)

        self.kafka_tools = KafkaCliTools(self.redpanda)

    @cluster(num_nodes=3)
    @matrix(acks=[-1, 1])
    def test_prefix_truncate_recovery(self, acks):
        # produce a little data
        self.kafka_tools.produce("topic", 1024, 1024, acks=acks)

        # stop one of the nodes
        node = self.redpanda.controller()
        self.redpanda.stop_node(node)

        # produce data to the topic until we observe that the retention policy
        # has kicked in and one or more segments has been deleted.
        self.produce_until_deleted(node)

        self.redpanda.start_node(node)
        self.verify_recovery(node)

    def produce_until_deleted(self, ignore_node):
        partitions = {}

        #
        # Produce until at least 3 segments per partition appear on disk.
        #
        def produce_until_segments(count):
            self.kafka_tools.produce("topic", 1000, 1000)
            storage = self.redpanda.storage()
            for p in storage.partitions("kafka", "topic"):
                if p.node == ignore_node:
                    continue
                if p.num not in partitions or len(
                        partitions[p.num].segments) < count:
                    partitions[p.num] = p
            self.logger.debug("Found partitions: %s", partitions)
            return partitions and all(
                map(lambda p: len(p[1].segments) >= count, partitions.items()))

        wait_until(lambda: produce_until_segments(3),
                   timeout_sec=60,
                   backoff_sec=1,
                   err_msg="Expected segments did not materialize")

        def make_segment_sets(partitions):
            return {
                p[0]: {s[0]
                       for s in p[1].segments.items()}
                for p in partitions.items()
            }

        orig_segments = make_segment_sets(partitions)
        self.logger.debug(f"Original segments: {orig_segments}")

        #
        # Continue producing until the original segments above have been deleted
        # because of the retention / cleanup policy.
        #
        def produce_until_segments_deleted():
            self.kafka_tools.produce("topic", 1000, 1000)
            storage = self.redpanda.storage()
            curr_segments = make_segment_sets(
                {p.num: p
                 for p in storage.partitions("kafka", "topic")})
            for p, segs in orig_segments.items():
                self.logger.debug("Partition %d segment set intersection: %s",
                                  p, segs.intersection(curr_segments[p]))
                if not segs.isdisjoint(curr_segments[p]):
                    return False
            return True

        wait_until(lambda: produce_until_segments_deleted(),
                   timeout_sec=60,
                   backoff_sec=1,
                   err_msg="Original segments were not deleted")

    def verify_recovery(self, node):
        # repeat until true
        #  1. collect segment files from quorum members
        #  2. verify byte-for-byte equivalence of common range
        #  3. success
        with tempfile.TemporaryDirectory() as d:
            self.redpanda.copy_data(d, node)
            store = vstorage.Store(d)
            for ntp in store.ntps:
                for path in ntp.segments:
                    try:
                        s = vstorage.Segment(path)
                    except vstorage.CorruptBatchError as e:
                        print("corruption detected in batch {} of segment: {}".
                              format(e.batch.index, path))
                        print("header of corrupt batch: {}".format(
                            e.batch.header))
                        continue
                    print("successfully decoded segment: {}".format(path))
Example n. 11
0
    def test_node_recovery(self, recovery_type):
        self.start_redpanda(num_nodes=3)
        kafka_tools = KafkaCliTools(self.redpanda)
        kafka_cat = KafkaCat(self.redpanda)
        # create topics
        topics = []
        for _ in range(0, 6):
            topics.append(TopicSpec(partition_count=random.randint(1, 10)))
        DefaultClient(self.redpanda).create_topic(topics)
        # choose one topic to run the main workload
        self.topic = random.choice(topics).name

        self.start_producer(1)
        self.start_consumer(2)
        self.await_startup()

        # choose another topic and populate it with data
        prepopulated_topic = random.choice(topics)

        while self.topic == prepopulated_topic.name:
            prepopulated_topic = random.choice(topics)

        # populate topic with data
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        def list_offsets():
            offsets = {}
            for p in range(0, prepopulated_topic.partition_count):
                offsets[p] = kafka_cat.list_offsets(prepopulated_topic.name, p)
            return offsets

        # store offsets
        offsets = list_offsets()

        self.redpanda.logger.info(f"Topic offsets: {offsets}")
        # stop one of the nodes and remove its data
        stopped = random.choice(self.redpanda.nodes)
        # prepare seed servers list
        seeds = map(lambda n: {
            "address": n.account.hostname,
            "port": 33145
        }, self.redpanda.nodes)
        seeds = list(
            filter(lambda n: n['address'] != stopped.account.hostname, seeds))

        self.redpanda.stop_node(stopped)
        if recovery_type == FullNodeRecoveryTest.FULL_RECOVERY:
            self.redpanda.clean_node(stopped, preserve_logs=True)

        # produce some more data to make sure that stopped node is behind
        kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

        # start the node with the same node id and a non-empty seed server
        # list; give it extra time to start as it has to recover
        self.redpanda.start_node(stopped,
                                 override_cfg_params={'seed_servers': seeds},
                                 timeout=90)

        def all_topics_recovered():
            metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                                  self.redpanda.nodes)
            under_replicated = filter(lambda s: s.value == 1, metric.samples)
            under_replicated = list(
                map(
                    lambda s: (s.labels['namespace'], s.labels['topic'], s.
                               labels['partition']), under_replicated))
            self.redpanda.logger.info(
                f"under replicated partitions: {list(under_replicated)}")
            return len(under_replicated) == 0

        # wait for prepopulated topic to recover
        wait_until(all_topics_recovered, 60, 1)

        self.run_validation(min_records=20000,
                            enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)

        # validate prepopulated topic offsets
        assert offsets == list_offsets()
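
# With the missing return added to list_offsets above, the final assertion
# compares real per-partition offsets. When such a comparison fails it helps
# to see which partitions moved; a tiny, purely illustrative helper for that
# could look like this (name and usage are assumptions, not part of the test):
def diff_offsets(before, after):
    """Return {partition: (before, after)} for partitions whose offsets differ."""
    return {
        p: (before.get(p), after.get(p))
        for p in set(before) | set(after) if before.get(p) != after.get(p)
    }

# hypothetical usage right before the assertion:
#   changes = diff_offsets(offsets, list_offsets())
#   self.redpanda.logger.info(f"offset changes after recovery: {changes}")
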
class PrefixTruncateRecoveryTest(RedpandaTest):
    """
    The purpose of this test is to exercise recovery of partitions which have
    had data reclaimed based on retention policy. The testing strategy is:

       1. Stop 1 out of 3 nodes
       2. Produce until retention policy reclaims data
       3. Restart the stopped node
       4. Verify that the stopped node recovers

    Leadership balancing is disabled in this test because the final verification
    step tries to force leadership so that verification may query metadata from
    specific nodes where the kafka protocol only returns state from leaders.
    """
    topics = (TopicSpec(cleanup_policy=TopicSpec.CLEANUP_DELETE), )

    def __init__(self, test_context):
        extra_rp_conf = dict(
            log_segment_size=1048576,
            retention_bytes=3145728,
            log_compaction_interval_ms=1000,
            enable_leader_balancer=False,
        )

        super(PrefixTruncateRecoveryTest,
              self).__init__(test_context=test_context,
                             num_brokers=3,
                             extra_rp_conf=extra_rp_conf)

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.kafka_cat = KafkaCat(self.redpanda)

    def fully_replicated(self, nodes):
        """
        Check that none of the specified nodes report under-replicated
        partitions corresponding to the test topic.
        """
        metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                              nodes)
        metric = metric.label_filter(dict(namespace="kafka", topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return all(map(lambda s: s.value == 0, metric.samples))

    def get_segments_deleted(self, nodes):
        """
        Return the values of the log segments removed metric.
        """
        metric = self.redpanda.metrics_sample("log_segments_removed", nodes)
        metric = metric.label_filter(dict(namespace="kafka", topic=self.topic))
        assert len(metric.samples) == len(nodes)
        return [s.value for s in metric.samples]

    def produce_until_reclaim(self, initial_deleted, acks):
        """
        Produce data until we observe that segments have been deleted. The
        initial_deleted parameter is the max number of segments deleted across
        nodes before production started; we return True once every running node
        reports at least two additional deletions, so that all of them have
        experienced some reclaim. Intended as a wait_until predicate: each call
        produces one batch and returns False until the condition holds.
        """
        deleted = self.get_segments_deleted(self.redpanda.nodes[1:])
        if all(map(lambda d: d >= initial_deleted + 2, deleted)):
            return True
        self.kafka_tools.produce(self.topic, 1024, 1024, acks=acks)
        return False

    @cluster(num_nodes=3, log_allow_list=LOG_ALLOW_LIST)
    @matrix(acks=[-1, 1], start_empty=[True, False])
    def test_prefix_truncate_recovery(self, acks, start_empty):
        # cover boundary conditions of partition being empty/non-empty
        if not start_empty:
            self.kafka_tools.produce(self.topic, 2048, 1024, acks=acks)
            wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                       timeout_sec=90,
                       backoff_sec=5)

        # stop this unfortunate node
        stopped_node = self.redpanda.nodes[0]
        self.redpanda.stop_node(stopped_node)

        # produce data into the topic until segments are reclaimed
        # by the configured retention policy
        deleted = max(self.get_segments_deleted(self.redpanda.nodes[1:]))
        wait_until(lambda: self.produce_until_reclaim(deleted, acks),
                   timeout_sec=90,
                   backoff_sec=5)

        # we should now observe an under replicated state
        wait_until(lambda: not self.fully_replicated(self.redpanda.nodes[1:]),
                   timeout_sec=90,
                   backoff_sec=5)

        # finally restart the node and wait until fully replicated
        self.redpanda.start_node(stopped_node)
        wait_until(lambda: self.fully_replicated(self.redpanda.nodes),
                   timeout_sec=90,
                   backoff_sec=5)

        self.verify_offsets()

    def verify_offsets(self):
        """
        Test that the ending offset for the partition, as seen from each node,
        is identical. Since we can only query this from the leader, we
        disable auto leadership balancing, and manually transfer leadership
        before querying.

        Note that because each node applies retention policy independently to a
        prefix of the log we can't reliably compare the starting offsets.
        """
        admin = Admin(self.redpanda)
        offsets = []
        for node in self.redpanda.nodes:
            admin.transfer_leadership_to(namespace="kafka",
                                         topic=self.topic,
                                         partition=0,
                                         target=node)
            # "% ERROR: offsets_for_times failed: Local: Unknown partition"
            # may occur here, presumably because of an interaction with the
            # leadership transfer. The built-in retries in list_offsets appear
            # to handle this gracefully and the test still passes.
            offsets.append(self.kafka_cat.list_offsets(self.topic, 0))
        assert all(map(lambda o: o[1] == offsets[0][1], offsets))
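
# The comment in verify_offsets leans on the built-in retries in list_offsets
# to absorb the transient "offsets_for_times failed: Local: Unknown partition"
# error that can follow a leadership transfer. If such retries were not
# available, a generic wrapper along these lines could serve the same purpose;
# the attempt count, delay, and broad except are assumptions for illustration.
import time


def with_retries(fn, attempts=5, delay_sec=2):
    """Call fn(), retrying on any exception up to `attempts` times."""
    last_exc = None
    for _ in range(attempts):
        try:
            return fn()
        except Exception as e:
            last_exc = e
            time.sleep(delay_sec)
    raise last_exc

# hypothetical usage inside the loop above:
#   offsets.append(with_retries(lambda: self.kafka_cat.list_offsets(self.topic, 0)))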