Example 1
def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        ))

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        ))

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        ))
Example 2
def test_multistorage_strategy() -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages

    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        1,
        int(32 * 1e6),
        int(64 * 1e6),
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
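        # The first payload targets an unrelated table and should not be
        # written to either storage; the other two are inserts for the
        # groupassignees and groupedmessages Postgres tables.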
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table",
              groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table",
              groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(Partition(Topic("topic"), 0), offset, payload, datetime.now(),
                offset + 1) for offset, payload in enumerate(payloads)
    ]

    with assert_changes(lambda: get_row_count(groupassignees.storage), 0,
                        1), assert_changes(
                            lambda: get_row_count(groupedmessages.storage), 0,
                            1):

        for message in messages:
            strategy.submit(message)

        with assert_changes(lambda: commit.call_args_list, [],
                            [call({Partition(Topic("topic"), 0): 3})]):
            strategy.close()
            strategy.join()
Example 3
    def test_offsets(self):
        event = self.event

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 456),
            123,
            KafkaPayload(None,
                         json.dumps((2, "insert", event)).encode("utf-8"),
                         []),  # event doesn't really matter
            datetime.now(),
        )

        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )
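        # Process a single message and flush it, then check that the offset and
        # partition were written alongside the event.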
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        clickhouse = (get_storage(
            StorageKey.EVENTS).get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY))

        assert clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" %
            self.table) == [(self.event["project_id"], self.event["event_id"],
                             123, 456)]
Example 4
    def eventstream(*, dataset: Dataset):
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumer import ConsumerWorker

            worker = ConsumerWorker(storage, metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example 5
    def _wrap(self, msg: str) -> Message[KafkaPayload]:
        return Message(
            Partition(Topic("replacements"), 0),
            0,
            KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
            datetime.now(),
        )
Example 6
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event["datetime"] = old_timestamp_str
        event["data"]["datetime"] = old_timestamp_str
        event["data"]["received"] = int(
            calendar.timegm(old_timestamp.timetuple()))

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 1),
            42,
            KafkaPayload(None,
                         json.dumps((2, "insert", event)).encode("utf-8"), []),
            datetime.now(),
        )

        assert test_worker.process_message(message) is None
Example 7
    def test_flattened_tags(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        # | and = are intentional to test the escaping logic when computing the
        # flattened_tags on tag deletions
        self.event["data"]["tags"] = []
        self.event["data"]["tags"].append(["browser|name", "foo=1"])
        self.event["data"]["tags"].append(["browser|to_delete", "foo=2"])
        self.event["data"]["tags"].append(["notbrowser", "foo\\3"])
        self.event["data"]["tags"].append(["notbrowser2", "foo4"])
        self.write_events([self.event])

        project_id = self.project_id

        def _fetch_flattened_tags():
            return json.loads(
                self.app.post(
                    "/query",
                    data=json.dumps({
                        "project": [project_id],
                        "selected_columns": [
                            "_tags_flattened",
                            "tags.key",
                            "tags.value",
                        ],
                    }),
                ).data)["data"]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser|to_delete",
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _fetch_flattened_tags() == [{
            "tags.key": ["browser|name", "notbrowser", "notbrowser2"],
            "tags.value": ["foo=1", "foo\\3", "foo4"],
            "_tags_flattened":
            "|browser\\|name=foo\\=1||notbrowser=foo\\\\3||notbrowser2=foo4|",
        }]
Example 8
    def __make_msg(self, partition: int, offset: int, payload: str,
                   headers: Headers) -> Message[KafkaPayload]:
        return Message(
            partition=Partition(Topic("topic"), partition),
            offset=offset,
            payload=KafkaPayload(b"key", payload.encode(), headers),
            timestamp=datetime(2019, 6, 19, 6, 46, 28),
        )
Example 9
    def test_delete_tag_promoted_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["data"]["tags"].append(["browser.name", "foo"])
        self.event["data"]["tags"].append(["notbrowser", "foo"])
        self.write_unprocessed_events([self.event])

        project_id = self.project_id

        def _issue_count(total=False):
            return json.loads(
                self.app.post(
                    "/query",
                    data=json.dumps(
                        {
                            "project": [project_id],
                            "aggregations": [["count()", "", "count"]],
                            "conditions": [["tags[browser.name]", "=", "foo"]]
                            if not total
                            else [],
                            "groupby": ["group_id"],
                        }
                    ),
                ).data
            )["data"]

        assert _issue_count() == [{"count": 1, "group_id": 1}]
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        "end_delete_tag",
                        {
                            "project_id": project_id,
                            "tag": "browser.name",
                            "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

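        # Deleting the promoted "browser.name" tag should remove the event from
        # tag-based queries while leaving the total count unchanged.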
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _issue_count() == []
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
Example 10
def test_synchronized_consumer_handles_end_of_partition(
    broker: Broker[KafkaPayload], ) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer", enable_end_of_partition=True)
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"),
                                             [])).result(1.0) for i in range(2)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

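        # Committing the first message's offset for the "leader" group releases
        # that message to the synchronized consumer.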
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0),
                           messages[0].next_offset), ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[0]

        # If the commit log consumer does not handle EOF, it will have crashed
        # here and will never return the next message.
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0),
                           messages[1].next_offset), ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[1]
Example 11
def test_parallel_transform_worker_apply() -> None:
    messages = [
        Message(
            Partition(Topic("test"), 0),
            i,
            KafkaPayload(None, b"\x00" * size, None),
            datetime.now(),
        ) for i, size in enumerate([1000, 1000, 2000, 4000])
    ]

    with SharedMemoryManager() as smm:
        input_block = smm.SharedMemory(8192)
        assert input_block.size == 8192

        input_batch = MessageBatch(input_block)
        for message in messages:
            input_batch.append(message)

        assert len(input_batch) == 4

        output_block = smm.SharedMemory(4096)
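        # The transform doubles each payload (see transform_payload_expand), so
        # this 4096-byte output block cannot hold the whole transformed batch
        # in a single pass.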
        assert output_block.size == 4096

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
        )

        # The first batch should be able to fit 2 messages.
        assert index == 2
        assert len(output_batch) == 2

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
            index,
        )

        # The second batch should be able to fit one message.
        assert index == 3
        assert len(output_batch) == 1

        # The last message is too large to fit in the batch.
        with pytest.raises(ValueTooLarge):
            parallel_transform_worker_apply(
                transform_payload_expand,
                input_batch,
                output_block,
                index,
            )
Example 12
def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    "Create a fake Kafka message for each JSON event in the file."
    messages: MutableSequence[Message[KafkaPayload]] = []
    raw_events = open(events_file).readlines()
    for raw_event in raw_events:
        messages.append(
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8"), []),
                datetime.now(),
            ), )
    return messages
Example 13
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

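        # Inserts are fed through the streaming consumer strategy; any other
        # record type is handled by the replacer.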
        if type_ == "insert":
            from snuba.consumers.consumer import StreamingConsumerStrategyFactory

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = StreamingConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                stream_loader.get_processor(),
                table_writer.get_batch_writer(metrics),
                metrics,
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example 14
    def test_commit_log_consumer(self) -> None:
        # XXX: This would be better as an integration test (or at least a test
        # against an abstract Producer interface) instead of a test against a
        # mock.
        commit_log_producer = FakeConfluentKafkaProducer()

        consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
            {
                **self.configuration,
                "auto.offset.reset": "earliest",
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "false",
                "enable.partition.eof": "true",
                "group.id": "test",
                "session.timeout.ms": 10000,
            },
            producer=commit_log_producer,
            commit_log_topic=Topic("commit-log"),
        )

        with self.get_topic() as topic, closing(consumer) as consumer:
            with closing(self.get_producer()) as producer:
                producer.produce(topic, next(self.get_payloads())).result(5.0)

            consumer.subscribe([topic])

            message = consumer.poll(
                10.0)  # XXX: getting the subscription is slow
            assert isinstance(message, Message)

            consumer.stage_offsets({message.partition: message.next_offset})

            assert consumer.commit_offsets() == {
                Partition(topic, 0): message.next_offset
            }

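            # Committing should also publish a Commit record to the commit log
            # topic via the provided producer.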
            assert len(commit_log_producer.messages) == 1
            commit_message = commit_log_producer.messages[0]
            assert commit_message.topic() == "commit-log"

            assert commit_codec.decode(
                KafkaPayload(
                    commit_message.key(),
                    commit_message.value(),
                    commit_message.headers(),
                )) == Commit("test", Partition(topic, 0), message.next_offset)
Example 15
    def encode(self, value: SubscriptionTaskResult) -> KafkaPayload:
        subscription_id = str(value.task.task.identifier)
        request, result = value.result
        return KafkaPayload(
            subscription_id.encode("utf-8"),
            json.dumps({
                "version": 2,
                "payload": {
                    "subscription_id": subscription_id,
                    "request": {**request.body},
                    "result": result,
                    "timestamp": value.task.timestamp.isoformat(),
                },
            }).encode("utf-8"),
            [],
        )
Example 16
    def test_unmerge_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        self.write_events([self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

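        # Unmerging moves events with the listed hash from group 1 to group 2.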
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example 17
def test_message_batch() -> None:
    partition = Partition(Topic("test"), 0)

    with SharedMemoryManager() as smm:
        block = smm.SharedMemory(4096)
        assert block.size == 4096

        message = Message(partition, 0, KafkaPayload(None, b"\x00" * 4000,
                                                     None), datetime.now())

        batch: MessageBatch[KafkaPayload] = MessageBatch(block)
        with assert_changes(lambda: len(batch), 0, 1):
            batch.append(message)

        assert batch[0] == message
        assert list(batch) == [message]

        with assert_does_not_change(lambda: len(batch),
                                    1), pytest.raises(ValueTooLarge):
            batch.append(message)
Example 18
    def test_delete_groups_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_delete_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

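        # Flushing the end_delete_groups replacement should remove the group
        # from query results entirely.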
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == []
Example 19
def test_parallel_transform_step() -> None:
    next_step = Mock()

    messages = [
        Message(
            Partition(Topic("test"), 0),
            i,
            KafkaPayload(None, b"\x00" * size, None),
            datetime.now(),
        ) for i, size in enumerate([1000, 1000, 2000, 2000])
    ]

    starting_processes = get_subprocess_count()
    worker_processes = 2
    manager_processes = 1
    metrics = TestingMetricsBackend()

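    # Creating the step spawns the worker pool (workers plus a manager
    # process); batches in progress are reported through the gauge metric as
    # messages are submitted.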
    with assert_changes(
            get_subprocess_count,
            starting_processes,
            starting_processes + worker_processes + manager_processes,
    ), assert_changes(
            lambda: metrics.calls,
        [],
        [
            GaugeCall("batches_in_progress", value, tags=None)
            for value in [0.0, 1.0, 2.0]
        ],
    ):
        transform_step = ParallelTransformStep(
            transform_payload_expand,
            next_step,
            processes=worker_processes,
            max_batch_size=5,
            max_batch_time=60,
            input_block_size=4096,
            output_block_size=4096,
            metrics=metrics,
        )

        for message in messages:
            transform_step.poll()
            transform_step.submit(message)

        transform_step.close()

    metrics.calls.clear()

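    # Joining the step should tear the pool back down and bring the
    # "batches_in_progress" gauge back to zero.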
    with assert_changes(
            get_subprocess_count,
            starting_processes + worker_processes + manager_processes,
            starting_processes,
    ), assert_changes(
            lambda: metrics.calls,
        [],
        [
            GaugeCall("batches_in_progress", value, tags=None)
            for value in [1.0, 0.0]
        ],
    ):
        transform_step.join()

    assert next_step.submit.call_count == len(messages)
Example 20
def test_synchronized_consumer(broker: Broker[KafkaPayload]) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer")
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"),
                                             [])).result(1.0) for i in range(6)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader-a", "leader-b"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

        # The consumer should not consume any messages until it receives a
        # commit from both groups that are being followed.
        with assert_changes(consumer.paused, [],
                            [Partition(topic, 0)]), assert_changes(
                                consumer.tell, {},
                                {Partition(topic, 0): messages[0].offset}):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-a", Partition(topic, 0),
                           messages[0].next_offset)),
            ).result(),
        )

        # The consumer should remain paused, since it needs both groups to
        # advance before it may continue.
        with assert_does_not_change(
                consumer.paused,
            [Partition(topic, 0)]), assert_does_not_change(
                consumer.tell, {Partition(topic, 0): messages[0].offset}):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-b", Partition(topic, 0),
                           messages[0].next_offset)),
            ).result(),
        )

        # The consumer should be able to resume consuming, since both consumers
        # have processed the first message.
        with assert_changes(consumer.paused, [Partition(topic, 0)],
                            []), assert_changes(
                                consumer.tell,
                                {Partition(topic, 0): messages[0].offset},
                                {Partition(topic, 0): messages[0].next_offset},
                            ):
            assert synchronized_consumer.poll(0.0) == messages[0]

        # After consuming the one available message, the consumer should be
        # paused again until the remote offsets advance.
        with assert_changes(consumer.paused, [],
                            [Partition(topic, 0)]), assert_does_not_change(
                                consumer.tell,
                                {Partition(topic, 0): messages[1].offset}):
            assert synchronized_consumer.poll(0.0) is None

        # Emulate the unlikely (but possible) scenario of the leader offsets
        # being within a series of compacted (deleted) messages by:
        # 1. moving the remote offsets forward, so that the partition is resumed
        # 2. seeking the consumer beyond the remote offsets

        producer.produce(
            commit_log_topic,
            commit_codec.encode(
                Commit("leader-a", Partition(topic, 0), messages[3].offset)),
        ).result()

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-b", Partition(topic, 0),
                           messages[5].offset)),
            ).result(),
        )

        # Both commit log groups have now advanced past the local offset, so
        # the consumer should be able to resume consuming.
        with assert_changes(consumer.paused, [Partition(topic, 0)],
                            []), assert_changes(
                                consumer.tell,
                                {Partition(topic, 0): messages[1].offset},
                                {Partition(topic, 0): messages[1].next_offset},
                            ):
            assert synchronized_consumer.poll(0.0) == messages[1]

        # At this point, we manually seek the consumer offset, to emulate messages being skipped.
        with assert_changes(
                consumer.tell,
            {Partition(topic, 0): messages[2].offset},
            {Partition(topic, 0): messages[4].offset},
        ):
            consumer.seek({Partition(topic, 0): messages[4].offset})

        # Since the (effective) remote offset is the offset for message #3 (via
        # ``leader-a``), and the local offset is the offset of message #4, when
        # message #4 is consumed, it should be discarded and the offset should
        # be rolled back to wait for the commit log to advance.
        with assert_changes(consumer.paused, [],
                            [Partition(topic, 0)]), assert_does_not_change(
                                consumer.tell,
                                {Partition(topic, 0): messages[4].offset}):
            assert synchronized_consumer.poll(0.0) is None

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader-a", Partition(topic, 0),
                           messages[5].offset)),
            ).result(),
        )

        # The consumer should be able to resume consuming.
        with assert_changes(consumer.paused, [Partition(topic, 0)],
                            []), assert_changes(
                                consumer.tell,
                                {Partition(topic, 0): messages[4].offset},
                                {Partition(topic, 0): messages[4].next_offset},
                            ):
            assert synchronized_consumer.poll(0.0) == messages[4]
Example 21
def test_payload_equality() -> None:
    assert KafkaPayload(None, b"", []) == KafkaPayload(None, b"", [])
    assert KafkaPayload(b"key", b"value",
                        []) == KafkaPayload(b"key", b"value", [])
    assert KafkaPayload(None, b"", [("key", b"value")]) == KafkaPayload(
        None, b"", [("key", b"value")])
    assert not KafkaPayload(None, b"a", []) == KafkaPayload(None, b"b", [])
    assert not KafkaPayload(b"this", b"", []) == KafkaPayload(b"that", b"", [])
    assert not KafkaPayload(None, b"", [("key", b"this")]) == KafkaPayload(
        None, b"", [("key", b"that")])
Example 22
def test_synchronized_consumer_pause_resume(
        broker: Broker[KafkaPayload]) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer")
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"),
                                             [])).result(1.0) for i in range(2)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):

        def assignment_callback(offsets: Mapping[Partition, int]) -> None:
            synchronized_consumer.pause([Partition(topic, 0)])

        synchronized_consumer.subscribe([topic], on_assign=assignment_callback)

        with assert_changes(synchronized_consumer.paused, [],
                            [Partition(topic, 0)]), assert_changes(
                                consumer.paused, [], [Partition(topic, 0)]):
            assert synchronized_consumer.poll(0.0) is None

        # Advancing the commit log offset should not cause the consumer to
        # resume, since it has been explicitly paused.
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0),
                           messages[0].next_offset)),
            ).result(),
        )

        with assert_does_not_change(consumer.paused, [Partition(topic, 0)]):
            assert synchronized_consumer.poll(0) is None

        # Resuming the partition does not immediately cause the partition to
        # resume, but it should look as if it is resumed to the caller.
        with assert_changes(synchronized_consumer.paused,
                            [Partition(topic, 0)], []), assert_does_not_change(
                                consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.resume([Partition(topic, 0)])

        # The partition should be resumed on the next poll call, however.
        with assert_changes(consumer.paused, [Partition(topic, 0)], []):
            assert synchronized_consumer.poll(0) == messages[0]

        # Pausing due to hitting the offset fence should not appear as a paused
        # partition to the caller.
        with assert_does_not_change(synchronized_consumer.paused,
                                    []), assert_changes(
                                        consumer.paused, [],
                                        [Partition(topic, 0)]):
            assert synchronized_consumer.poll(0) is None

        # Other pause and resume actions should not cause the inner consumer to
        # change its state while up against the fence.
        with assert_changes(synchronized_consumer.paused, [],
                            [Partition(topic, 0)]), assert_does_not_change(
                                consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.pause([Partition(topic, 0)])

        with assert_changes(synchronized_consumer.paused,
                            [Partition(topic, 0)], []), assert_does_not_change(
                                consumer.paused, [Partition(topic, 0)]):
            synchronized_consumer.resume([Partition(topic, 0)])
Example 23
def test_streaming_consumer_strategy() -> None:
    messages = (Message(
        Partition(Topic("events"), 0),
        i,
        KafkaPayload(None, b"{}", None),
        datetime.now(),
    ) for i in itertools.count())

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

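    # A processor returning a bare dict (rather than an InsertBatch or
    # ReplacementBatch) should cause the strategy to raise a TypeError.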
    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call,
                          Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(get_number_of_insertion_metrics, 0,
                        expected_write_count), assert_changes(
                            lambda: writer.write.call_count, 0,
                            expected_write_count), assert_changes(
                                lambda: len(replacements_producer.messages), 0,
                                1):
        strategy.close()
        strategy.join()
Example 24
def test_payload_pickle_simple() -> None:
    payload = KafkaPayload(b"key", b"value", [])
    assert pickle.loads(pickle.dumps(payload)) == payload
Example 25
    def test_reprocessing_flow_insert(self) -> None:
        # We have a group that contains two events, 1 and 2.
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
        write_unprocessed_events(self.storage, [self.event])
        self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "tombstone_events",
                    {
                        "project_id": project_id,
                        "event_ids": [event_id]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # The user chooses to reprocess a subset of the group and throw away
        # the other events. Event 1 gets manually tombstoned by Sentry while
        # Event 2 prevails.
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # At this point the count doesn't make any sense but we don't care.
        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        # The reprocessed event is inserted with a guaranteed-new group ID but
        # the *same* event ID (this is why we need to skip tombstoning this
        # event ID)
        self.event["group_id"] = 2
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "exclude_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # Group 1 is excluded from queries. At this point we have almost a
        # regular group deletion, except only a subset of events have been
        # tombstoned (the ones that will *not* be reprocessed).
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # Group 2 should contain the one event that the user chose to
        # reprocess, and Group 1 should be gone. (Note: In the product Group 2
        # looks identical to Group 1, including short ID).
        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
        assert self._get_group_id(project_id, event_id2) == 2
        assert not self._get_group_id(project_id, event_id)
Example 26
def test_payload_pickle_out_of_band() -> None:
    payload = KafkaPayload(b"key", b"value", [])
    buffers: MutableSequence[PickleBuffer] = []
    data = pickle.dumps(payload, protocol=5, buffer_callback=buffers.append)
    assert pickle.loads(data, buffers=[b.raw() for b in buffers]) == payload
Example 27
    def get_payloads(self) -> Iterator[KafkaPayload]:
        for i in itertools.count():
            yield KafkaPayload(None, f"{i}".encode("utf8"), [])
Example 28
def transform_payload_expand(message: Message[KafkaPayload]) -> KafkaPayload:
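    # Double the payload value so the transformed message is larger than its
    # input.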
    return KafkaPayload(
        message.payload.key,
        message.payload.value * 2,
        message.payload.headers,
    )