Example #1
def test_process_messages(mock_indexer, mock_task) -> None:
    message_payloads = [counter_payload, distribution_payload, set_payload]
    message_batch = [
        Message(
            Partition(Topic("topic"), 0),
            i + 1,
            KafkaPayload(None,
                         json.dumps(payload).encode("utf-8"), []),
            datetime.now(),
        ) for i, payload in enumerate(message_payloads)
    ]
    # the outer message uses the last message's partition, offset, and timestamp
    last = message_batch[-1]
    outer_message = Message(last.partition, last.offset, message_batch,
                            last.timestamp)

    new_batch = process_messages(outer_message=outer_message)
    expected_new_batch = [
        Message(
            m.partition,
            m.offset,
            KafkaPayload(
                None,
                json.dumps(__translated_payload(
                    message_payloads[i])).encode("utf-8"),
                [],
            ),
            m.timestamp,
        ) for i, m in enumerate(message_batch)
    ]

    assert new_batch == expected_new_batch
Example #2
def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        ))

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        ))

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        ))
Example #3
def test_multistorage_strategy(
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
) -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages
    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        processes,
        input_block_size,
        output_block_size,
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):

        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list, [], [call({Partition(Topic("topic"), 0): 3})]
        ):
            strategy.close()
            strategy.join()
Example #4
def _batch_message_set_up(next_step: Mock,
                          max_batch_time: float = 100.0,
                          max_batch_size: int = 2):
    # batch time is in seconds
    batch_messages_step = BatchMessages(next_step=next_step,
                                        max_batch_time=max_batch_time,
                                        max_batch_size=max_batch_size)

    message1 = Message(Partition(Topic("topic"), 0), 1,
                       KafkaPayload(None, b"some value", []), datetime.now())
    message2 = Message(Partition(Topic("topic"), 0), 2,
                       KafkaPayload(None, b"another value", []),
                       datetime.now())
    return (batch_messages_step, message1, message2)
Example #5
    def test_delete_tag_promoted_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["data"]["tags"].append(["browser.name", "foo"])
        self.event["data"]["tags"].append(["notbrowser", "foo"])
        write_unprocessed_events(self.storage, [self.event])

        project_id = self.project_id

        def _issue_count(total: bool = False) -> Sequence[Mapping[str, Any]]:
            clickhouse = self.storage.get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY)

            total_cond = (
                "AND has(_tags_hash_map, cityHash64('browser.name=foo'))"
                if not total else "")

            data = clickhouse.execute(f"""
                SELECT group_id, count()
                FROM errors_local
                FINAL
                WHERE deleted = 0
                AND project_id = {project_id}
                {total_cond}
                GROUP BY group_id
                """).results

            return [{"group_id": row[0], "count": row[1]} for row in data]

        assert _issue_count() == [{"count": 1, "group_id": 1}]
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_DELETE_TAG,
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

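        # After the replacement, the tag no longer matches the filter, but the
        # event row itself is still present (hence the total count of 1 below).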
        assert _issue_count() == []
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
Example #6
 def _wrap(self, msg: Tuple[Any, ...]) -> Message[KafkaPayload]:
     return Message(
         Partition(Topic("replacements"), 0),
         0,
         KafkaPayload(None,
                      json.dumps(msg).encode("utf-8"), []),
         datetime.now(),
     )
Example #7
    def test_offset_already_processed(self) -> None:
        """
        Don't process an offset that already exists in Redis.
        """
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        key = f"replacement:{CONSUMER_GROUP}:errors:1"
        redis_client.set(key, 42)

        old_offset: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            41,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        same_offset: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

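        # Both offsets (41 and 42) are at or below the stored offset of 42, so
        # neither message is processed.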
        assert self.replacer.process_message(old_offset) is None
        assert self.replacer.process_message(same_offset) is None
Example #8
def test_metrics_batch_builder():
    max_batch_time = 3.0  # seconds
    max_batch_size = 2

    # 1. Ready when max_batch_size is reached
    batch_builder_size = MetricsBatchBuilder(max_batch_size=max_batch_size,
                                             max_batch_time=max_batch_time)

    assert not batch_builder_size.ready()

    message1 = Message(Partition(Topic("topic"), 0), 1,
                       KafkaPayload(None, b"some value", []), datetime.now())
    batch_builder_size.append(message1)
    assert not batch_builder_size.ready()

    message2 = Message(Partition(Topic("topic"), 0), 2,
                       KafkaPayload(None, b"another value", []),
                       datetime.now())
    batch_builder_size.append(message2)
    assert batch_builder_size.ready()

    # 2. Ready when max_batch_time is reached
    batch_builder_time = MetricsBatchBuilder(max_batch_size=max_batch_size,
                                             max_batch_time=max_batch_time)

    assert not batch_builder_time.ready()

    message1 = Message(Partition(Topic("topic"), 0), 1,
                       KafkaPayload(None, b"some value", []), datetime.now())
    batch_builder_time.append(message1)
    assert not batch_builder_time.ready()

    time.sleep(3)
    assert batch_builder_time.ready()

    # 3. Adding the same message twice to the same batch
    batch_builder_time = MetricsBatchBuilder(max_batch_size=max_batch_size,
                                             max_batch_time=max_batch_time)
    message1 = Message(Partition(Topic("topic"), 0), 1,
                       KafkaPayload(None, b"some value", []), datetime.now())
    batch_builder_time.append(message1)
    with pytest.raises(DuplicateMessage):
        batch_builder_time.append(message1)
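The three numbered cases above pin down the interface this test relies on: ready() becomes true once either max_batch_size or max_batch_time (in seconds) is exhausted, and append() raises DuplicateMessage when the same message is added twice. The following is a minimal, stdlib-only sketch of such a batch builder, derived only from the behaviour exercised here; the class and exception names are illustrative stand-ins, not snuba's actual MetricsBatchBuilder.
import time
from typing import Any, List


class SketchDuplicateMessage(Exception):
    """Raised when a message is appended to the same batch twice."""


class SketchMetricsBatchBuilder:
    """Illustrative stand-in derived from the test above; not the real class."""

    def __init__(self, max_batch_size: int, max_batch_time: float) -> None:
        self.__max_batch_size = max_batch_size
        self.__max_batch_time = max_batch_time  # seconds
        self.__messages: List[Any] = []
        self.__started = time.time()

    def ready(self) -> bool:
        # Ready once the batch is full or its time budget has elapsed.
        if len(self.__messages) >= self.__max_batch_size:
            return True
        return time.time() - self.__started >= self.__max_batch_time

    def append(self, message: Any) -> None:
        if message in self.__messages:
            raise SketchDuplicateMessage
        self.__messages.append(message)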
Example #9
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from arroyo.processing.strategies.streaming import (
                KafkaConsumerStrategyFactory,
            )

            from snuba.consumers.consumer import build_batch_writer, process_message

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = KafkaConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                functools.partial(
                    process_message, stream_loader.get_processor(), "consumer_group"
                ),
                build_batch_writer(table_writer, metrics=metrics),
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #10
 def __format_payload(
     self,
     message: Message[Tuple[StorageKey, Union[None, BytesInsertBatch,
                                              ReplacementBatch]]],
 ) -> List[KafkaPayload]:
     kafka_payloads: List[KafkaPayload] = []
     storage_key, payload = message.payload
     if isinstance(payload, BytesInsertBatch):
         for row in payload.rows:
             kafka_payloads.append(
                 KafkaPayload(storage_key.value.encode("utf-8"), row, []))
     return kafka_payloads
Example #11
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        now = datetime.now()

        position = Position(message.next_offset, now)

        consumer.stage_positions({message.partition: position})

        assert consumer.commit_positions() == {Partition(topic, 0): position}

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )) == Commit("test", Partition(topic, 0), message.next_offset, now)
Example #12
    def test_multiple_partitions(self) -> None:
        """
        Different partitions should have independent offset checks.
        """
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        payload = KafkaPayload(
            None,
            json.dumps((
                2,
                ReplacementType.END_UNMERGE,
                {
                    "project_id":
                    self.project_id,
                    "previous_group_id":
                    1,
                    "new_group_id":
                    2,
                    "hashes": ["a" * 32],
                    "datetime":
                    datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                },
            )).encode("utf-8"),
            [],
        )
        offset = 42
        timestamp = datetime.now()

        partition_one: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            offset,
            payload,
            timestamp,
        )
        partition_two: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 2),
            offset,
            payload,
            timestamp,
        )

        processed = self.replacer.process_message(partition_one)
        self.replacer.flush_batch([processed])
        # different partition should be unaffected even if it's the same offset
        assert self.replacer.process_message(partition_two) is not None
Example #13
def test_kafka_filter_header_with_bypass() -> None:
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

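    # With a bypass threshold of 5, every fifth matching message is passed
    # through (should_drop returns False) instead of being dropped.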
    for _ in range(3):
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is False
Example #14
 def test_skip_kafka_message(self) -> None:
     state.set_config("kafka_messages_to_skip",
                      "[snuba-test-lol:1:2,snuba-test-yeet:0:1]")
     assert skip_kafka_message(
         Message(
             Partition(Topic("snuba-test-lol"), 1),
             2,
             KafkaPayload(None, b"", []),
             datetime.now(),
         ))
     assert skip_kafka_message(
         Message(
             Partition(Topic("snuba-test-yeet"), 0),
             1,
             KafkaPayload(None, b"", []),
             datetime.now(),
         ))
     assert not skip_kafka_message(
         Message(
             Partition(Topic("snuba-test-lol"), 2),
             1,
             KafkaPayload(None, b"", []),
             datetime.now(),
         ))
Example #15
    def test_unmerge_hierarchical_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "b" * 32
        self.event["data"]["hierarchical_hashes"] = ["a" * 32]
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE_HIERARCHICAL,
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hierarchical_hash": "a" * 32,
                        "primary_hash": "b" * 32,
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example #16
    def encode(self, value: ScheduledSubscriptionTask) -> KafkaPayload:
        entity, subscription, tick_upper_offset = value.task

        return KafkaPayload(
            str(subscription.identifier).encode("utf-8"),
            cast(
                str,
                rapidjson.dumps(
                    {
                        "timestamp": value.timestamp.isoformat(),
                        "entity": entity.value,
                        "task": {"data": subscription.data.to_dict()},
                        "tick_upper_offset": tick_upper_offset,
                    }
                ),
            ).encode("utf-8"),
            [],
        )
Example #17
 def encode(self, value: SubscriptionTaskResult) -> KafkaPayload:
     subscription_id = str(value.task.task.identifier)
     request, result = value.result
     return KafkaPayload(
         subscription_id.encode("utf-8"),
         json.dumps({
             "version": 2,
             "payload": {
                 "subscription_id": subscription_id,
                 "request": {
                     **request.body
                 },
                 "result": result,
                 "timestamp": value.task.timestamp.isoformat(),
             },
         }).encode("utf-8"),
         [],
     )
Example #18
    def test_unmerge_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example #19
    def test_delete_groups_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.utcnow()

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_DELETE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == []

        # Count is still zero after Redis is flushed and parts are merged
        self._clear_redis_and_force_merge()
        assert self._issue_count(self.project_id) == []
Example #20
    def test_reset_consumer_group_offset_check(self) -> None:
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id":
                        self.project_id,
                        "previous_group_id":
                        1,
                        "new_group_id":
                        2,
                        "hashes": ["a" * 32],
                        "datetime":
                        datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        self.replacer.flush_batch([self.replacer.process_message(message)])

        set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

        # Offset to check against should be reset so this message shouldn't be skipped
        assert self.replacer.process_message(message) is not None
Example #21
    def test_process_offset_twice(self) -> None:
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id":
                        self.project_id,
                        "previous_group_id":
                        1,
                        "new_group_id":
                        2,
                        "hashes": ["a" * 32],
                        "datetime":
                        datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # Returns None because the offset is already recorded in Redis, so the message is skipped
        assert self.replacer.process_message(message) is None
Example #22
def test_invalid_commit_log_message(caplog: Any) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # produce invalid payload to commit log topic (key should not be None)
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
Example #23
def test_produce_step() -> None:
    topic = Topic("snuba-metrics")
    partition = Partition(topic, 0)

    clock = Clock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(topic, partitions=1)
    producer = broker.get_producer()

    commit = Mock()

    produce_step = ProduceStep(commit_function=commit, producer=producer)

    message_payloads = [counter_payload, distribution_payload, set_payload]
    message_batch = [
        Message(
            Partition(Topic("topic"), 0),
            i + 1,
            KafkaPayload(
                None,
                json.dumps(__translated_payload(
                    message_payloads[i])).encode("utf-8"), []),
            datetime.now(),
        ) for i, payload in enumerate(message_payloads)
    ]
    # the outer message uses the last message's partition, offset, and timestamp
    last = message_batch[-1]
    outer_message = Message(last.partition, last.offset, message_batch,
                            last.timestamp)

    # 1. Submit the message (that would have been generated from process_messages)
    produce_step.submit(outer_message=outer_message)

    # 2. Check that submit created the same number of futures as
    #    messages in the outer_message (3 in this test). Also check
    #    that the produced message payloads are as expected.
    assert len(produce_step._ProduceStep__futures) == 3

    first_message = broker_storage.consume(partition, 0)
    assert first_message is not None

    second_message = broker_storage.consume(partition, 1)
    assert second_message is not None

    third_message = broker_storage.consume(partition, 2)
    assert third_message is not None

    assert broker_storage.consume(partition, 3) is None

    produced_messages = [
        json.loads(msg.payload.value.decode("utf-8"), use_rapid_json=True)
        for msg in [first_message, second_message, third_message]
    ]
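    # json.loads yields string object keys, so the translated tag keys are
    # stringified below before comparing against the produced messages.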
    expected_produced_messages = []
    for payload in message_payloads:
        translated = __translated_payload(payload)
        tags: Mapping[str, int] = {
            str(k): v
            for k, v in translated["tags"].items()
        }
        translated.update(**{"tags": tags})
        expected_produced_messages.append(translated)

    assert produced_messages == expected_produced_messages

    # 3. Call poll method, and check that doing so checked that
    #    futures were ready and successful and therefore messages
    #    were committed.
    produce_step.poll()
    expected_commit_calls = [
        call({message.partition: Position(message.offset, message.timestamp)})
        for message in message_batch
    ]
    assert commit.call_args_list == expected_commit_calls

    produce_step.close()
    produce_step.join()
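As a point of reference, the behaviour the assertions above depend on can be sketched in a few lines: submit() produces one message per inner message of the batch and records the returned futures, and poll() commits the offsets of the messages whose futures have completed. This is a simplified, generically typed stand-in (SketchProduceStep and its commit payload are assumptions), not snuba's ProduceStep.
from collections import deque
from typing import Any, Callable, Deque, Tuple


class SketchProduceStep:
    """Illustrative stand-in derived from the test above; not the real class."""

    def __init__(
        self, commit_function: Callable[[Any], None], producer: Any, topic: Any
    ) -> None:
        self.__commit = commit_function
        self.__producer = producer
        self.__topic = topic
        # One (message, future) pair per inner message that was produced.
        self.__futures: Deque[Tuple[Any, Any]] = deque()

    def submit(self, outer_message: Any) -> None:
        for message in outer_message.payload:
            future = self.__producer.produce(self.__topic, message.payload)
            self.__futures.append((message, future))

    def poll(self) -> None:
        # Commit, in order, every message whose produce future has completed.
        # (The real step commits a Position(offset, timestamp) per partition.)
        while self.__futures and self.__futures[0][1].done():
            message, _ = self.__futures.popleft()
            self.__commit({message.partition: (message.offset, message.timestamp)})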
Example #24
import pytest
from arroyo import Message, Partition, Topic
from arroyo.backends.kafka import KafkaPayload

from snuba.datasets.message_filters import (
    KafkaHeaderFilter,
    KafkaHeaderFilterWithBypass,
)

test_data = [
    pytest.param(
        KafkaHeaderFilter("should_drop", "1"),
        Message(
            Partition(Topic("random"), 1),
            1,
            KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
            datetime.now(),
        ),
        True,
        id="matching-headers",
    ),
    pytest.param(
        KafkaHeaderFilter("should_drop", "0"),
        Message(
            Partition(Topic("random"), 1),
            1,
            KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
            datetime.now(),
        ),
        False,
        id="mismatched-headers",
Example #25
def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", []),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}], None),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    def write_step() -> ProcessedMessageBatchWriter:
        return ProcessedMessageBatchWriter(
            insert_batch_writer=InsertBatchWriter(
                writer, MetricsWrapper(metrics, "insertions")
            ),
            replacement_batch_writer=ReplacementBatchWriter(
                replacements_producer, Topic("replacements")
            ),
        )

    factory = KafkaConsumerStrategyFactory(
        None,
        functools.partial(process_message, processor),
        write_step,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for c in metrics.calls:
            if isinstance(c, Timing) and c.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()
Example #26
    process_message_multistorage,
    process_message_multistorage_identical_storages,
)
from snuba.datasets.storages import StorageKey
from tests.fixtures import get_raw_event, get_raw_transaction

test_data = [
    pytest.param(
        Message(
            Partition(Topic("errors"), 1),
            1,
            MultistorageKafkaPayload(
                [StorageKey.ERRORS, StorageKey.ERRORS_V2],
                KafkaPayload(
                    None,
                    json.dumps((2, "insert", get_raw_event())).encode("utf-8"),
                    [],
                ),
            ),
            datetime.now(),
        ),
        True,
        id="both errors storage",
    ),
    pytest.param(
        Message(
            Partition(Topic("errors"), 1),
            1,
            MultistorageKafkaPayload(
                [StorageKey.TRANSACTIONS, StorageKey.TRANSACTIONS_V2],
                KafkaPayload(
Example #27
def get_payloads() -> Iterator[KafkaPayload]:
    for i in itertools.count():
        yield KafkaPayload(None, f"{i}".encode("utf8"), [])
Example #28
    def test_reprocessing_flow_insert(self) -> None:
        # We have a group that contains two events, 1 and 2.
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
        write_unprocessed_events(self.storage, [self.event])
        self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            41,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.TOMBSTONE_EVENTS,
                    {
                        "project_id": project_id,
                        "event_ids": [event_id]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # The user chooses to reprocess a subset of the group and throw away
        # the other events. Event 1 gets manually tombstoned by Sentry while
        # Event 2 prevails.
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # At this point the count doesn't make any sense but we don't care.
        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        # The reprocessed event is inserted with a guaranteed-new group ID but
        # the *same* event ID (this is why we need to skip tombstoning this
        # event ID)
        self.event["group_id"] = 2
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.EXCLUDE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # Group 1 is excluded from queries. At this point we have almost a
        # regular group deletion, except only a subset of events have been
        # tombstoned (the ones that will *not* be reprocessed).
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # Group 2 should contain the one event that the user chose to
        # reprocess, and Group 1 should be gone. (Note: In the product Group 2
        # looks identical to Group 1, including short ID).
        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
        assert self._get_group_id(project_id, event_id2) == 2
        assert not self._get_group_id(project_id, event_id)
Example #29
def process_messages(outer_message: Message[MessageBatch]) -> MessageBatch:
    """
    We have an outer_message Message() whose payload is a batch of Message() objects.

        Message(
            partition=...,
            offset=...
            timestamp=...
            payload=[Message(...), Message(...), etc]
        )

    The inner messages' payloads are KafkaPayloads that have:
        * key
        * headers
        * value

    The value of the message is what we need to parse and then translate
    using the indexer.
    """
    indexer = get_indexer()
    metrics = get_metrics()

    org_strings = defaultdict(set)
    strings = set()
    with metrics.timer("process_messages.parse_outer_message"):
        parsed_payloads_by_offset = {
            msg.offset: json.loads(msg.payload.value.decode("utf-8"),
                                   use_rapid_json=True)
            for msg in outer_message.payload
        }
        for message in parsed_payloads_by_offset.values():
            metric_name = message["name"]
            org_id = message["org_id"]
            tags = message.get("tags", {})

            parsed_strings = {
                metric_name,
                *tags.keys(),
                *tags.values(),
            }
            org_strings[org_id].update(parsed_strings)
            strings.update(parsed_strings)

    metrics.incr("process_messages.total_strings_indexer_lookup",
                 amount=len(strings))

    with metrics.timer("metrics_consumer.bulk_record"):
        mapping = indexer.bulk_record(org_strings)

    new_messages: List[Message[KafkaPayload]] = []

    with metrics.timer("process_messages.reconstruct_messages"):
        for message in outer_message.payload:
            parsed_payload_value = parsed_payloads_by_offset[message.offset]
            new_payload_value = deepcopy(parsed_payload_value)

            metric_name = parsed_payload_value["name"]
            tags = parsed_payload_value.get("tags", {})

            try:
                new_tags: Mapping[int, int] = {
                    mapping[k]: mapping[v]
                    for k, v in tags.items()
                }
            except KeyError:
                logger.error("process_messages.key_error",
                             extra={"tags": tags},
                             exc_info=True)
                continue

            new_payload_value["tags"] = new_tags
            new_payload_value["metric_id"] = mapping[metric_name]
            new_payload_value["retention_days"] = 90

            del new_payload_value["name"]

            new_payload = KafkaPayload(
                key=message.payload.key,
                value=json.dumps(new_payload_value).encode(),
                headers=message.payload.headers,
            )
            new_message = Message(
                partition=message.partition,
                offset=message.offset,
                payload=new_payload,
                timestamp=message.timestamp,
            )
            new_messages.append(new_message)

    metrics.incr("metrics_consumer.process_message.messages_seen",
                 amount=len(new_messages))

    return new_messages
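To make the reconstruction loop concrete, here is what the translation does to one parsed payload. The metric name, tags, and indexer mapping values below are made up for illustration; only the shape of the transformation (tags and metric name replaced by integer ids, retention_days added, name dropped) comes from the code above.
# Hypothetical parsed inner payload and indexer mapping (illustrative values).
parsed_payload_value = {
    "org_id": 1,
    "name": "example.metric",
    "tags": {"environment": "production"},
    "value": 10,
}
mapping = {"example.metric": 100, "environment": 200, "production": 300}

new_payload_value = dict(parsed_payload_value)
new_payload_value["tags"] = {mapping["environment"]: mapping["production"]}  # {200: 300}
new_payload_value["metric_id"] = mapping["example.metric"]  # 100
new_payload_value["retention_days"] = 90
del new_payload_value["name"]
# new_payload_value is then json-encoded into the value of the new KafkaPayload.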
Example #30
def invalid_message() -> Message[KafkaPayload]:
    invalid_payload = KafkaPayload(None, b"", [])
    return Message(Partition(Topic(""), 0), 0, invalid_payload, datetime.now())