def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        ))

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        ))

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        ))
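For context, a header-based filter like the one exercised above can be sketched in a few lines. This is a hypothetical implementation, not the actual snuba CdcTableNameMessageFilter: it only assumes that should_drop compares the message's "table" Kafka header against the configured table name, and that the arroyo import paths match the version used by these tests.

from arroyo.backends.kafka import KafkaPayload
from arroyo.types import Message


class TableNameFilterSketch:
    """Hypothetical sketch: drop messages whose 'table' header is missing or different."""

    def __init__(self, table_name: str) -> None:
        self.__table_name = table_name.encode("utf8")

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        # Kafka headers arrive as a list of (key, bytes) tuples.
        headers = dict(message.payload.headers)
        return headers.get("table") != self.__table_name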
Example #2
def test_multistorage_strategy(
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
) -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages
    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        processes,
        input_block_size,
        output_block_size,
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):

        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list, [], [call({Partition(Topic("topic"), 0): 3})]
        ):
            strategy.close()
            strategy.join()
Example #3
def test_commit_log_consumer() -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    configuration = get_default_kafka_configuration()

    consumer: KafkaConsumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    producer = KafkaProducer(configuration)

    topic = Topic("topic")

    with closing(consumer) as consumer:
        with closing(producer) as producer:
            producer.produce(topic, next(get_payloads())).result(5.0)

        consumer.subscribe([topic])

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        now = datetime.now()

        position = Position(message.next_offset, now)

        consumer.stage_positions({message.partition: position})

        assert consumer.commit_positions() == {Partition(topic, 0): position}

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert commit_codec.decode(
            KafkaPayload(
                commit_message.key(),
                commit_message.value(),
                commit_message.headers(),
            )) == Commit("test", Partition(topic, 0), message.next_offset, now)
Example #4
def test_subscription_worker_consistent(
        subscription_data: SubscriptionData) -> None:
    state.set_config("event_subscription_non_consistent_sample_rate", 1)
    broker: Broker[SubscriptionTaskResult] = Broker(MemoryMessageStorage(),
                                                    TestingClock())

    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 1

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = TestingMetricsBackend()

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(
                store, PartitionId(0), timedelta(), DummyMetricsBackend(strict=True)
            )
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    worker.process_message(Message(Partition(Topic("events"), 0), 0, tick,
                                   now))

    time.sleep(0.1)

    assert (len([
        m for m in metrics.calls
        if isinstance(m, Increment) and m.name == "consistent"
    ]) == 1)
Example #5
    def test_multiple_partitions(self) -> None:
        """
        Different partitions should have independent offset checks.
        """
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        payload = KafkaPayload(
            None,
            json.dumps((
                2,
                ReplacementType.END_UNMERGE,
                {
                    "project_id":
                    self.project_id,
                    "previous_group_id":
                    1,
                    "new_group_id":
                    2,
                    "hashes": ["a" * 32],
                    "datetime":
                    datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                },
            )).encode("utf-8"),
            [],
        )
        offset = 42
        timestamp = datetime.now()

        partition_one: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            offset,
            payload,
            timestamp,
        )
        partition_two: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 2),
            offset,
            payload,
            timestamp,
        )

        processed = self.replacer.process_message(partition_one)
        self.replacer.flush_batch([processed])
        # different partition should be unaffected even if it's the same offset
        assert self.replacer.process_message(partition_two) is not None
Example #6
    def test_delete_tag_promoted_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["data"]["tags"].append(["browser.name", "foo"])
        self.event["data"]["tags"].append(["notbrowser", "foo"])
        write_unprocessed_events(self.storage, [self.event])

        project_id = self.project_id

        def _issue_count(total: bool = False) -> Sequence[Mapping[str, Any]]:
            clickhouse = self.storage.get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY)

            total_cond = (
                "AND has(_tags_hash_map, cityHash64('browser.name=foo'))"
                if not total else "")

            data = clickhouse.execute(f"""
                SELECT group_id, count()
                FROM errors_local
                FINAL
                WHERE deleted = 0
                AND project_id = {project_id}
                {total_cond}
                GROUP BY group_id
                """).results

            return [{"group_id": row[0], "count": row[1]} for row in data]

        assert _issue_count() == [{"count": 1, "group_id": 1}]
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_DELETE_TAG,
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

        assert _issue_count() == []
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
Example #7
def test_encoding_producer() -> None:
    broker: Broker[str] = Broker(MemoryMessageStorage(), TestingClock())

    topic = Topic("test")
    broker.create_topic(topic, 1)

    class ReverseEncoder(Encoder[str, str]):
        def encode(self, value: str) -> str:
            return "".join(value[::-1])

    producer = ProducerEncodingWrapper(broker.get_producer(), ReverseEncoder())
    decoded_message = producer.produce(topic, "hello").result()
    assert decoded_message.payload == "hello"

    consumer = broker.get_consumer("group")
    consumer.subscribe([topic])

    encoded_message = consumer.poll()
    assert encoded_message is not None

    # The payload returned by the consumer should not be decoded.
    assert encoded_message.payload == "olleh"

    # All other attributes should be the same.
    for attribute in set(Message.__slots__) - {"payload"}:
        assert getattr(encoded_message,
                       attribute) == getattr(decoded_message, attribute)
Example #8
def test_tick_consumer_min_interval() -> None:
    clock = TestingClock()
    broker: Broker[int] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")

    broker.create_topic(topic, partitions=2)

    producer = broker.get_producer()
    for payload in range(3):
        producer.produce(Partition(topic, 0), payload).result()
        clock.sleep(1.0)

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer, min_interval=timedelta(seconds=2))

    consumer.subscribe([topic])

    assert consumer.poll() is None
    assert consumer.poll() is None
    message = consumer.poll()
    assert message is not None
    tick = message.payload
    assert tick.offsets.upper - tick.offsets.lower == 2
    assert tick.timestamps.upper - tick.timestamps.lower == timedelta(seconds=2)
Example #9
def test_execute_and_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        None,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0),
                                                     uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # Eventually a message should be produced and offsets committed
    while (broker_storage.consume(Partition(result_topic, 0), 0) is None
           or commit.call_count == 0):
        strategy.poll()

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription_identifier).encode(
        "utf-8")
    assert commit.call_count == 1
Example #10
    def _wrap(self, msg: Tuple[Any, ...]) -> Message[KafkaPayload]:
        return Message(
            Partition(Topic("replacements"), 0),
            0,
            KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
            datetime.now(),
        )
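A brief, hypothetical usage note for the helper above. The payload shape mirrors the END_UNMERGE replacement messages used elsewhere on this page; the method name and assertions are illustrative only:

    def test_wrap_usage_sketch(self) -> None:
        # Hypothetical test method living on the same test class as _wrap.
        message = self._wrap((2, ReplacementType.END_UNMERGE, {}))
        assert message.partition == Partition(Topic("replacements"), 0)
        assert message.payload.key is None and message.payload.headers == []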
Example #11
    def test_offset_already_processed(self) -> None:
        """
        Don't process an offset that already exists in Redis.
        """
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        key = f"replacement:{CONSUMER_GROUP}:errors:1"
        redis_client.set(key, 42)

        old_offset: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            41,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        same_offset: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {},
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        assert self.replacer.process_message(old_offset) is None
        assert self.replacer.process_message(same_offset) is None
Example #12
    def write_step() -> ProcessedMessageBatchWriter:
        return ProcessedMessageBatchWriter(
            insert_batch_writer=InsertBatchWriter(
                writer, MetricsWrapper(metrics, "insertions")
            ),
            replacement_batch_writer=ReplacementBatchWriter(
                replacements_producer, Topic("replacements")
            ),
        )
Example #13
def test_skip_stale_message() -> None:
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()

    commit = mock.Mock()

    stale_threshold_seconds = 60

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        stale_threshold_seconds,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0),
                                                     uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # No message will be produced
    strategy.poll()
    assert broker_storage.consume(Partition(result_topic, 0), 0) is None
    assert Increment("skipped_execution", 1,
                     {"entity": "events"}) in metrics.calls
Example #14
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % version)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from arroyo.processing.strategies.streaming import (
                KafkaConsumerStrategyFactory,
            )

            from snuba.consumers.consumer import build_batch_writer, process_message

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = KafkaConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                functools.partial(
                    process_message, stream_loader.get_processor(), "consumer_grouup"
                ),
                build_batch_writer(table_writer, metrics=metrics),
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, "consumer_group", metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #15
def test_kafka_filter_header_with_bypass() -> None:
    header_filter = KafkaHeaderFilterWithBypass("should_drop", "1", 5)
    message = Message(
        Partition(Topic("random"), 1),
        1,
        KafkaPayload(b"key", b"value", [("should_drop", b"1")]),
        datetime.now(),
    )

    for _ in range(3):
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is True
        assert header_filter.should_drop(message) is False
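The test above exercises a drop-with-bypass pattern: keep dropping matching messages, but let one through once a configured number of consecutive drops is reached, so consumer offsets can still make progress. A minimal sketch of that pattern, hypothetical rather than the real KafkaHeaderFilterWithBypass, reusing the arroyo Message/KafkaPayload types imported for these examples:

class HeaderFilterWithBypassSketch:
    """Hypothetical: drop messages carrying header == value, except every Nth one."""

    def __init__(self, header_key: str, header_value: str, consecutive_drops_limit: int) -> None:
        self.__key = header_key
        self.__value = header_value.encode("utf-8")
        self.__limit = consecutive_drops_limit
        self.__consecutive_drops = 0

    def should_drop(self, message: Message[KafkaPayload]) -> bool:
        if dict(message.payload.headers).get(self.__key) != self.__value:
            self.__consecutive_drops = 0
            return False
        self.__consecutive_drops += 1
        if self.__consecutive_drops < self.__limit:
            return True
        # Bypass: let this message through and start counting again.
        self.__consecutive_drops = 0
        return False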
Example #16
    def test_skip_kafka_message(self) -> None:
        state.set_config("kafka_messages_to_skip",
                         "[snuba-test-lol:1:2,snuba-test-yeet:0:1]")
        assert skip_kafka_message(
            Message(
                Partition(Topic("snuba-test-lol"), 1),
                2,
                KafkaPayload(None, b"", []),
                datetime.now(),
            ))
        assert skip_kafka_message(
            Message(
                Partition(Topic("snuba-test-yeet"), 0),
                1,
                KafkaPayload(None, b"", []),
                datetime.now(),
            ))
        assert not skip_kafka_message(
            Message(
                Partition(Topic("snuba-test-lol"), 2),
                1,
                KafkaPayload(None, b"", []),
                datetime.now(),
            ))
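For reference, the kafka_messages_to_skip setting used above is a bracketed, comma-separated list of topic:partition:offset entries. A hypothetical parser matching what the assertions imply (the real snuba helper may differ):

from typing import Set, Tuple


def parse_messages_to_skip(raw: str) -> Set[Tuple[str, int, int]]:
    # "[snuba-test-lol:1:2,snuba-test-yeet:0:1]"
    #   -> {("snuba-test-lol", 1, 2), ("snuba-test-yeet", 0, 1)}
    inner = raw.strip().strip("[]")
    entries = inner.split(",") if inner else []
    result: Set[Tuple[str, int, int]] = set()
    for entry in entries:
        topic, partition, offset = entry.strip().split(":")
        result.add((topic, int(partition), int(offset)))
    return result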
Example #17
    def test_unmerge_hierarchical_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "b" * 32
        self.event["data"]["hierarchical_hashes"] = ["a" * 32]
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE_HIERARCHICAL,
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hierarchical_hash": "a" * 32,
                        "primary_hash": "b" * 32,
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        assert processed is not None
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example #18
    def test_unmerge_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example #19
    def __build_batch_writer(
            self,
            storage: WritableTableStorage) -> ProcessedMessageBatchWriter:
        replacement_batch_writer: Optional[ReplacementBatchWriter]
        stream_loader = storage.get_table_writer().get_stream_loader()
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        default_topic_spec = stream_loader.get_default_topic_spec()
        if replacement_topic_spec is not None:
            # XXX: The producer is flushed when closed on strategy teardown
            # after an assignment is revoked, but never explicitly closed.
            # XXX: This assumes that the Kafka cluster used for the input topic
            # to the storage is the same as the replacement topic.
            replacement_batch_writer = ReplacementBatchWriter(
                ConfluentKafkaProducer(
                    build_kafka_producer_configuration(
                        default_topic_spec.topic,
                        override_params={
                            "partitioner": "consistent",
                            "message.max.bytes":
                            50000000,  # 50MB, default is 1MB
                        },
                    )),
                Topic(replacement_topic_spec.topic_name),
            )
        else:
            replacement_batch_writer = None

        return ProcessedMessageBatchWriter(
            InsertBatchWriter(
                storage.get_table_writer().get_batch_writer(
                    self.__metrics,
                    {
                        "load_balancing": "in_order",
                        "insert_distributed_sync": 1
                    },
                ),
                MetricsWrapper(
                    self.__metrics,
                    "insertions",
                    {"storage": storage.get_storage_key().value},
                ),
            ),
            replacement_batch_writer,
        )
Example #20
    def test_delete_groups_insert(self) -> None:
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.utcnow()

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_DELETE_GROUPS,
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == []

        # Count is still zero after Redis flushed and parts merged
        self._clear_redis_and_force_merge()
        assert self._issue_count(self.project_id) == []
Example #21
def generate_message(
    entity_key: EntityKey,
    subscription_identifier: Optional[SubscriptionIdentifier] = None,
) -> Iterator[Message[KafkaPayload]]:
    codec = SubscriptionScheduledTaskEncoder()
    epoch = datetime(1970, 1, 1)
    i = 0

    if subscription_identifier is None:
        subscription_identifier = SubscriptionIdentifier(
            PartitionId(1), uuid.uuid1())

    data_dict = {}
    if entity_key in (EntityKey.METRICS_SETS, EntityKey.METRICS_COUNTERS):
        data_dict = {"organization": 1}

    entity_subscription = ENTITY_KEY_TO_SUBSCRIPTION_MAPPER[entity_key](
        data_dict=data_dict)

    while True:
        payload = codec.encode(
            ScheduledSubscriptionTask(
                epoch + timedelta(minutes=i),
                SubscriptionWithMetadata(
                    entity_key,
                    Subscription(
                        subscription_identifier,
                        SubscriptionData(
                            project_id=1,
                            time_window_sec=60,
                            resolution_sec=60,
                            query=f"MATCH ({entity_key.value}) SELECT count()",
                            entity_subscription=entity_subscription,
                        ),
                    ),
                    i + 1,
                ),
            ))

        yield Message(Partition(Topic("test"), 0), i, payload, epoch)
        i += 1
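Usage elsewhere on this page is simply to pull messages off the generator and submit them to a strategy, e.g. (excerpted from the ExecuteQuery tests above, where subscription_identifier and strategy are defined):

make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
message = next(make_message)
strategy.submit(message)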
Example #22
    def __init__(
        self,
        producer: Producer[KafkaPayload],
        result_topic: str,
        commit: Callable[[Mapping[Partition, Position]], None],
    ):
        self.__producer = producer
        self.__result_topic = Topic(result_topic)
        self.__commit = commit
        self.__commit_data: MutableMapping[Partition, Position] = {}

        # Time we last called commit
        self.__last_committed: Optional[float] = None

        self.__encoder = SubscriptionTaskResultEncoder()

        self.__queue: Deque[Tuple[Message[SubscriptionTaskResult],
                                  Future[Message[KafkaPayload]]]] = deque()

        self.__max_buffer_size = 10000
        self.__closed = False
Example #23
    def test_reset_consumer_group_offset_check(self) -> None:
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id":
                        self.project_id,
                        "previous_group_id":
                        1,
                        "new_group_id":
                        2,
                        "hashes": ["a" * 32],
                        "datetime":
                        datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        self.replacer.flush_batch([self.replacer.process_message(message)])

        set_config(replacer.RESET_CHECK_CONFIG, f"[{CONSUMER_GROUP}]")

        # Offset to check against should be reset so this message shouldn't be skipped
        assert self.replacer.process_message(message) is not None
Example #24
    def __init__(
        self,
        schedulers: Mapping[int, SubscriptionScheduler],
        producer: Producer[KafkaPayload],
        scheduled_topic_spec: KafkaTopicSpec,
        commit: Callable[[Mapping[Partition, Position]], None],
        stale_threshold_seconds: Optional[int],
        metrics: MetricsBackend,
    ) -> None:
        self.__schedulers = schedulers
        self.__encoder = SubscriptionScheduledTaskEncoder()
        self.__producer = producer
        self.__scheduled_topic = Topic(scheduled_topic_spec.topic_name)
        self.__commit = commit
        self.__stale_threshold_seconds = stale_threshold_seconds
        self.__metrics = metrics
        self.__closed = False

        # Stores each tick with its futures
        self.__queue = ScheduledSubscriptionQueue()

        # Not a hard max
        self.__max_buffer_size = 10000
Example #25
    def test_process_offset_twice(self) -> None:
        set_config("skip_seen_offsets", True)
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    ReplacementType.END_UNMERGE,
                    {
                        "project_id":
                        self.project_id,
                        "previous_group_id":
                        1,
                        "new_group_id":
                        2,
                        "hashes": ["a" * 32],
                        "datetime":
                        datetime.utcnow().strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # should be None since the offset should be in Redis, indicating it should be skipped
        assert self.replacer.process_message(message) is None
Example #26
def test_invalid_commit_log_message(caplog: Any) -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    # produce invalid payload to commit log topic (key should not be None)
    producer.produce(
        partition,
        KafkaPayload(None, b"some-value", []),
    ).result()

    clock.sleep(1)

    with caplog.at_level(logging.ERROR):
        assert consumer.poll() is None

    assert followed_consumer_group in caplog.text
Example #27
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec(
        ).topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (storage.get_table_writer().get_stream_loader(
        ).get_commit_log_topic_spec() for storage in storages.values())
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (storages[storage_keys[0]].get_table_writer().
                   get_stream_loader().get_default_topic_spec().topic)

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key].get_table_writer().get_stream_loader().
                get_default_topic_spec().topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).

        commit_log_topic_spec = (storages[storage_keys[0]].get_table_writer(
        ).get_stream_loader().get_commit_log_topic_spec())
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
Example #28
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ), ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ), ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(commit_log_topic_spec.topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(timedelta(seconds=delay_seconds *
                              -1) if delay_seconds is not None else None),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in
                        range(partitions if partitions is not None else loader.
                              get_default_topic_spec().partitions_number)
                    },
                    producer,
                    Topic(result_topic) if result_topic is not None else Topic(
                        result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #29
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(redis_client, entity_key,
                                       PartitionId(partition_index))
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic, ))

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    settings.TOPIC_PARTITION_COUNTS = {}
Example #30
def test_tick_consumer_non_monotonic() -> None:
    clock = TestingClock()
    broker: Broker[KafkaPayload] = Broker(MemoryMessageStorage(), clock)

    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    followed_consumer_group = "events"
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = CommitLogTickConsumer(inner_consumer, followed_consumer_group)

    def _assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback = mock.Mock(side_effect=_assignment_callback)

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 0, epoch)),
    ).result()

    clock.sleep(1)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 1,
                   epoch + timedelta(seconds=1))),
    ).result()

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert consumer.tell() == {partition: 1}

    with assert_changes(consumer.tell, {partition: 1}, {partition: 2}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                0,
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 2, epoch)),
    ).result()

    with assert_changes(consumer.tell, {partition: 2}, {partition: 3}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(
        partition,
        commit_codec.encode(
            Commit(followed_consumer_group, partition, 3,
                   epoch + timedelta(seconds=2))),
    ).result()

    with assert_changes(consumer.tell, {partition: 3}, {partition: 4}):
        assert consumer.poll() == Message(
            partition,
            3,
            Tick(
                0,
                offsets=Interval(1, 3),
                timestamps=Interval(epoch + timedelta(seconds=1),
                                    epoch + timedelta(seconds=2)),
            ),
            epoch + timedelta(seconds=2),
        )