Example #1
    def test_offsets(self):
        event = self.event

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 456),
            123,
            KafkaPayload(None,
                         json.dumps((2, "insert", event)).encode("utf-8"),
                         []),  # event doesn't really matter
            datetime.now(),
        )

        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        clickhouse = (get_storage(
            StorageKey.EVENTS).get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY))

        assert clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" %
            self.table) == [(self.event["project_id"], self.event["event_id"],
                             123, 456)]
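Every example in this collection builds a Message[KafkaPayload] the same way: a Partition (a Topic plus a partition index), an integer offset, a KafkaPayload (an optional key, the encoded value, and a list of headers), and a timestamp. The minimal sketch below isolates that pattern; the import paths are taken from Example #20 and are an assumption that may not match every Snuba version.

import json
from datetime import datetime

# Import paths assumed from Example #20; they may differ between Snuba versions.
from snuba.utils.streams import Message, Partition, Topic
from snuba.utils.streams.backends.kafka import KafkaPayload

# A Message records where it came from (Partition), its position (offset),
# the Kafka payload itself, and the broker timestamp.
message: Message[KafkaPayload] = Message(
    Partition(Topic("events"), 0),  # topic name and partition index
    123,  # offset within the partition
    KafkaPayload(
        None,  # key (None throughout these examples)
        json.dumps((2, "insert", {})).encode("utf-8"),  # (version, action, event) as in Example #1
        [],  # headers
    ),
    datetime.now(),  # timestamp
)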
Example #2
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event["datetime"] = old_timestamp_str
        event["data"]["datetime"] = old_timestamp_str
        event["data"]["received"] = int(
            calendar.timegm(old_timestamp.timetuple()))

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 1),
            42,
            KafkaPayload(None,
                         json.dumps((2, "insert", event)).encode("utf-8"), []),
            datetime.now(),
        )

        assert test_worker.process_message(message) is None
Example #3
def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        ))

    # Messages without a table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        ))

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        ))
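The three assertions above fully pin down the filter's rule: keep a message only when a "table" header is present and matches the configured table name. The few lines below are a sketch of that rule as inferred from the test, not Snuba's actual CdcTableNameMessageFilter implementation.

from typing import Optional, Sequence, Tuple

def should_drop_sketch(headers: Sequence[Tuple[str, bytes]], table_name: str) -> bool:
    # Drop the message unless a "table" header is present and names the
    # configured table (mirrors the behaviour asserted in Example #3).
    table_header: Optional[bytes] = next(
        (value for key, value in headers if key == "table"), None
    )
    return table_header != table_name.encode("utf-8")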
Example #4
def test_multistorage_strategy() -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages

    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        1,
        int(32 * 1e6),
        int(64 * 1e6),
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table",
              groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table",
              groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(Partition(Topic("topic"), 0), offset, payload, datetime.now(),
                offset + 1) for offset, payload in enumerate(payloads)
    ]

    with assert_changes(lambda: get_row_count(groupassignees.storage), 0,
                        1), assert_changes(
                            lambda: get_row_count(groupedmessages.storage), 0,
                            1):

        for message in messages:
            strategy.submit(message)

        with assert_changes(lambda: commit.call_args_list, [],
                            [call({Partition(Topic("topic"), 0): 3})]):
            strategy.close()
            strategy.join()
Example #5
    def eventstream(*, dataset: Dataset):
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumer import ConsumerWorker

            worker = ConsumerWorker(storage, metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #6
    def _wrap(self, msg: str) -> Message[KafkaPayload]:
        return Message(
            Partition(Topic("replacements"), 0),
            0,
            KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
            datetime.now(),
        )
Example #7
    def test_flattened_tags(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        # | and = are intentional to test the escaping logic when computing the
        # flattened_tags on tag deletions
        self.event["data"]["tags"] = []
        self.event["data"]["tags"].append(["browser|name", "foo=1"])
        self.event["data"]["tags"].append(["browser|to_delete", "foo=2"])
        self.event["data"]["tags"].append(["notbrowser", "foo\\3"])
        self.event["data"]["tags"].append(["notbrowser2", "foo4"])
        self.write_events([self.event])

        project_id = self.project_id

        def _fetch_flattened_tags():
            return json.loads(
                self.app.post(
                    "/query",
                    data=json.dumps({
                        "project": [project_id],
                        "selected_columns": [
                            "_tags_flattened",
                            "tags.key",
                            "tags.value",
                        ],
                    }),
                ).data)["data"]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser|to_delete",
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _fetch_flattened_tags() == [{
            "tags.key": ["browser|name", "notbrowser", "notbrowser2"],
            "tags.value": ["foo=1", "foo\\3", "foo4"],
            "_tags_flattened":
            "|browser\\|name=foo\\=1||notbrowser=foo\\\\3||notbrowser2=foo4|",
        }]
Example #8
    def __make_msg(self, partition: int, offset: int, payload: str,
                   headers: Headers) -> Message[KafkaPayload]:
        return Message(
            partition=Partition(Topic("topic"), partition),
            offset=offset,
            payload=KafkaPayload(b"key", payload.encode(), headers),
            timestamp=datetime(2019, 6, 19, 6, 46, 28),
        )
Example #9
    def test_delete_tag_promoted_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["data"]["tags"].append(["browser.name", "foo"])
        self.event["data"]["tags"].append(["notbrowser", "foo"])
        self.write_unprocessed_events([self.event])

        project_id = self.project_id

        def _issue_count(total=False):
            return json.loads(
                self.app.post(
                    "/query",
                    data=json.dumps(
                        {
                            "project": [project_id],
                            "aggregations": [["count()", "", "count"]],
                            "conditions": [["tags[browser.name]", "=", "foo"]]
                            if not total
                            else [],
                            "groupby": ["group_id"],
                        }
                    ),
                ).data
            )["data"]

        assert _issue_count() == [{"count": 1, "group_id": 1}]
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps(
                    (
                        2,
                        "end_delete_tag",
                        {
                            "project_id": project_id,
                            "tag": "browser.name",
                            "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                        },
                    )
                ).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _issue_count() == []
        assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
Example #10
def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    "Create a fake Kafka message for each JSON event in the file."
    messages: MutableSequence[Message[KafkaPayload]] = []
    raw_events = open(events_file).readlines()
    for raw_event in raw_events:
        messages.append(
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8"), []),
                datetime.now(),
            ), )
    return messages
Example #11
    def eventstream(*, dataset: Dataset) -> RespTuple:
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumers.consumer import StreamingConsumerStrategyFactory

            table_writer = storage.get_table_writer()
            stream_loader = table_writer.get_stream_loader()
            strategy = StreamingConsumerStrategyFactory(
                stream_loader.get_pre_filter(),
                stream_loader.get_processor(),
                table_writer.get_batch_writer(metrics),
                metrics,
                max_batch_size=1,
                max_batch_time=1.0,
                processes=None,
                input_block_size=None,
                output_block_size=None,
            ).create(lambda offsets: None)
            strategy.submit(message)
            strategy.close()
            strategy.join()
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)
            processed = worker.process_message(message)
            if processed is not None:
                batch = [processed]
                worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #12
    def test_unmerge_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["primary_hash"] = "a" * 32
        self.write_events([self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
Example #13
    def __build_batch_writer(
        self, storage: WritableTableStorage
    ) -> ProcessedMessageBatchWriter:
        replacement_batch_writer: Optional[ReplacementBatchWriter]
        replacement_topic_spec = (
            storage.get_table_writer().get_stream_loader().get_replacement_topic_spec()
        )
        if replacement_topic_spec is not None:
            # XXX: The producer is flushed when closed on strategy teardown
            # after an assignment is revoked, but never explicitly closed.
            # XXX: This assumes that the Kafka cluster used for the input topic
            # to the storage is the same as the replacement topic.
            replacement_batch_writer = ReplacementBatchWriter(
                ConfluentKafkaProducer(
                    build_kafka_producer_configuration(
                        storage.get_storage_key(),
                        override_params={
                            "partitioner": "consistent",
                            "message.max.bytes": 50000000,  # 50MB, default is 1MB
                        },
                    )
                ),
                Topic(replacement_topic_spec.topic_name),
            )
        else:
            replacement_batch_writer = None

        return ProcessedMessageBatchWriter(
            InsertBatchWriter(
                storage.get_table_writer().get_batch_writer(
                    self.__metrics,
                    {"load_balancing": "in_order", "insert_distributed_sync": 1},
                ),
                MetricsWrapper(
                    self.__metrics,
                    "insertions",
                    {"storage": storage.get_storage_key().value},
                ),
            ),
            replacement_batch_writer,
        )
Example #14
    def test_delete_groups_insert(self):
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 1
        }]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "end_delete_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime":
                        timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == []
Example #15
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=producer,
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )

        test_worker.flush_batch([
            ReplacementBatch("1", [{
                "project_id": 1
            }]),
            ReplacementBatch("2", [{
                "project_id": 2
            }]),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == [
            ("event-replacements", b"1", b'{"project_id":1}'),
            ("event-replacements", b"2", b'{"project_id":2}'),
        ]
Example #16
    def test_reprocessing_flow_insert(self) -> None:
        # We have a group that contains two events, 1 and 2.
        self.event["project_id"] = self.project_id
        self.event["group_id"] = 1
        self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
        write_unprocessed_events(self.storage, [self.event])
        self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
        write_unprocessed_events(self.storage, [self.event])

        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        project_id = self.project_id

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "tombstone_events",
                    {
                        "project_id": project_id,
                        "event_ids": [event_id]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # The user chooses to reprocess a subset of the group and throw away
        # the other events. Event 1 gets manually tombstoned by Sentry while
        # Event 2 prevails.
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # At this point the count doesn't make any sense but we don't care.
        assert self._issue_count(self.project_id) == [{
            "count": 2,
            "group_id": 1
        }]

        # The reprocessed event is inserted with a guaranteed-new group ID but
        # the *same* event ID (this is why we need to skip tombstoning this
        # event ID)
        self.event["group_id"] = 2
        write_unprocessed_events(self.storage, [self.event])

        message: Message[KafkaPayload] = Message(
            Partition(Topic("replacements"), 1),
            42,
            KafkaPayload(
                None,
                json.dumps((
                    2,
                    "exclude_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1]
                    },
                )).encode("utf-8"),
                [],
            ),
            datetime.now(),
        )

        # Group 1 is excluded from queries. At this point we have almost a
        # regular group deletion, except only a subset of events have been
        # tombstoned (the ones that will *not* be reprocessed).
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        # Group 2 should contain the one event that the user chose to
        # reprocess, and Group 1 should be gone. (Note: In the product Group 2
        # looks identical to Group 1, including short ID).
        assert self._issue_count(self.project_id) == [{
            "count": 1,
            "group_id": 2
        }]
        assert self._get_group_id(project_id, event_id2) == 2
        assert not self._get_group_id(project_id, event_id)
Example #17
    def __init__(
        self,
        storage_key: StorageKey,
        raw_topic: Optional[str],
        replacements_topic: Optional[str],
        max_batch_size: int,
        max_batch_time_ms: int,
        bootstrap_servers: Sequence[str],
        group_id: str,
        commit_log_topic: Optional[str],
        auto_offset_reset: str,
        queued_max_messages_kbytes: int,
        queued_min_messages: int,
        processes: Optional[int],
        input_block_size: Optional[int],
        output_block_size: Optional[int],
        commit_retry_policy: Optional[RetryPolicy] = None,
        profile_path: Optional[str] = None,
    ) -> None:
        self.storage = get_writable_storage(storage_key)
        self.bootstrap_servers = bootstrap_servers
        self.broker_config = get_default_kafka_configuration(
            storage_key, bootstrap_servers=bootstrap_servers
        )
        self.producer_broker_config = build_kafka_producer_configuration(
            storage_key,
            bootstrap_servers=bootstrap_servers,
            override_params={
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            },
        )

        stream_loader = self.storage.get_table_writer().get_stream_loader()

        self.raw_topic: Topic
        if raw_topic is not None:
            self.raw_topic = Topic(raw_topic)
        else:
            self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

        self.replacements_topic: Optional[Topic]
        if replacements_topic is not None:
            self.replacements_topic = Topic(replacements_topic)
        else:
            replacement_topic_spec = stream_loader.get_replacement_topic_spec()
            if replacement_topic_spec is not None:
                self.replacements_topic = Topic(replacement_topic_spec.topic_name)
            else:
                self.replacements_topic = None

        self.commit_log_topic: Optional[Topic]
        if commit_log_topic is not None:
            self.commit_log_topic = Topic(commit_log_topic)
        else:
            commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
            if commit_log_topic_spec is not None:
                self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
            else:
                self.commit_log_topic = None

        # XXX: This can result in a producer being built in cases where it's
        # not actually required.
        self.producer = Producer(self.producer_broker_config)

        self.metrics = MetricsWrapper(
            environment.metrics,
            "consumer",
            tags={"group": group_id, "storage": storage_key.value},
        )

        self.max_batch_size = max_batch_size
        self.max_batch_time_ms = max_batch_time_ms
        self.group_id = group_id
        self.auto_offset_reset = auto_offset_reset
        self.queued_max_messages_kbytes = queued_max_messages_kbytes
        self.queued_min_messages = queued_min_messages
        self.processes = processes
        self.input_block_size = input_block_size
        self.output_block_size = output_block_size
        self.__profile_path = profile_path

        if commit_retry_policy is None:
            commit_retry_policy = BasicRetryPolicy(
                3,
                constant_delay(1),
                lambda e: isinstance(e, KafkaException)
                and e.args[0].code()
                in (
                    KafkaError.REQUEST_TIMED_OUT,
                    KafkaError.NOT_COORDINATOR,
                    KafkaError._WAIT_COORD,
                ),
            )

        self.__commit_retry_policy = commit_retry_policy
Example #18
def test_subscription_worker(broker: Broker[SubscriptionTaskResult], ) -> None:
    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=60),
            resolution=frequency,
        ),
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(store, PartitionId(0), timedelta(),
                                     metrics)
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now))

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        assert (request.body.items() > {
            "from_date":
            (timestamp - subscription.data.time_window).isoformat(),
            "to_date":
            timestamp.isoformat(),
        }.items())

        assert result == {
            "meta": [{
                "name": "count",
                "type": "UInt64"
            }],
            "data": [{
                "count": 0
            }],
        }
Example #19
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None
        storage_key = storage.get_storage_key().value
        bootstrap_servers = settings.DEFAULT_STORAGE_BROKERS.get(
            storage_key, settings.DEFAULT_BROKERS)

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ), ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ), ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(loader.get_commit_log_topic_spec().topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(timedelta(seconds=delay_seconds *
                              -1) if delay_seconds is not None else None),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer({
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        }),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in
                        range(partitions if partitions is not None else loader.
                              get_default_topic_spec().partitions_number)
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #20
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory, )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ), ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
Example #21
def test_streaming_consumer_strategy() -> None:
    messages = (Message(
        Partition(Topic("events"), 0),
        i,
        KafkaPayload(None, b"{}", None),
        datetime.now(),
    ) for i in itertools.count())

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call,
                          Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(get_number_of_insertion_metrics, 0,
                        expected_write_count), assert_changes(
                            lambda: writer.write.call_count, 0,
                            expected_write_count), assert_changes(
                                lambda: len(replacements_producer.messages), 0,
                                1):
        strategy.close()
        strategy.join()
Example #22
def test_tick_consumer_non_monotonic(clock: Clock,
                                     broker: Broker[int]) -> None:
    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer)

    def assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assignment_callback.called = True
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback.called = False

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(partition, 0)

    clock.sleep(1)

    producer.produce(partition, 1)

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert inner_consumer.tell() == {partition: 1}
    assert consumer.tell() == {partition: 0}

    with assert_changes(inner_consumer.tell, {partition: 1},
                        {partition: 2}), assert_changes(
                            consumer.tell, {partition: 0}, {partition: 1}):
        assert consumer.poll() == Message(
            partition,
            0,
            Tick(
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(partition, 2)

    with assert_changes(inner_consumer.tell, {partition: 2},
                        {partition: 3}), assert_does_not_change(
                            consumer.tell, {partition: 1}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(partition, 3)

    with assert_changes(inner_consumer.tell, {partition: 3},
                        {partition: 4}), assert_changes(
                            consumer.tell, {partition: 1}, {partition: 3}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                offsets=Interval(1, 3),
                timestamps=Interval(epoch + timedelta(seconds=1),
                                    epoch + timedelta(seconds=2)),
            ),
            epoch + timedelta(seconds=2),
        )
Example #23
def test_tick_consumer(clock: Clock, broker: Broker[int],
                       time_shift: Optional[timedelta]) -> None:
    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")

    broker.create_topic(topic, partitions=2)

    producer = broker.get_producer()
    for partition, payloads in enumerate([[0, 1, 2], [0]]):
        for payload in payloads:
            producer.produce(Partition(topic, partition), payload).result()

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer, time_shift=time_shift)

    if time_shift is None:
        time_shift = timedelta()

    def assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assignment_callback.called = True

        assert consumer.tell() == {
            Partition(topic, 0): 0,
            Partition(topic, 1): 0,
        }

        assert inner_consumer.tell() == {
            Partition(topic, 0): 0,
            Partition(topic, 1): 0,
        }

    assignment_callback.called = False

    consumer.subscribe([topic], on_assign=assignment_callback)

    with assert_changes(lambda: assignment_callback.called, False, True):
        # consume 0, 0
        assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 0,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }

    # consume 0, 1
    assert consumer.poll() == Message(
        Partition(topic, 0),
        0,
        Tick(offsets=Interval(0, 1),
             timestamps=Interval(epoch, epoch)).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        1,
        Tick(offsets=Interval(1, 2),
             timestamps=Interval(epoch, epoch)).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 0,
    }

    # consume 1, 0
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    # consume no message
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    consumer.seek({Partition(topic, 0): 1})

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 1,
    }

    # consume 0, 1
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 1,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        1,
        Tick(offsets=Interval(1, 2),
             timestamps=Interval(epoch, epoch)).time_shift(time_shift),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    with pytest.raises(ConsumerError):
        consumer.seek({Partition(topic, -1): 0})