def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((2, "insert", event)).encode("utf-8"), []
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    clickhouse = (
        get_storage(StorageKey.EVENTS)
        .get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
    )

    assert clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]

def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((2, "insert", event)).encode("utf-8"), []),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None

def test_table_name_filter() -> None:
    table_name = "table_name"
    message_filter = CdcTableNameMessageFilter(table_name)

    # Messages that match the table should not be dropped.
    assert not message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", table_name.encode("utf8"))]),
            datetime.now(),
        )
    )

    # Messages without a table header should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", []),
            datetime.now(),
        )
    )

    # Messages from a different table should be dropped.
    assert message_filter.should_drop(
        Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, b"", [("table", b"other_table")]),
            datetime.now(),
        )
    )

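# A hedged wiring sketch, not part of the original tests: in practice a filter
# like this is built from a CDC storage's Postgres table name and installed as
# the consumer's pre-filter (compare stream_loader.get_pre_filter() in
# eventstream below). The storage used here is illustrative.
from snuba.datasets.storages import groupassignees

cdc_filter = CdcTableNameMessageFilter(groupassignees.storage.get_postgres_table())
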
def test_multistorage_strategy() -> None:
    from snuba.datasets.storages import groupassignees, groupedmessages
    from tests.datasets.cdc.test_groupassignee import TestGroupassignee
    from tests.datasets.cdc.test_groupedmessage import TestGroupedMessage

    commit = Mock()

    storages = [groupassignees.storage, groupedmessages.storage]

    strategy = MultistorageConsumerProcessingStrategyFactory(
        storages,
        10,
        10,
        1,
        int(32 * 1e6),
        int(64 * 1e6),
        TestingMetricsBackend(),
    ).create(commit)

    payloads = [
        KafkaPayload(None, b"{}", [("table", b"ignored")]),
        KafkaPayload(
            None,
            json.dumps(TestGroupassignee.INSERT_MSG).encode("utf8"),
            [("table", groupassignees.storage.get_postgres_table().encode("utf8"))],
        ),
        KafkaPayload(
            None,
            json.dumps(TestGroupedMessage.INSERT_MSG).encode("utf8"),
            [("table", groupedmessages.storage.get_postgres_table().encode("utf8"))],
        ),
    ]

    messages = [
        Message(
            Partition(Topic("topic"), 0), offset, payload, datetime.now(), offset + 1
        )
        for offset, payload in enumerate(payloads)
    ]

    with assert_changes(
        lambda: get_row_count(groupassignees.storage), 0, 1
    ), assert_changes(lambda: get_row_count(groupedmessages.storage), 0, 1):
        for message in messages:
            strategy.submit(message)

        with assert_changes(
            lambda: commit.call_args_list,
            [],
            [call({Partition(Topic("topic"), 0): 3})],
        ):
            strategy.close()
            strategy.join()

def eventstream(*, dataset: Dataset):
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumer import ConsumerWorker

        worker = ConsumerWorker(storage, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})

def _wrap(self, msg: str) -> Message[KafkaPayload]:
    return Message(
        Partition(Topic("replacements"), 0),
        0,
        KafkaPayload(None, json.dumps(msg).encode("utf-8"), []),
        datetime.now(),
    )

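# A minimal usage sketch, not from the original suite: ``_wrap`` builds a
# replacement message that can be fed straight through the replacer, matching
# the process_message/flush_batch pattern in the tests below. The payload is
# illustrative, assuming the (version, action, data) tuple format used
# throughout these tests.
def _example_wrap_usage(self):
    message = self._wrap((2, "end_delete_groups", {"project_id": 1, "group_ids": [1]}))
    processed = self.replacer.process_message(message)
    if processed is not None:
        self.replacer.flush_batch([processed])
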
def test_flattened_tags(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    # | and = are intentional to test the escaping logic when computing the
    # flattened_tags on tag deletions
    self.event["data"]["tags"] = []
    self.event["data"]["tags"].append(["browser|name", "foo=1"])
    self.event["data"]["tags"].append(["browser|to_delete", "foo=2"])
    self.event["data"]["tags"].append(["notbrowser", "foo\\3"])
    self.event["data"]["tags"].append(["notbrowser2", "foo4"])
    self.write_events([self.event])

    project_id = self.project_id

    def _fetch_flattened_tags():
        return json.loads(
            self.app.post(
                "/query",
                data=json.dumps(
                    {
                        "project": [project_id],
                        "selected_columns": [
                            "_tags_flattened",
                            "tags.key",
                            "tags.value",
                        ],
                    }
                ),
            ).data
        )["data"]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser|to_delete",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _fetch_flattened_tags() == [
        {
            "tags.key": ["browser|name", "notbrowser", "notbrowser2"],
            "tags.value": ["foo=1", "foo\\3", "foo4"],
            "_tags_flattened": "|browser\\|name=foo\\=1||notbrowser=foo\\\\3||notbrowser2=foo4|",
        }
    ]

def __make_msg(
    self, partition: int, offset: int, payload: str, headers: Headers
) -> Message[KafkaPayload]:
    return Message(
        partition=Partition(Topic("topic"), partition),
        offset=offset,
        payload=KafkaPayload(b"key", payload.encode(), headers),
        timestamp=datetime(2019, 6, 19, 6, 46, 28),
    )

def test_delete_tag_promoted_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    self.write_unprocessed_events([self.event])

    project_id = self.project_id

    def _issue_count(total=False):
        return json.loads(
            self.app.post(
                "/query",
                data=json.dumps(
                    {
                        "project": [project_id],
                        "aggregations": [["count()", "", "count"]],
                        "conditions": [["tags[browser.name]", "=", "foo"]]
                        if not total
                        else [],
                        "groupby": ["group_id"],
                    }
                ),
            ).data
        )["data"]

    assert _issue_count() == [{"count": 1, "group_id": 1}]
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    """Create a fake Kafka message for each JSON event in the file."""
    messages: MutableSequence[Message[KafkaPayload]] = []
    raw_events = open(events_file).readlines()
    for raw_event in raw_events:
        messages.append(
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8"), []),
                datetime.now(),
            ),
        )
    return messages

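# A short usage sketch, assuming a newline-delimited JSON fixture file; the
# path below is hypothetical.
messages = get_messages("tests/fixtures/events.json")
for message in messages:
    print(message.partition, message.offset, len(message.payload.value))
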
def eventstream(*, dataset: Dataset) -> RespTuple:
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data, []),
        datetime.now(),
    )

    type_ = record[1]

    storage = dataset.get_default_entity().get_writable_storage()
    assert storage is not None

    if type_ == "insert":
        from snuba.consumers.consumer import StreamingConsumerStrategyFactory

        table_writer = storage.get_table_writer()
        stream_loader = table_writer.get_stream_loader()
        strategy = StreamingConsumerStrategyFactory(
            stream_loader.get_pre_filter(),
            stream_loader.get_processor(),
            table_writer.get_batch_writer(metrics),
            metrics,
            max_batch_size=1,
            max_batch_time=1.0,
            processes=None,
            input_block_size=None,
            output_block_size=None,
        ).create(lambda offsets: None)
        strategy.submit(message)
        strategy.close()
        strategy.join()
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(storage, metrics=metrics)
        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})

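# A sketch of the wire format this endpoint parses, derived from the checks
# above rather than an authoritative spec: a JSON array of (version, type,
# data), where the version must be 2 and any type other than "insert" is
# handed to the replacer. The event bodies below are hypothetical.
insert_payload = json.dumps(
    (2, "insert", {"project_id": 1, "event_id": "00e24a150d7f4ee4b142b61b4d893b6d"})
)
replacement_payload = json.dumps(
    (2, "end_delete_groups", {"project_id": 1, "group_ids": [1]})
)
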
def test_unmerge_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["primary_hash"] = "a" * 32
    self.write_events([self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_unmerge",
                    {
                        "project_id": project_id,
                        "previous_group_id": 1,
                        "new_group_id": 2,
                        "hashes": ["a" * 32],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]

def __build_batch_writer(
    self, storage: WritableTableStorage
) -> ProcessedMessageBatchWriter:
    replacement_batch_writer: Optional[ReplacementBatchWriter]
    replacement_topic_spec = (
        storage.get_table_writer().get_stream_loader().get_replacement_topic_spec()
    )
    if replacement_topic_spec is not None:
        # XXX: The producer is flushed when closed on strategy teardown
        # after an assignment is revoked, but never explicitly closed.
        # XXX: This assumes that the Kafka cluster used for the input topic
        # to the storage is the same as the replacement topic.
        replacement_batch_writer = ReplacementBatchWriter(
            ConfluentKafkaProducer(
                build_kafka_producer_configuration(
                    storage.get_storage_key(),
                    override_params={
                        "partitioner": "consistent",
                        "message.max.bytes": 50000000,  # 50MB, default is 1MB
                    },
                )
            ),
            Topic(replacement_topic_spec.topic_name),
        )
    else:
        replacement_batch_writer = None

    return ProcessedMessageBatchWriter(
        InsertBatchWriter(
            storage.get_table_writer().get_batch_writer(
                self.__metrics,
                {"load_balancing": "in_order", "insert_distributed_sync": 1},
            ),
            MetricsWrapper(
                self.__metrics,
                "insertions",
                {"storage": storage.get_storage_key().value},
            ),
        ),
        replacement_batch_writer,
    )

def test_delete_groups_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []

def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ReplacementBatch("1", [{"project_id": 1}]),
            ReplacementBatch("2", [{"project_id": 2}]),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id":1}'),
        ("event-replacements", b"2", b'{"project_id":2}'),
    ]

def test_reprocessing_flow_insert(self) -> None:
    # We have a group that contains two events, 1 and 2.
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["event_id"] = event_id = "00e24a150d7f4ee4b142b61b4d893b6d"
    write_unprocessed_events(self.storage, [self.event])
    self.event["event_id"] = event_id2 = "00e24a150d7f4ee4b142b61b4d893b6e"
    write_unprocessed_events(self.storage, [self.event])

    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "tombstone_events",
                    {"project_id": project_id, "event_ids": [event_id]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # The user chooses to reprocess a subset of the group and throw away
    # the other events. Event 1 gets manually tombstoned by Sentry while
    # Event 2 prevails.
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # At this point the count doesn't make any sense, but we don't care.
    assert self._issue_count(self.project_id) == [{"count": 2, "group_id": 1}]

    # The reprocessed event is inserted with a guaranteed-new group ID but
    # the *same* event ID (this is why we need to skip tombstoning this
    # event ID).
    self.event["group_id"] = 2
    write_unprocessed_events(self.storage, [self.event])

    # Reassign without re-annotating: a second annotated assignment to the
    # same name is rejected by mypy.
    message = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "exclude_groups",
                    {"project_id": project_id, "group_ids": [1]},
                )
            ).encode("utf-8"),
            [],
        ),
        datetime.now(),
    )

    # Group 1 is excluded from queries. At this point we have almost a
    # regular group deletion, except only a subset of events have been
    # tombstoned (the ones that will *not* be reprocessed).
    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    # Group 2 should contain the one event that the user chose to
    # reprocess, and Group 1 should be gone. (Note: In the product, Group 2
    # looks identical to Group 1, including short ID.)
    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 2}]
    assert self._get_group_id(project_id, event_id2) == 2
    assert not self._get_group_id(project_id, event_id)

def __init__(
    self,
    storage_key: StorageKey,
    raw_topic: Optional[str],
    replacements_topic: Optional[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    bootstrap_servers: Sequence[str],
    group_id: str,
    commit_log_topic: Optional[str],
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    commit_retry_policy: Optional[RetryPolicy] = None,
    profile_path: Optional[str] = None,
) -> None:
    self.storage = get_writable_storage(storage_key)
    self.bootstrap_servers = bootstrap_servers

    self.broker_config = get_default_kafka_configuration(
        storage_key, bootstrap_servers=bootstrap_servers
    )

    self.producer_broker_config = build_kafka_producer_configuration(
        storage_key,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )

    stream_loader = self.storage.get_table_writer().get_stream_loader()

    self.raw_topic: Topic
    if raw_topic is not None:
        self.raw_topic = Topic(raw_topic)
    else:
        self.raw_topic = Topic(stream_loader.get_default_topic_spec().topic_name)

    self.replacements_topic: Optional[Topic]
    if replacements_topic is not None:
        self.replacements_topic = Topic(replacements_topic)
    else:
        replacement_topic_spec = stream_loader.get_replacement_topic_spec()
        if replacement_topic_spec is not None:
            self.replacements_topic = Topic(replacement_topic_spec.topic_name)
        else:
            self.replacements_topic = None

    self.commit_log_topic: Optional[Topic]
    if commit_log_topic is not None:
        self.commit_log_topic = Topic(commit_log_topic)
    else:
        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        if commit_log_topic_spec is not None:
            self.commit_log_topic = Topic(commit_log_topic_spec.topic_name)
        else:
            self.commit_log_topic = None

    # XXX: This can result in a producer being built in cases where it's
    # not actually required.
    self.producer = Producer(self.producer_broker_config)

    self.metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={"group": group_id, "storage": storage_key.value},
    )

    self.max_batch_size = max_batch_size
    self.max_batch_time_ms = max_batch_time_ms
    self.group_id = group_id
    self.auto_offset_reset = auto_offset_reset
    self.queued_max_messages_kbytes = queued_max_messages_kbytes
    self.queued_min_messages = queued_min_messages
    self.processes = processes
    self.input_block_size = input_block_size
    self.output_block_size = output_block_size
    self.__profile_path = profile_path

    if commit_retry_policy is None:
        commit_retry_policy = BasicRetryPolicy(
            3,
            constant_delay(1),
            lambda e: isinstance(e, KafkaException)
            and e.args[0].code()
            in (
                KafkaError.REQUEST_TIMED_OUT,
                KafkaError.NOT_COORDINATOR,
                KafkaError._WAIT_COORD,
            ),
        )

    self.__commit_retry_policy = commit_retry_policy

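# A hedged instantiation sketch: the enclosing class name is not shown in this
# excerpt and is assumed here to be ``ConsumerBuilder``; every argument value
# below is an illustrative placeholder, not a recommended configuration.
builder = ConsumerBuilder(
    storage_key=StorageKey.EVENTS,
    raw_topic=None,  # fall back to the stream loader's default topic spec
    replacements_topic=None,
    max_batch_size=1000,
    max_batch_time_ms=1000,
    bootstrap_servers=["localhost:9092"],
    group_id="snuba-consumers",
    commit_log_topic=None,
    auto_offset_reset="earliest",
    queued_max_messages_kbytes=10000,
    queued_min_messages=1000,
    processes=None,
    input_block_size=None,
    output_block_size=None,
)
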
def test_subscription_worker(broker: Broker[SubscriptionTaskResult]) -> None:
    result_topic = Topic("subscription-results")

    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=60),
            resolution=frequency,
        ),
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body; ideally this would reference the timeseries options in
        # isolation.
        assert (
            request.body.items()
            > {
                "from_date": (timestamp - subscription.data.time_window).isoformat(),
                "to_date": timestamp.isoformat(),
            }.items()
        )

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }

def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        storage = dataset.get_default_entity().get_writable_storage()
        assert storage is not None
        storage_key = storage.get_storage_key().value
        bootstrap_servers = settings.DEFAULT_STORAGE_BROKERS.get(
            storage_key, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            {
                "bootstrap.servers": ",".join(bootstrap_servers),
                "partitioner": "consistent",
                "message.max.bytes": 50000000,  # 50MB, default is 1MB
            }
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor, executor._max_workers)
    metrics.gauge("executor.workers", executor._max_workers)

    with closing(consumer), executor, closing(producer):
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
                metrics,
            ),
            metrics=metrics,
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()

def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(environment.metrics, "replacer", tags=metrics_tags)

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
            metrics=metrics,
        ),
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()

def test_streaming_consumer_strategy() -> None:
    messages = (
        Message(
            Partition(Topic("events"), 0),
            i,
            KafkaPayload(None, b"{}", None),
            datetime.now(),
        )
        for i in itertools.count()
    )

    replacements_producer = FakeConfluentKafkaProducer()

    processor = Mock()
    processor.process_message.side_effect = [
        None,
        InsertBatch([{}]),
        ReplacementBatch("key", [{}]),
    ]

    writer = Mock()

    metrics = TestingMetricsBackend()

    factory = StreamingConsumerStrategyFactory(
        None,
        processor,
        writer,
        metrics,
        max_batch_size=10,
        max_batch_time=60,
        processes=None,
        input_block_size=None,
        output_block_size=None,
        replacements_producer=replacements_producer,
        replacements_topic=Topic("replacements"),
    )

    commit_function = Mock()
    strategy = factory.create(commit_function)

    for i in range(3):
        strategy.poll()
        strategy.submit(next(messages))

    assert metrics.calls == []

    processor.process_message.side_effect = [{}]

    with pytest.raises(TypeError):
        strategy.poll()
        strategy.submit(next(messages))

    def get_number_of_insertion_metrics() -> int:
        count = 0
        for call in metrics.calls:
            if isinstance(call, Timing) and call.name == "insertions.latency_ms":
                count += 1
        return count

    expected_write_count = 1

    with assert_changes(
        get_number_of_insertion_metrics, 0, expected_write_count
    ), assert_changes(
        lambda: writer.write.call_count, 0, expected_write_count
    ), assert_changes(
        lambda: len(replacements_producer.messages), 0, 1
    ):
        strategy.close()
        strategy.join()

def test_tick_consumer_non_monotonic(clock: Clock, broker: Broker[int]) -> None:
    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")
    partition = Partition(topic, 0)

    broker.create_topic(topic, partitions=1)

    producer = broker.get_producer()

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer)

    def assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assignment_callback.called = True
        assert inner_consumer.tell() == {partition: 0}
        assert consumer.tell() == {partition: 0}

    assignment_callback.called = False

    consumer.subscribe([topic], on_assign=assignment_callback)

    producer.produce(partition, 0)

    clock.sleep(1)

    producer.produce(partition, 1)

    with assert_changes(lambda: assignment_callback.called, False, True):
        assert consumer.poll() is None

    assert inner_consumer.tell() == {partition: 1}
    assert consumer.tell() == {partition: 0}

    with assert_changes(
        inner_consumer.tell, {partition: 1}, {partition: 2}
    ), assert_changes(consumer.tell, {partition: 0}, {partition: 1}):
        assert consumer.poll() == Message(
            partition,
            0,
            Tick(
                offsets=Interval(0, 1),
                timestamps=Interval(epoch, epoch + timedelta(seconds=1)),
            ),
            epoch + timedelta(seconds=1),
        )

    clock.sleep(-1)

    producer.produce(partition, 2)

    with assert_changes(
        inner_consumer.tell, {partition: 2}, {partition: 3}
    ), assert_does_not_change(consumer.tell, {partition: 1}):
        assert consumer.poll() is None

    clock.sleep(2)

    producer.produce(partition, 3)

    with assert_changes(
        inner_consumer.tell, {partition: 3}, {partition: 4}
    ), assert_changes(consumer.tell, {partition: 1}, {partition: 3}):
        assert consumer.poll() == Message(
            partition,
            1,
            Tick(
                offsets=Interval(1, 3),
                timestamps=Interval(
                    epoch + timedelta(seconds=1), epoch + timedelta(seconds=2)
                ),
            ),
            epoch + timedelta(seconds=2),
        )

def test_tick_consumer(
    clock: Clock, broker: Broker[int], time_shift: Optional[timedelta]
) -> None:
    epoch = datetime.fromtimestamp(clock.time())

    topic = Topic("messages")

    broker.create_topic(topic, partitions=2)

    producer = broker.get_producer()
    for partition, payloads in enumerate([[0, 1, 2], [0]]):
        for payload in payloads:
            producer.produce(Partition(topic, partition), payload).result()

    inner_consumer = broker.get_consumer("group")

    consumer = TickConsumer(inner_consumer, time_shift=time_shift)

    if time_shift is None:
        time_shift = timedelta()

    def assignment_callback(offsets: Mapping[Partition, int]) -> None:
        assignment_callback.called = True
        assert consumer.tell() == {
            Partition(topic, 0): 0,
            Partition(topic, 1): 0,
        }
        assert inner_consumer.tell() == {
            Partition(topic, 0): 0,
            Partition(topic, 1): 0,
        }

    assignment_callback.called = False

    consumer.subscribe([topic], on_assign=assignment_callback)

    with assert_changes(lambda: assignment_callback.called, False, True):
        # consume 0, 0
        assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 0,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }

    # consume 0, 1
    assert consumer.poll() == Message(
        Partition(topic, 0),
        0,
        Tick(offsets=Interval(0, 1), timestamps=Interval(epoch, epoch)).time_shift(
            time_shift
        ),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        1,
        Tick(offsets=Interval(1, 2), timestamps=Interval(epoch, epoch)).time_shift(
            time_shift
        ),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 0,
    }

    # consume 1, 0
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    # consume no message
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    consumer.seek({Partition(topic, 0): 1})

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 1,
    }

    # consume 0, 1
    assert consumer.poll() is None

    assert consumer.tell() == {
        Partition(topic, 0): 1,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 1,
    }

    # consume 0, 2
    assert consumer.poll() == Message(
        Partition(topic, 0),
        1,
        Tick(offsets=Interval(1, 2), timestamps=Interval(epoch, epoch)).time_shift(
            time_shift
        ),
        epoch,
    )

    assert consumer.tell() == {
        Partition(topic, 0): 2,
        Partition(topic, 1): 0,
    }
    assert inner_consumer.tell() == {
        Partition(topic, 0): 3,
        Partition(topic, 1): 1,
    }

    with pytest.raises(ConsumerError):
        consumer.seek({Partition(topic, -1): 0})