def test_filter() -> None:
    next_step = Mock()

    def test_function(message: Message[bool]) -> bool:
        return message.payload

    filter_step = FilterStep(test_function, next_step)

    fail_message = Message(Partition(Topic("topic"), 0), 0, False, datetime.now())

    with assert_does_not_change(lambda: next_step.submit.call_count, 0):
        filter_step.submit(fail_message)

    pass_message = Message(Partition(Topic("topic"), 0), 0, True, datetime.now())

    with assert_changes(lambda: next_step.submit.call_count, 0, 1):
        filter_step.submit(pass_message)

    assert next_step.submit.call_args == call(pass_message)

    with assert_changes(lambda: next_step.poll.call_count, 0, 1):
        filter_step.poll()

    with assert_changes(lambda: next_step.close.call_count, 0, 1), assert_changes(
        lambda: next_step.join.call_count, 0, 1
    ):
        filter_step.join()
def revocation_callback(partitions: Sequence[Partition]) -> None:
    revocation_callback.called = True
    assert partitions == [Partition(topic, 0)]
    assert consumer.tell() == {Partition(topic, 0): messages[1].offset}

    # Not sure why you'd want to do this, but it shouldn't error.
    consumer.seek({Partition(topic, 0): messages[0].offset})
def test_stream_processor_termination_on_error() -> None:
    topic = Topic("test")

    consumer = mock.Mock()
    consumer.poll.return_value = Message(Partition(topic, 0), 0, 0, datetime.now())

    exception = NotImplementedError("error")

    strategy = mock.Mock()
    strategy.submit.side_effect = exception

    factory = mock.Mock()
    factory.create.return_value = strategy

    processor: StreamProcessor[int] = StreamProcessor(
        consumer, topic, factory, TestingMetricsBackend()
    )

    assignment_callback = consumer.subscribe.call_args.kwargs["on_assign"]
    assignment_callback({Partition(topic, 0): 0})

    with pytest.raises(Exception) as e, assert_changes(
        lambda: strategy.terminate.call_count, 0, 1
    ), assert_changes(lambda: consumer.close.call_count, 0, 1):
        processor.run()

    assert e.value == exception
def assignment_callback(partitions: Mapping[Partition, int]) -> None:
    assignment_callback.called = True
    assert partitions == {Partition(topic, 0): messages[0].offset}

    consumer.seek({Partition(topic, 0): messages[1].offset})

    with pytest.raises(ConsumerError):
        consumer.seek({Partition(topic, 1): 0})
def test_synchronized_consumer_handles_end_of_partition(
    broker: Broker[KafkaPayload],
) -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker.create_topic(topic, partitions=1)
    broker.create_topic(commit_log_topic, partitions=1)

    consumer = broker.get_consumer("consumer", enable_end_of_partition=True)
    producer = broker.get_producer()
    commit_log_consumer = broker.get_consumer("commit-log-consumer")

    messages = [
        producer.produce(topic, KafkaPayload(None, f"{i}".encode("utf8"), [])).result(
            1.0
        )
        for i in range(2)
    ]

    synchronized_consumer: Consumer[KafkaPayload] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0), messages[0].next_offset),
                ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[0]

        # If the commit log consumer does not handle EOF, it will have crashed
        # here and will never return the next message.
        wait_for_consumer(
            commit_log_consumer,
            producer.produce(
                commit_log_topic,
                commit_codec.encode(
                    Commit("leader", Partition(topic, 0), messages[1].next_offset),
                ),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[1]
def test_pause_resume_rebalancing(self) -> None:
    payloads = self.get_payloads()

    with self.get_topic(2) as topic, closing(
        self.get_producer()
    ) as producer, closing(
        self.get_consumer("group", enable_end_of_partition=False)
    ) as consumer_a, closing(
        self.get_consumer("group", enable_end_of_partition=False)
    ) as consumer_b:
        messages = [
            producer.produce(Partition(topic, i), next(payloads)).result(timeout=5.0)
            for i in range(2)
        ]

        consumer_a.subscribe([topic])

        # It doesn't really matter which message is fetched first -- we
        # just want to know the assignment occurred.
        assert consumer_a.poll(10.0) in messages  # XXX: getting the subscription is slow

        assert len(consumer_a.tell()) == 2
        assert len(consumer_b.tell()) == 0

        # Pause all partitions.
        consumer_a.pause([Partition(topic, 0), Partition(topic, 1)])
        assert set(consumer_a.paused()) == {Partition(topic, 0), Partition(topic, 1)}

        consumer_b.subscribe([topic])
        for i in range(10):
            assert consumer_a.poll(0) is None  # attempt to force session timeout
            if consumer_b.poll(1.0) is not None:
                break
        else:
            assert False, "rebalance did not occur"

        # The first consumer should have had its offsets rolled back, as
        # well as had its partitions resumed during rebalancing.
        assert consumer_a.paused() == []
        assert consumer_a.poll(10.0) is not None

        assert len(consumer_a.tell()) == 1
        assert len(consumer_b.tell()) == 1
def test_synchronized_consumer_handles_end_of_partition() -> None:
    topic = Topic("topic")
    commit_log_topic = Topic("commit-log")

    broker: DummyBroker[int] = DummyBroker()
    broker.create_topic(topic, partitions=1)
    consumer: Consumer[int] = DummyConsumer(broker, "consumer")
    producer: Producer[int] = DummyProducer(broker)
    messages = [producer.produce(topic, i).result(1.0) for i in range(2)]

    commit_log_broker: DummyBroker[Commit] = DummyBroker()
    commit_log_broker.create_topic(commit_log_topic, partitions=1)
    commit_log_consumer: Consumer[Commit] = DummyConsumer(
        commit_log_broker, "commit-log-consumer", enable_end_of_partition=True
    )
    commit_log_producer: Producer[Commit] = DummyProducer(commit_log_broker)

    synchronized_consumer: Consumer[int] = SynchronizedConsumer(
        consumer,
        commit_log_consumer,
        commit_log_topic=commit_log_topic,
        commit_log_groups={"leader"},
    )

    with closing(synchronized_consumer):
        synchronized_consumer.subscribe([topic])

        wait_for_consumer(
            commit_log_consumer,
            commit_log_producer.produce(
                commit_log_topic,
                Commit("leader", Partition(topic, 0), messages[0].get_next_offset()),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[0]

        # If the commit log consumer does not handle EOF, it will have crashed
        # here and will never return the next message.
        wait_for_consumer(
            commit_log_consumer,
            commit_log_producer.produce(
                commit_log_topic,
                Commit("leader", Partition(topic, 0), messages[1].get_next_offset()),
            ).result(),
        )

        assert synchronized_consumer.poll(0) == messages[1]
def test_consumer_offset_out_of_range(self) -> None:
    payloads = self.get_payloads()

    with self.get_topic() as topic:
        with closing(self.get_producer()) as producer:
            messages = [producer.produce(topic, next(payloads)).result(5.0)]

        consumer = self.get_consumer()
        consumer.subscribe([topic])

        for i in range(5):
            message = consumer.poll(1.0)
            if message is not None:
                break
            else:
                time.sleep(1.0)
        else:
            raise Exception("assignment never received")

        with pytest.raises(EndOfPartition):
            consumer.poll()

        # Somewhat counterintuitively, seeking to an invalid position
        # should be allowed -- we don't know it's invalid until we try and
        # read from it.
        consumer.seek({Partition(topic, 0): messages[-1].next_offset + 1000})

        with pytest.raises(OffsetOutOfRange):
            consumer.poll()
def eventstream(*, dataset: Dataset):
    ensure_table_exists(dataset)
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data),
        datetime.now(),
    )

    type_ = record[1]
    metrics = DummyMetricsBackend()
    if type_ == "insert":
        from snuba.consumer import ConsumerWorker

        worker = ConsumerWorker(dataset, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ("ok", 200, {"Content-Type": "text/plain"})
def test_delete_groups_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.write_raw_events(self.event)

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    project_id = self.project_id

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_groups",
                    {
                        "project_id": project_id,
                        "group_ids": [1],
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert self._issue_count(self.project_id) == []
def produce(
    self, destination: Union[Topic, Partition], payload: TPayload
) -> Future[Message[TPayload]]:
    with self.__lock:
        assert not self.__closed

        partition: Partition
        if isinstance(destination, Topic):
            # When producing to a topic, pick one of its partitions at random.
            partition = Partition(
                destination,
                random.randint(
                    0, self.__broker.get_topic_partition_count(destination) - 1
                ),
            )
        elif isinstance(destination, Partition):
            partition = destination
        else:
            raise TypeError("invalid destination type")

        future: Future[Message[TPayload]] = Future()
        future.set_running_or_notify_cancel()
        try:
            message = self.__broker.produce(partition, payload)
            future.set_result(message)
        except Exception as e:
            future.set_exception(e)
        return future
def subscribe(
    self, consumer: LocalConsumer[TPayload], topics: Sequence[Topic]
) -> Mapping[Partition, int]:
    with self.__lock:
        if self.__subscriptions[consumer.group]:
            # XXX: Consumer group balancing is not currently implemented.
            if consumer not in self.__subscriptions[consumer.group]:
                raise NotImplementedError

            # XXX: Updating an existing subscription is currently not implemented.
            if self.__subscriptions[consumer.group][consumer] != topics:
                raise NotImplementedError

        self.__subscriptions[consumer.group][consumer] = topics

        assignment: MutableMapping[Partition, int] = {}

        for topic in set(topics):
            partition_count = self.__message_storage.get_partition_count(topic)

            for index in range(partition_count):
                partition = Partition(topic, index)
                # TODO: Handle offset reset more realistically.
                assignment[partition] = self.__offsets[consumer.group].get(
                    partition, 0
                )

        return assignment
def test_transform() -> None:
    next_step = Mock()

    def transform_function(message: Message[int]) -> int:
        return message.payload * 2

    transform_step = TransformStep(transform_function, next_step)

    original_message = Message(Partition(Topic("topic"), 0), 0, 1, datetime.now())

    with assert_changes(lambda: next_step.submit.call_count, 0, 1):
        transform_step.submit(original_message)

    assert next_step.submit.call_args == call(
        Message(
            original_message.partition,
            original_message.offset,
            transform_function(original_message),
            original_message.timestamp,
        )
    )

    with assert_changes(lambda: next_step.poll.call_count, 0, 1):
        transform_step.poll()

    with assert_changes(lambda: next_step.close.call_count, 0, 1), assert_changes(
        lambda: next_step.join.call_count, 0, 1
    ):
        transform_step.join()
def _wrap(self, msg: str) -> Message[KafkaPayload]:
    return Message(
        Partition(Topic("replacements"), 0),
        0,
        KafkaPayload(None, json.dumps(msg).encode("utf-8")),
        datetime.now(),
    )
def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((0, "insert", event)).encode("utf-8")
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
def __delivery_callback(
    self,
    future: Future[Message[KafkaPayload]],
    payload: KafkaPayload,
    error: KafkaError,
    message: ConfluentMessage,
) -> None:
    if error is not None:
        future.set_exception(TransportError(error))
    else:
        try:
            timestamp_type, timestamp_value = message.timestamp()
            if timestamp_type is TIMESTAMP_NOT_AVAILABLE:
                raise ValueError("timestamp not available")

            future.set_result(
                Message(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                    payload,
                    datetime.utcfromtimestamp(timestamp_value / 1000.0),
                )
            )
        except Exception as error:
            future.set_exception(error)
def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None
def test_send_message(
    self,
    value: str,
    expected: Optional[ProcessedMessage],
) -> None:
    storage = get_storage("groupedmessages")

    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        storage=storage,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        1,
        KafkaPayload(
            None,
            value.encode("utf-8"),
            [("table", "sentry_groupedmessage".encode())],
        ),
        datetime.now(),
    )

    ret = worker.process_message(message)
    assert ret == expected
def test_commit_log_consumer(self) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) instead of a test against a mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer: KafkaConsumer[int] = KafkaConsumerWithCommitLog(
        {
            **self.configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        codec=self.codec,
        producer=commit_log_producer,
        commit_log_topic=Topic("commit-log"),
    )

    with self.get_topic() as topic, closing(consumer) as consumer:
        consumer.subscribe([topic])

        with closing(self.get_producer()) as producer:
            producer.produce(topic, 0).result(5.0)

        message = consumer.poll(10.0)  # XXX: getting the subscription is slow
        assert isinstance(message, Message)

        consumer.stage_offsets({message.partition: message.get_next_offset()})

        assert consumer.commit_offsets() == {
            Partition(topic, 0): message.get_next_offset()
        }

        assert len(commit_log_producer.messages) == 1
        commit_message = commit_log_producer.messages[0]
        assert commit_message.topic() == "commit-log"

        assert CommitCodec().decode(
            KafkaPayload(commit_message.key(), commit_message.value())
        ) == Commit("test", Partition(topic, 0), message.get_next_offset())
def __make_msg(
    self, partition: int, offset: int, payload: str, headers: Headers
) -> Message[KafkaPayload]:
    return Message(
        partition=Partition(Topic("topic"), partition),
        offset=offset,
        payload=KafkaPayload(b"key", payload.encode(), headers),
        timestamp=datetime(2019, 6, 19, 6, 46, 28),
    )
def unsubscribe(self, consumer: LocalConsumer[TPayload]) -> Sequence[Partition]:
    with self.__lock:
        partitions: MutableSequence[Partition] = []

        for topic in self.__subscriptions[consumer.group].pop(consumer):
            partitions.extend(
                Partition(topic, i)
                for i in range(self.__message_storage.get_partition_count(topic))
            )

        return partitions
def test_unaligned_offset(self) -> None:
    topic = Topic(uuid.uuid1().hex)
    partition = Partition(topic, 0)
    self.storage.create_topic(topic, 1)

    message = self.storage.produce(partition, 1, datetime.now())

    invalid_offset = message.offset + 4
    assert message.next_offset > invalid_offset > message.offset

    with pytest.raises(InvalidChecksum):
        self.storage.consume(partition, invalid_offset)
def test_parallel_transform_worker_apply() -> None:
    messages = [
        Message(
            Partition(Topic("test"), 0),
            i,
            KafkaPayload(None, b"\x00" * size, None),
            datetime.now(),
        )
        for i, size in enumerate([1000, 1000, 2000, 4000])
    ]

    with SharedMemoryManager() as smm:
        input_block = smm.SharedMemory(8192)
        assert input_block.size == 8192

        input_batch = MessageBatch(input_block)
        for message in messages:
            input_batch.append(message)

        assert len(input_batch) == 4

        output_block = smm.SharedMemory(4096)
        assert output_block.size == 4096

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
        )

        # The first batch should be able to fit 2 messages.
        assert index == 2
        assert len(output_batch) == 2

        index, output_batch = parallel_transform_worker_apply(
            transform_payload_expand,
            input_batch,
            output_block,
            index,
        )

        # The second batch should be able to fit one message.
        assert index == 3
        assert len(output_batch) == 1

        # The last message is too large to fit in the batch.
        with pytest.raises(ValueTooLarge):
            parallel_transform_worker_apply(
                transform_payload_expand,
                input_batch,
                output_block,
                index,
            )
def decode(self, value: KafkaPayload) -> Commit:
    key = value.key
    if not isinstance(key, bytes):
        raise TypeError("payload key must be a bytes object")

    val = value.value
    if not isinstance(val, bytes):
        raise TypeError("payload value must be a bytes object")

    topic_name, partition_index, group = key.decode("utf-8").split(":", 3)
    offset = int(val.decode("utf-8"))
    return Commit(
        group, Partition(Topic(topic_name), int(partition_index)), offset
    )
def test_delete_tag_promoted_insert(self):
    self.event["project_id"] = self.project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    self.write_raw_events(self.event)

    project_id = self.project_id

    def _issue_count(total=False):
        return json.loads(
            self.app.post(
                "/query",
                data=json.dumps(
                    {
                        "project": [project_id],
                        "aggregations": [["count()", "", "count"]],
                        "conditions": [["tags[browser.name]", "=", "foo"]]
                        if not total
                        else [],
                        "groupby": ["group_id"],
                    }
                ),
            ).data
        )["data"]

    assert _issue_count() == [{"count": 1, "group_id": 1}]
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]

    timestamp = datetime.now(tz=pytz.utc)

    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(
            None,
            json.dumps(
                (
                    2,
                    "end_delete_tag",
                    {
                        "project_id": project_id,
                        "tag": "browser.name",
                        "datetime": timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
                    },
                )
            ).encode("utf-8"),
        ),
        datetime.now(),
    )

    processed = self.replacer.process_message(message)
    self.replacer.flush_batch([processed])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    "Create a fake Kafka message for each JSON event in the file."
    messages: MutableSequence[Message[KafkaPayload]] = []
    raw_events = open(events_file).readlines()
    for raw_event in raw_events:
        messages.append(
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8")),
                datetime.now(),
            ),
        )
    return messages
def test_auto_offset_reset_latest(self) -> None:
    with self.get_topic() as topic:
        with closing(self.get_producer()) as producer:
            producer.produce(topic, next(self.get_payloads())).result(5.0)

        with closing(self.get_consumer(auto_offset_reset="latest")) as consumer:
            consumer.subscribe([topic])

            try:
                consumer.poll(10.0)  # XXX: getting the subscription is slow
            except EndOfPartition as error:
                assert error.partition == Partition(topic, 0)
                assert error.offset == 1
            else:
                raise AssertionError("expected EndOfPartition error")
def assignment_callback(
    consumer: ConfluentConsumer, partitions: Sequence[ConfluentTopicPartition]
) -> None:
    self.__state = KafkaConsumerState.ASSIGNING

    try:
        assignment: MutableSequence[ConfluentTopicPartition] = []

        for partition in self.__consumer.committed(partitions):
            if partition.offset >= 0:
                assignment.append(partition)
            elif partition.offset == OFFSET_INVALID:
                assignment.append(
                    self.__resolve_partition_starting_offset(partition)
                )
            else:
                raise ValueError("received unexpected offset")

        offsets: MutableMapping[Partition, int] = {
            Partition(Topic(i.topic), i.partition): i.offset for i in assignment
        }
        self.__seek(offsets)

        # Ensure that all partitions are resumed on assignment to avoid
        # carrying over state from a previous assignment.
        self.__consumer.resume(
            [
                ConfluentTopicPartition(partition.topic.name, partition.index, offset)
                for partition, offset in offsets.items()
            ]
        )

        for partition in offsets:
            self.__paused.discard(partition)
    except Exception:
        self.__state = KafkaConsumerState.ERROR
        raise

    try:
        if on_assign is not None:
            on_assign(offsets)
    finally:
        self.__state = KafkaConsumerState.CONSUMING
def test_collect() -> None:
    step_factory = Mock()
    step_factory.return_value = inner_step = Mock()

    commit_function = Mock()
    partition = Partition(Topic("topic"), 0)
    messages = message_generator(partition, 0)

    collect_step = CollectStep(step_factory, commit_function, 2, 60)

    # A batch should be started the first time the step receives a message.
    with assert_changes(lambda: step_factory.call_count, 0, 1):
        collect_step.poll()
        collect_step.submit(next(messages))  # offset 0

    # Subsequent messages should reuse the existing batch, ...
    with assert_does_not_change(lambda: step_factory.call_count, 1):
        collect_step.poll()
        collect_step.submit(next(messages))  # offset 1

    # ...until we hit the batch size limit.
    with assert_changes(lambda: inner_step.close.call_count, 0, 1), assert_changes(
        lambda: inner_step.join.call_count, 0, 1
    ), assert_changes(lambda: commit_function.call_count, 0, 1):
        collect_step.poll()

    assert commit_function.call_args == call({partition: 2})

    step_factory.return_value = inner_step = Mock()

    # The next message should create a new batch.
    with assert_changes(lambda: step_factory.call_count, 1, 2):
        collect_step.submit(next(messages))

    with assert_changes(lambda: inner_step.close.call_count, 0, 1):
        collect_step.close()

    with assert_changes(lambda: inner_step.join.call_count, 0, 1), assert_changes(
        lambda: commit_function.call_count, 1, 2
    ):
        collect_step.join()
def test_message_batch() -> None:
    partition = Partition(Topic("test"), 0)

    with SharedMemoryManager() as smm:
        block = smm.SharedMemory(4096)
        assert block.size == 4096

        message = Message(
            partition, 0, KafkaPayload(None, b"\x00" * 4000, None), datetime.now()
        )

        batch: MessageBatch[KafkaPayload] = MessageBatch(block)

        with assert_changes(lambda: len(batch), 0, 1):
            batch.append(message)

        assert batch[0] == message
        assert list(batch) == [message]

        with assert_does_not_change(lambda: len(batch), 1), pytest.raises(
            ValueTooLarge
        ):
            batch.append(message)