def test_transform() -> None:
    """Verify that ``TransformStep`` forwards transformed messages downstream.

    The step is wired to a mock next step. Submitting a message must result
    in exactly one downstream ``submit`` carrying the transformed payload with
    the original partition, offset and timestamp. ``poll`` must be forwarded,
    and ``join`` must both close and join the downstream step.
    """
    downstream = Mock()

    def double_payload(message: Message[int]) -> int:
        return message.payload * 2

    step = TransformStep(double_payload, downstream)
    source = Message(Partition(Topic("topic"), 0), 0, 1, datetime.now())

    with assert_changes(lambda: downstream.submit.call_count, 0, 1):
        step.submit(source)

    expected = Message(
        source.partition,
        source.offset,
        double_payload(source),
        source.timestamp,
    )
    assert downstream.submit.call_args == call(expected)

    with assert_changes(lambda: downstream.poll.call_count, 0, 1):
        step.poll()

    # Joining the step is expected to close and then join the next step.
    with assert_changes(lambda: downstream.close.call_count, 0, 1):
        with assert_changes(lambda: downstream.join.call_count, 0, 1):
            step.join()
def test_filter() -> None:
    """Verify that ``FilterStep`` only forwards messages accepted by its predicate.

    A message whose payload is ``False`` must not reach the mock next step,
    while one whose payload is ``True`` must be submitted unchanged. ``poll``
    must be forwarded, and ``join`` must both close and join the next step.
    """
    downstream = Mock()

    def payload_is_truthy(message: Message[bool]) -> bool:
        return message.payload

    step = FilterStep(payload_is_truthy, downstream)

    rejected = Message(Partition(Topic("topic"), 0), 0, False, datetime.now())
    with assert_does_not_change(lambda: downstream.submit.call_count, 0):
        step.submit(rejected)

    accepted = Message(Partition(Topic("topic"), 0), 0, True, datetime.now())
    with assert_changes(lambda: downstream.submit.call_count, 0, 1):
        step.submit(accepted)

    # The accepted message is passed through as-is.
    assert downstream.submit.call_args == call(accepted)

    with assert_changes(lambda: downstream.poll.call_count, 0, 1):
        step.poll()

    with assert_changes(lambda: downstream.close.call_count, 0, 1):
        with assert_changes(lambda: downstream.join.call_count, 0, 1):
            step.join()
def poll(self, timeout: Optional[float] = None) -> Optional[Message[Tick]]:
    """Poll the wrapped consumer and turn consecutive messages into a tick.

    Returns ``None`` when the wrapped consumer returns no message, or when
    this is the first message seen for its partition. Otherwise returns a
    ``Message`` whose payload is a ``Tick`` spanning the offsets and
    timestamps of the previously seen message and the current one. The
    current message's details are always recorded for the next call.
    """
    current = self.__consumer.poll(timeout)
    if current is None:
        return None

    last_seen = self.__previous_messages.get(current.partition)

    tick: Optional[Message[Tick]] = None
    if last_seen is not None:
        offsets = Interval(last_seen.offset, current.offset)
        timestamps = Interval(last_seen.timestamp, current.timestamp)
        # The tick is positioned at the *previous* message's offset.
        tick = Message(
            current.partition,
            last_seen.offset,
            Tick(offsets, timestamps),
            current.timestamp,
        )

    self.__previous_messages[current.partition] = MessageDetails(
        current.offset, current.timestamp
    )
    return tick
def eventstream(*, dataset: Dataset):
    """Process one eventstream record from the current HTTP request body.

    The body is parsed as JSON; its first element is the protocol version
    (only ``2`` is accepted) and its second is the operation type. ``"insert"``
    records go through a ``ConsumerWorker`` and everything else through a
    ``ReplacerWorker``. A processed result, if any, is flushed immediately as
    a single-element batch.

    Raises:
        RuntimeError: if the protocol version is not ``2``.
    """
    ensure_table_exists(dataset)
    envelope = json.loads(http_request.data)

    if envelope[0] != 2:
        raise RuntimeError("Unsupported protocol version: %s" % envelope)

    # Wrap the raw request body in a synthetic message for the workers.
    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        0,
        KafkaPayload(None, http_request.data),
        datetime.now(),
    )

    operation = envelope[1]
    metrics = DummyMetricsBackend()
    if operation == "insert":
        from snuba.consumer import ConsumerWorker

        worker = ConsumerWorker(dataset, metrics=metrics)
    else:
        from snuba.replacer import ReplacerWorker

        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    outcome = worker.process_message(message)
    if outcome is not None:
        worker.flush_batch([outcome])

    return ("ok", 200, {"Content-Type": "text/plain"})
def _wrap(self, msg: str) -> Message[KafkaPayload]:
    """Wrap ``msg`` in a message on partition 0 of the ``replacements`` topic.

    ``msg`` is serialized with ``json.dumps`` and UTF-8 encoded to form the
    payload value; the key is ``None``, the offset is 0 and the timestamp is
    the current local time.
    """
    encoded = json.dumps(msg).encode("utf-8")
    partition = Partition(Topic("replacements"), 0)
    return Message(partition, 0, KafkaPayload(None, encoded), datetime.now())
def test_send_message(
    self, value: str, expected: Optional[ProcessedMessage],
) -> None:
    """Check what ``SnapshotAwareWorker.process_message`` returns for ``value``.

    ``value`` is UTF-8 encoded into a Kafka payload tagged with a ``table``
    header, and the worker's result is compared against ``expected``.
    """
    worker = SnapshotAwareWorker(
        storage=get_storage("groupedmessages"),
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(uuid1()),
        transaction_data=TransactionData(xmin=100, xmax=200, xip_list=[120, 130]),
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    headers = [("table", "sentry_groupedmessage".encode())]
    payload = KafkaPayload(None, value.encode("utf-8"), headers)
    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0), 1, payload, datetime.now(),
    )

    assert worker.process_message(message) == expected
def __delivery_callback(
    self,
    future: Future[Message[KafkaPayload]],
    payload: KafkaPayload,
    error: KafkaError,
    message: ConfluentMessage,
) -> None:
    """Resolve ``future`` with the outcome reported for a produced message.

    Args:
        future: The future to resolve.
        payload: The payload that was produced; reused in the result message.
        error: The delivery error, or ``None`` on success.
        message: The broker's view of the produced message, used for its
            topic, partition, offset and timestamp.

    On error the future receives a ``TransportError`` wrapping ``error``.
    Otherwise it receives a ``Message`` built from ``message``. Any exception
    raised while building that result (including the ``ValueError`` raised
    when no timestamp is available) is stored on the future instead of being
    propagated to the caller.
    """
    if error is not None:
        future.set_exception(TransportError(error))
        return

    try:
        timestamp_type, timestamp_value = message.timestamp()
        # Compare by value: an identity check only works if the library hands
        # back the very same object as the constant, which is not guaranteed
        # for plain integer values.
        if timestamp_type == TIMESTAMP_NOT_AVAILABLE:
            raise ValueError("timestamp not available")

        future.set_result(
            Message(
                Partition(Topic(message.topic()), message.partition()),
                message.offset(),
                payload,
                # The raw value is divided by 1000 to obtain seconds.
                datetime.utcfromtimestamp(timestamp_value / 1000.0),
            )
        )
    except Exception as exc:
        future.set_exception(exc)
def test_stream_processor_termination_on_error() -> None:
    """Check that a failing strategy terminates the processor cleanly.

    When the strategy's ``submit`` raises, ``StreamProcessor.run`` must
    propagate that same exception, terminate the strategy once, and close the
    consumer once.
    """
    topic = Topic("test")
    failure = NotImplementedError("error")

    consumer = mock.Mock()
    consumer.poll.return_value = Message(Partition(topic, 0), 0, 0, datetime.now())

    strategy = mock.Mock()
    strategy.submit.side_effect = failure

    factory = mock.Mock()
    factory.create.return_value = strategy

    processor: StreamProcessor[int] = StreamProcessor(
        consumer, topic, factory, TestingMetricsBackend()
    )

    # Simulate the partition assignment the consumer would normally deliver.
    on_assign = consumer.subscribe.call_args.kwargs["on_assign"]
    on_assign({Partition(topic, 0): 0})

    with pytest.raises(Exception) as e:
        with assert_changes(lambda: strategy.terminate.call_count, 0, 1):
            with assert_changes(lambda: consumer.close.call_count, 0, 1):
                processor.run()

    assert e.value == failure
def test_offsets(self):
    """Check that the message's offset and partition are written with the row.

    An insert message at offset 123 on partition 456 is processed and
    flushed; the stored row must carry those same values.
    """
    # The event content itself doesn't really matter for this test.
    encoded_event = json.dumps((0, "insert", self.event)).encode("utf-8")
    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(None, encoded_event),
        datetime.now(),
    )

    replacement_topic_name = (
        enforce_table_writer(self.dataset)
        .get_stream_loader()
        .get_replacement_topic_spec()
        .topic_name
    )
    worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(replacement_topic_name),
        metrics=self.metrics,
    )

    worker.flush_batch([worker.process_message(message)])

    rows = self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    )
    assert rows == [(self.event["project_id"], self.event["event_id"], 123, 456)]
def parallel_transform_worker_apply(
    function: Callable[[Message[TPayload]], TTransformed],
    input_batch: MessageBatch[TPayload],
    output_block: SharedMemory,
    start_index: int = 0,
) -> Tuple[int, MessageBatch[TTransformed]]:
    """Transform messages from ``input_batch`` into a batch on ``output_block``.

    Processing starts at ``start_index`` and continues until the input is
    exhausted or the output batch cannot accept another message.

    Returns:
        A pair of the index of the first input message that was *not*
        written, and the output batch holding the transformed messages.

    Raises:
        ValueTooLarge: if a transformed message does not fit even into an
            empty output batch, since retrying could never succeed.
    """
    output_batch: MessageBatch[TTransformed] = MessageBatch(output_block)

    position = start_index
    while position < len(input_batch):
        message = input_batch[position]
        try:
            output_batch.append(
                Message(
                    message.partition,
                    message.offset,
                    function(message),
                    message.timestamp,
                )
            )
        except ValueTooLarge:
            # A non-empty batch means we simply ran out of room: hand back
            # what has been accumulated and let the caller resume later.
            if len(output_batch) != 0:
                break
            # An empty batch that still cannot hold the message never will.
            raise
        position += 1

    return position, output_batch
def test_skip_too_old(self):
    """Check that an event dated 300 days in the past yields no result."""
    replacement_topic_name = (
        enforce_table_writer(self.dataset)
        .get_stream_loader()
        .get_replacement_topic_spec()
        .topic_name
    )
    worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(replacement_topic_name),
        metrics=self.metrics,
    )

    # Rewrite every timestamp field of the fixture event to the stale moment.
    event = self.event
    stale_moment = datetime.utcnow() - timedelta(days=300)
    stale_text = stale_moment.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = stale_text
    event["data"]["datetime"] = stale_text
    event["data"]["received"] = int(calendar.timegm(stale_moment.timetuple()))

    encoded_event = json.dumps((0, "insert", event)).encode("utf-8")
    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, encoded_event),
        datetime.now(),
    )

    assert worker.process_message(message) is None
def test_delete_groups_insert(self):
    """Check that an ``end_delete_groups`` replacement removes the group.

    One event is written for group 1, then a replacement message deleting
    that group is processed and flushed; the issue count must drop from one
    entry to none.
    """
    project_id = self.project_id

    self.event["project_id"] = project_id
    self.event["group_id"] = 1
    self.write_raw_events(self.event)

    assert self._issue_count(self.project_id) == [{"count": 1, "group_id": 1}]

    deleted_at = datetime.now(tz=pytz.utc)
    replacement = (
        2,
        "end_delete_groups",
        {
            "project_id": project_id,
            "group_ids": [1],
            "datetime": deleted_at.strftime(PAYLOAD_DATETIME_FORMAT),
        },
    )
    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(None, json.dumps(replacement).encode("utf-8")),
        datetime.now(),
    )

    self.replacer.flush_batch([self.replacer.process_message(message)])

    assert self._issue_count(self.project_id) == []
def produce(
    self, partition: Partition, payload: TPayload, timestamp: datetime
) -> Message[TPayload]:
    """Append ``(payload, timestamp)`` to the partition and return the message.

    The returned message's offset is the number of entries the partition held
    before the append.
    """
    log = self.__get_messages(partition)
    assigned_offset = len(log)
    log.append((payload, timestamp))
    return Message(partition, assigned_offset, payload, timestamp)
def produce(self, partition: Partition, payload: TPayload) -> Message[TPayload]:
    """Append ``payload`` to the partition while holding the lock.

    Only the payload is stored. The returned message's offset is the number
    of entries the partition held before the append, and its timestamp is
    the ``epoch`` value taken from the enclosing scope.
    """
    with self.__lock:
        log = self.__topics[partition.topic][partition.index]
        assigned_offset = len(log)
        log.append(payload)

    return Message(partition, assigned_offset, payload, epoch)
def __make_msg(
    self, partition: int, offset: int, payload: str, headers: Headers
) -> Message[KafkaPayload]:
    """Build a message on topic ``topic`` with a fixed key and timestamp.

    ``payload`` is encoded to bytes and paired with the key ``b"key"`` and
    the given ``headers``; the timestamp is always 2019-06-19 06:46:28.
    """
    kafka_payload = KafkaPayload(b"key", payload.encode(), headers)
    fixed_timestamp = datetime(2019, 6, 19, 6, 46, 28)
    return Message(
        partition=Partition(Topic("topic"), partition),
        offset=offset,
        payload=kafka_payload,
        timestamp=fixed_timestamp,
    )
def produce(self, partition: Partition, payload: TPayload) -> Message[TPayload]:
    """Append ``payload`` to the partition, stamped with the clock's time.

    While holding the lock, the timestamp is derived from ``self.__clock``
    and stored alongside the payload. The returned message's offset is the
    number of entries the partition held before the append.
    """
    with self.__lock:
        log = self.__topics[partition.topic][partition.index]
        assigned_offset = len(log)
        produced_at = datetime.fromtimestamp(self.__clock.time())
        log.append((payload, produced_at))

    return Message(partition, assigned_offset, payload, produced_at)
def submit(self, message: Message[TPayload]) -> None:
    """Transform ``message`` and submit the result to the next step.

    The forwarded message keeps the original partition, offset and timestamp;
    only the payload is replaced by the transform function's return value.
    The step must not have been closed.
    """
    assert not self.__closed

    forward = self.__next_step.submit
    forward(
        Message(
            message.partition,
            message.offset,
            self.__transform_function(message),
            message.timestamp,
        )
    )
def produce(
    self, partition: Partition, payload: TPayload, timestamp: datetime
) -> Message[TPayload]:
    """Append an encoded record to the partition's file and return the message.

    Each record is written as a header packed from the encoded length and its
    CRC32 checksum, followed by the encoded ``(payload, timestamp)`` bytes.
    The message's offset is the writer position before the record, and its
    next offset is the writer position after the flush.
    """
    record = self.__codec.encode((payload, timestamp))
    writer = self.__get_file_partition(partition).writer

    record_start = writer.tell()
    header = self.__record_header.pack(len(record), crc32(record))
    writer.write(header)
    writer.write(record)
    writer.flush()
    record_end = writer.tell()

    return Message(partition, record_start, payload, timestamp, record_end)
def consume(self, partition: Partition, offset: int) -> Optional[Message[TPayload]]:
    """Return the message stored at ``offset`` in ``partition``, if any.

    Returns:
        The message at ``offset``, or ``None`` when ``offset`` is exactly one
        past the last stored message (nothing has been produced there yet).

    Raises:
        OffsetOutOfRange: if ``offset`` is negative or lies beyond the end of
            the partition.
    """
    messages = self.__get_messages(partition)

    # Sequence indexing treats a negative index as "count from the end", so
    # without this guard a negative offset would silently return an unrelated
    # message instead of being rejected.
    if offset < 0:
        raise OffsetOutOfRange()

    try:
        payload, timestamp = messages[offset]
    except IndexError:
        if offset == len(messages):
            return None
        raise OffsetOutOfRange()

    return Message(partition, offset, payload, timestamp)
def test_parallel_transform_worker_apply() -> None:
    """Check batching and resumption of ``parallel_transform_worker_apply``.

    Four messages of increasing size are transformed into a 4096-byte output
    block. The test expects the first call to stop after two messages, the
    second call to handle one more, and the third to raise because the last
    message cannot fit into an empty batch.
    """
    payload_sizes = [1000, 1000, 2000, 4000]
    messages = []
    for offset, size in enumerate(payload_sizes):
        messages.append(
            Message(
                Partition(Topic("test"), 0),
                offset,
                KafkaPayload(None, b"\x00" * size, None),
                datetime.now(),
            )
        )

    with SharedMemoryManager() as smm:
        input_block = smm.SharedMemory(8192)
        assert input_block.size == 8192

        input_batch = MessageBatch(input_block)
        for message in messages:
            input_batch.append(message)
        assert len(input_batch) == 4

        output_block = smm.SharedMemory(4096)
        assert output_block.size == 4096

        # The first batch should be able to fit 2 messages.
        resume_at, produced = parallel_transform_worker_apply(
            transform_payload_expand, input_batch, output_block,
        )
        assert resume_at == 2
        assert len(produced) == 2

        # The second batch should be able to fit one message.
        resume_at, produced = parallel_transform_worker_apply(
            transform_payload_expand, input_batch, output_block, resume_at,
        )
        assert resume_at == 3
        assert len(produced) == 1

        # The last message is too large to fit in the batch.
        with pytest.raises(ValueTooLarge):
            parallel_transform_worker_apply(
                transform_payload_expand, input_batch, output_block, resume_at,
            )
def test_delete_tag_promoted_insert(self):
    """Check that an ``end_delete_tag`` replacement removes the tag only.

    After the replacement for ``browser.name`` is processed and flushed, the
    query filtered on that tag must return nothing, while the unfiltered
    query must still count the event.
    """
    project_id = self.project_id

    self.event["project_id"] = project_id
    self.event["group_id"] = 1
    self.event["data"]["tags"].append(["browser.name", "foo"])
    self.event["data"]["tags"].append(["notbrowser", "foo"])
    self.write_raw_events(self.event)

    def _issue_count(total=False):
        # ``total`` drops the tag condition so every event is counted.
        conditions = [] if total else [["tags[browser.name]", "=", "foo"]]
        query = {
            "project": [project_id],
            "aggregations": [["count()", "", "count"]],
            "conditions": conditions,
            "groupby": ["group_id"],
        }
        response = self.app.post("/query", data=json.dumps(query))
        return json.loads(response.data)["data"]

    one_event = [{"count": 1, "group_id": 1}]
    assert _issue_count() == one_event
    assert _issue_count(total=True) == one_event

    deleted_at = datetime.now(tz=pytz.utc)
    replacement = (
        2,
        "end_delete_tag",
        {
            "project_id": project_id,
            "tag": "browser.name",
            "datetime": deleted_at.strftime(PAYLOAD_DATETIME_FORMAT),
        },
    )
    message: Message[KafkaPayload] = Message(
        Partition(Topic("replacements"), 1),
        42,
        KafkaPayload(None, json.dumps(replacement).encode("utf-8")),
        datetime.now(),
    )

    self.replacer.flush_batch([self.replacer.process_message(message)])

    assert _issue_count() == []
    assert _issue_count(total=True) == [{"count": 1, "group_id": 1}]
def consume(self, partition: Partition, offset: int) -> Optional[Message[TPayload]]:
    """Return the message stored at ``offset`` in ``partition``, if any.

    The partition is read while holding the lock.

    Returns:
        The message at ``offset``, or ``None`` when ``offset`` is exactly one
        past the last stored message.

    Raises:
        Exception: with the text ``"invalid offset"`` if ``offset`` is
            negative or lies beyond the end of the partition.
    """
    with self.__lock:
        messages = self.__topics[partition.topic][partition.index]

        # Sequence indexing treats a negative index as "count from the end",
        # so without this guard a negative offset would silently return an
        # unrelated message instead of being rejected.
        if offset < 0:
            raise Exception("invalid offset")

        try:
            payload, timestamp = messages[offset]
        except IndexError:
            if offset == len(messages):
                return None
            raise Exception("invalid offset")

    return Message(partition, offset, payload, timestamp)
def get_messages(events_file) -> Sequence[Message[KafkaPayload]]:
    """Create a fake Kafka message for each JSON event in the file.

    Args:
        events_file: Path of a text file holding one raw event per line.

    Returns:
        One message per line, in file order. Every message is placed on
        partition 1 of the ``events`` topic at offset 0, with a ``None`` key,
        the UTF-8 encoded line as its value and the current local time as its
        timestamp.
    """
    # The context manager guarantees the file handle is closed even if
    # building a message fails part-way through.
    with open(events_file) as events:
        return [
            Message(
                Partition(Topic("events"), 1),
                0,
                KafkaPayload(None, raw_event.encode("utf-8")),
                datetime.now(),
            )
            for raw_event in events
        ]
def set_decoded_future_result(
    encoded_future: Future[Message[TEncoded]],
) -> None:
    """Resolve ``decoded_future`` from the outcome of ``encoded_future``.

    A failure of the encoded future is copied onto the decoded one. On
    success the decoded future receives a message with the same partition,
    offset, timestamp and next offset, but carrying ``payload`` from the
    enclosing scope.
    """
    try:
        produced = encoded_future.result()
    except Exception as exc:
        decoded_future.set_exception(exc)
        return

    decoded_future.set_result(
        Message(
            produced.partition,
            produced.offset,
            payload,
            produced.timestamp,
            produced.next_offset,
        )
    )
def parallel_transform_worker_apply(
    function: Callable[[Message[TPayload]], TTransformed],
    input_batch: MessageBatch[TPayload],
    output_block: SharedMemory,
    start_index: int = 0,
) -> Tuple[int, MessageBatch[TTransformed]]:
    """Transform messages from ``input_batch`` into a batch on ``output_block``.

    Processing starts at ``start_index`` and continues until the input is
    exhausted or the output batch cannot accept another message. Exceptions
    raised by ``function`` are logged with a traceback and re-raised.

    Returns:
        A pair of the index of the first input message that was *not*
        written, and the output batch holding the transformed messages.

    Raises:
        ValueTooLarge: if a transformed message does not fit even into an
            empty output batch, since retrying could never succeed.
    """
    output_batch: MessageBatch[TTransformed] = MessageBatch(output_block)

    position = start_index
    while position < len(input_batch):
        message = input_batch[position]

        try:
            transformed = function(message)
        except Exception:
            # The remote traceback surfaced when the result is fetched from
            # the pool drops a lot of useful detail (and is usually
            # truncated). Logging here keeps that detail, at the cost of
            # reporting the error twice: once from the child, once from the
            # parent.
            logger.warning(
                "Caught exception while applying %r to %r!",
                function,
                message,
                exc_info=True,
            )
            raise

        try:
            output_batch.append(
                Message(
                    message.partition, message.offset, transformed, message.timestamp
                )
            )
        except ValueTooLarge:
            # A non-empty batch means we simply ran out of room: hand back
            # what has been accumulated and let the caller resume later.
            if len(output_batch) != 0:
                break
            # An empty batch that still cannot hold the message never will.
            raise

        position += 1

    return position, output_batch
def test_message_batch() -> None:
    """Check appending to, reading from and overflowing a ``MessageBatch``.

    A message with a 4000-byte value is appended to a batch backed by a
    4096-byte block and must be readable both by index and by iteration. A
    second append must raise ``ValueTooLarge`` and leave the length at one.
    """
    partition = Partition(Topic("test"), 0)

    with SharedMemoryManager() as smm:
        block = smm.SharedMemory(4096)
        assert block.size == 4096

        payload = KafkaPayload(None, b"\x00" * 4000, None)
        message = Message(partition, 0, payload, datetime.now())

        batch: MessageBatch[KafkaPayload] = MessageBatch(block)
        with assert_changes(lambda: len(batch), 0, 1):
            batch.append(message)

        assert batch[0] == message
        assert list(batch) == [message]

        # The block has no room left for a second copy of the message.
        with assert_does_not_change(lambda: len(batch), 1):
            with pytest.raises(ValueTooLarge):
                batch.append(message)
def poll(self, timeout: Optional[float] = None) -> Optional[Message[Tick]]:
    """Poll the wrapped consumer and turn consecutive messages into a tick.

    Returns ``None`` when the wrapped consumer returns no message, or when
    this is the first message seen for its partition. It also returns
    ``None`` when the timestamps of the previous and current message do not
    form a valid interval; that case is logged, and the stored details for
    the partition are left untouched. Otherwise returns a ``Message`` whose
    payload is a ``Tick`` spanning both messages, and records the current
    message's details for the next call.
    """
    current = self.__consumer.poll(timeout)
    if current is None:
        return None

    last_seen = self.__previous_messages.get(current.partition)

    tick: Optional[Message[Tick]] = None
    if last_seen is not None:
        try:
            time_interval = Interval(last_seen.timestamp, current.timestamp)
        except InvalidRangeError:
            logger.warning(
                "Could not construct valid time interval between %r and %r!",
                last_seen,
                current,
                exc_info=True,
            )
            return None

        offset_interval = Interval(last_seen.offset, current.offset)
        # The tick is positioned at the *previous* message's offset.
        tick = Message(
            current.partition,
            last_seen.offset,
            Tick(offset_interval, time_interval),
            current.timestamp,
        )

    self.__previous_messages[current.partition] = MessageDetails(
        current.offset, current.timestamp
    )
    return tick
def consume(self, partition: Partition, offset: int) -> Optional[Message[TPayload]]:
    """Read and decode the record stored at byte ``offset`` of the partition.

    Returns:
        The decoded message, whose next offset is the reader position after
        the record, or ``None`` when nothing could be read at ``offset`` and
        ``offset`` does not exceed the partition size.

    Raises:
        OffsetOutOfRange: if nothing could be read and ``offset`` is greater
            than the partition size.
        InvalidChecksum: if the CRC32 of the record body does not match the
            checksum stored in its header.
    """
    file_partition = self.__get_file_partition(partition)
    reader = file_partition.reader

    # Only seek when the reader is not already positioned at the offset.
    if reader.tell() != offset:
        reader.seek(offset)

    header = reader.read(self.__record_header.size)
    if not header:
        if offset > file_partition.size():
            raise OffsetOutOfRange()
        return None

    size, expected_checksum = self.__record_header.unpack(header)
    encoded = reader.read(size)

    actual_checksum = crc32(encoded)
    if actual_checksum != expected_checksum:
        raise InvalidChecksum(
            f"checksum mismatch: expected {expected_checksum:#0x}, got {actual_checksum:#0x}"
        )

    payload, timestamp = self.__codec.decode(encoded)
    return Message(partition, offset, payload, timestamp, reader.tell())
def test_stream_processor_lifecycle() -> None:
    """Walk a ``StreamProcessor`` through its lifecycle against mocks.

    Covers subscription, assignment, polling with and without a message,
    rejection of a message (pause, then resume once accepted), duplicate
    assignment and revocation errors, and shutdown.
    """
    topic = Topic("topic")

    def first_partition():
        # Build a fresh partition object for each use.
        return Partition(topic, 0)

    consumer = mock.Mock()
    strategy = mock.Mock()
    factory = mock.Mock()
    factory.create.return_value = strategy

    metrics = TestingMetricsBackend()

    with assert_changes(lambda: consumer.subscribe.call_count, 0, 1):
        processor: StreamProcessor[int] = StreamProcessor(
            consumer, topic, factory, metrics
        )

    # The processor should accept heartbeat messages without an assignment or
    # active processor.
    consumer.poll.return_value = None
    processor._run_once()

    msg = Message(first_partition(), 0, 0, datetime.now())

    # XXX: ``call().args``, ``call().kwargs`` are not available until 3.8
    positional, keyword = consumer.subscribe.call_args
    assert positional[0] == [topic]

    on_assign = keyword["on_assign"]
    on_revoke = keyword["on_revoke"]

    # Assignment should succeed if no assignment already exists.
    assigned_offsets = {first_partition(): 0}
    on_assign(assigned_offsets)

    # If ``Consumer.poll`` doesn't return a message, we should poll the
    # processing strategy, but not submit anything for processing.
    consumer.poll.return_value = None
    with assert_changes(lambda: strategy.poll.call_count, 0, 1):
        with assert_does_not_change(lambda: strategy.submit.call_count, 0):
            processor._run_once()

    # If ``Consumer.poll`` **does** return a message, we should poll the
    # processing strategy and submit the message for processing.
    consumer.poll.return_value = msg
    with assert_changes(lambda: strategy.poll.call_count, 1, 2):
        with assert_changes(lambda: strategy.submit.call_count, 0, 1):
            processor._run_once()
            assert strategy.submit.call_args_list[-1] == mock.call(msg)

    # If the message is rejected by the processing strategy, the consumer
    # should be paused and the message should be held for later.
    consumer.tell.return_value = assigned_offsets
    consumer.poll.return_value = msg
    strategy.submit.side_effect = MessageRejected()
    with assert_changes(lambda: consumer.pause.call_count, 0, 1):
        processor._run_once()
        assert strategy.submit.call_args_list[-1] == mock.call(msg)

    # If ``Consumer.poll`` returns a message when we expect it to be paused,
    # we should raise an exception.
    with pytest.raises(InvalidStateError):
        processor._run_once()

    # Once the message is accepted by the processing strategy, the consumer
    # should be resumed.
    consumer.poll.return_value = None
    strategy.submit.return_value = None
    strategy.submit.side_effect = None
    with assert_changes(lambda: consumer.resume.call_count, 0, 1):
        processor._run_once()
        assert strategy.submit.call_args_list[-1] == mock.call(msg)

    first_metric = metrics.calls[0]
    assert isinstance(first_metric, Timing)
    assert first_metric.name == "pause_duration_ms"

    # Assignment should fail if one already exists.
    with pytest.raises(InvalidStateError):
        on_assign({first_partition(): 0})

    # Revocation should succeed with an active assignment, and cause the
    # strategy instance to be closed.
    with assert_changes(lambda: strategy.close.call_count, 0, 1):
        on_revoke([first_partition()])

    # Revocation should fail without an active assignment.
    with pytest.raises(InvalidStateError):
        on_revoke([first_partition()])

    # The processor should not accept non-heartbeat messages without an
    # assignment or active processor.
    consumer.poll.return_value = msg
    with pytest.raises(InvalidStateError):
        processor._run_once()

    with assert_changes(lambda: consumer.close.call_count, 0, 1):
        processor._shutdown()
def poll(self, timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
    """
    Return the next message available to be consumed, if one is available.

    If no message is available, this method blocks for up to ``timeout``
    before returning ``None``. A timeout of ``0.0`` means "do not block",
    while a timeout of ``None`` means "block until a message is available
    (or forever)".

    Calling this method may also invoke subscription state change
    callbacks.

    This method may raise an ``EndOfPartition`` error (a subtype of
    ``ConsumerError``) when the consumer has reached the end of a partition
    that it is subscribed to and no additional messages are available. The
    ``partition`` attribute of the raised exception identifies the partition
    whose end was reached. Since this consumer multiplexes a set of
    partitions, the exception does not mean that *all* subscribed partitions
    are exhausted, only that one of them is; nor does it mean that no further
    messages will become available in later poll calls. Not every backend
    implementation supports this feature or is configured to raise in this
    scenario.

    Raises an ``InvalidState`` exception if the consumer is not in the
    consuming state (for example, because it has been closed).

    Raises a ``TransportError`` for transport-level errors and a
    ``ConsumerError`` for any other error reported with the message.
    """
    if self.__state is not KafkaConsumerState.CONSUMING:
        raise InvalidState(self.__state)

    # Only pass a timeout through when one was given, so that the underlying
    # consumer applies its own default otherwise.
    poll_args = () if timeout is None else (timeout,)
    raw: Optional[ConfluentMessage] = self.__consumer.poll(*poll_args)
    if raw is None:
        return None

    error: Optional[KafkaError] = raw.error()
    if error is not None:
        code = error.code()
        if code == KafkaError._PARTITION_EOF:
            raise EndOfPartition(
                Partition(Topic(raw.topic()), raw.partition()),
                raw.offset(),
            )
        if code == KafkaError._TRANSPORT:
            raise TransportError(str(error))
        raise ConsumerError(str(error))

    headers: Optional[Headers] = raw.headers()
    partition = Partition(Topic(raw.topic()), raw.partition())
    offset = raw.offset()
    decoded = self.__codec.decode(
        KafkaPayload(
            raw.key(),
            raw.value(),
            headers if headers is not None else [],
        )
    )
    # The raw timestamp value is divided by 1000 to obtain seconds.
    produced_at = datetime.utcfromtimestamp(raw.timestamp()[1] / 1000.0)
    result = Message(partition, offset, decoded, produced_at)

    # Remember where to continue from for this partition.
    self.__offsets[result.partition] = result.get_next_offset()

    return result