def __init__(
    self,
    entity_key: EntityKey,
    mode: SchedulingWatermarkMode,
    schedule_ttl: int,
    stale_threshold_seconds: Optional[int],
    partitions: int,
    producer: Producer[KafkaPayload],
    scheduled_topic_spec: KafkaTopicSpec,
    metrics: MetricsBackend,
) -> None:
    self.__mode = mode
    self.__stale_threshold_seconds = stale_threshold_seconds
    self.__partitions = partitions
    self.__producer = producer
    self.__scheduled_topic_spec = scheduled_topic_spec
    self.__metrics = metrics

    self.__buffer_size = settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
        entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE
    )

    self.__schedulers = {
        index: SubscriptionScheduler(
            entity_key,
            RedisSubscriptionDataStore(redis_client, entity_key, PartitionId(index)),
            partition_id=PartitionId(index),
            cache_ttl=timedelta(seconds=schedule_ttl),
            metrics=self.__metrics,
        )
        for index in range(self.__partitions)
    }
def test_subscription_worker_consistent(subscription_data: SubscriptionData) -> None:
    state.set_config("event_subscription_non_consistent_sample_rate", 1)
    broker: Broker[SubscriptionTaskResult] = Broker(MemoryMessageStorage(), TestingClock())

    result_topic = Topic("subscription-results")
    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 1

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = TestingMetricsBackend()

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {
            0: SubscriptionScheduler(
                store, PartitionId(0), timedelta(), DummyMetricsBackend(strict=True)
            )
        },
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    worker.process_message(Message(Partition(Topic("events"), 0), 0, tick, now))

    time.sleep(0.1)

    assert (
        len(
            [
                m
                for m in metrics.calls
                if isinstance(m, Increment) and m.name == "consistent"
            ]
        )
        == 1
    )
def test(self):
    executor = SubscriptionExecutor(
        self.dataset, ThreadPoolExecutor(), DummyMetricsBackend(strict=True)
    )

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=self.project_id,
            conditions=[["platform", "IN", ["a"]]],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=500),
            resolution=timedelta(minutes=1),
        ),
    )

    now = datetime.utcnow()
    tick = Tick(
        offsets=Interval(1, 2),
        timestamps=Interval(now - timedelta(minutes=1), now),
    )

    result = executor.execute(ScheduledTask(now, subscription), tick).result()
    assert result["data"][0]["count"] == 10

    result = executor.execute(
        ScheduledTask(
            now + timedelta(minutes=self.minutes) + subscription.data.time_window,
            subscription,
        ),
        tick,
    ).result()
    assert result["data"][0]["count"] == 0
def test_encode(self):
    result = SubscriptionResult(
        ScheduledTask(
            datetime.now(),
            Subscription(
                SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                SubscriptionData(
                    1,
                    [],
                    [["count()", "", "count"]],
                    timedelta(minutes=1),
                    timedelta(minutes=1),
                ),
            ),
        ),
        {"data": {"count": 100}},
    )
    codec = SubscriptionResultCodec()
    message = codec.encode(result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 1
    payload = data["payload"]
    assert payload["subscription_id"] == str(result.task.task.identifier)
    assert payload["values"] == result.result
    assert payload["timestamp"] == result.task.timestamp.isoformat()
def build_subscription(resolution: timedelta, sequence: int) -> Subscription:
    return Subscription(
        SubscriptionIdentifier(PartitionId(1), UUIDS[sequence]),
        SnQLSubscriptionData(
            project_id=1,
            time_window=timedelta(minutes=5),
            resolution=resolution,
            query="MATCH events SELECT count()",
        ),
    )
def build_subscription(resolution: timedelta, sequence: int) -> Subscription:
    entity_subscription = EventsSubscription(data_dict={})
    return Subscription(
        SubscriptionIdentifier(PartitionId(1), UUIDS[sequence]),
        SubscriptionData(
            project_id=1,
            time_window_sec=int(timedelta(minutes=5).total_seconds()),
            resolution_sec=int(resolution.total_seconds()),
            query="MATCH events SELECT count()",
            entity_subscription=entity_subscription,
        ),
    )
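# Standalone sketch (not part of the snippet above): a quick check of the conversion
# used in build_subscription, where the newer SubscriptionData takes the window and
# resolution in whole seconds rather than as timedeltas.
from datetime import timedelta

assert int(timedelta(minutes=5).total_seconds()) == 300
assert int(timedelta(minutes=1).total_seconds()) == 60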
def delete_subscription(
    *, dataset: Dataset, partition: int, key: str, entity: Entity
) -> RespTuple:
    if entity not in dataset.get_all_entities():
        raise InvalidSubscriptionError(
            "Invalid subscription dataset and entity combination"
        )
    entity_key = ENTITY_NAME_LOOKUP[entity]
    SubscriptionDeleter(entity_key, PartitionId(partition)).delete(UUID(key))
    metrics.increment("subscription_deleted", tags={"entity": entity_key.value})
    return "ok", 202, {"Content-Type": "text/plain"}
def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = EventsSubscription(data_dict={})
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("events"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                EntityKey.EVENTS,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier
    )
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == EntityKey.EVENTS.value
def create_subscription() -> None:
    store = RedisSubscriptionDataStore(redis_client, EntityKey.EVENTS, PartitionId(0))
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH (events) SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )
def test_execute_and_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()
    commit = mock.Mock()

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        None,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # Eventually a message should be produced and offsets committed
    while (
        broker_storage.consume(Partition(result_topic, 0), 0) is None
        or commit.call_count == 0
    ):
        strategy.poll()

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription_identifier).encode("utf-8")
    assert commit.call_count == 1
def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    subscription_data = LegacySubscriptionData(
        project_id=1,
        conditions=[],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=1),
        resolution=timedelta(minutes=1),
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("events"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    task_result = SubscriptionTaskResult(
        ScheduledTask(
            timestamp,
            Subscription(
                SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                subscription_data,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 2
    payload = data["payload"]

    assert payload["subscription_id"] == str(task_result.task.task.identifier)
    assert payload["request"] == request.body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
def test_skip_stale_message() -> None:
    dataset = get_dataset("events")
    entity_names = ["events"]
    max_concurrent_queries = 2
    total_concurrent_queries = 2
    metrics = TestingMetricsBackend()

    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()
    commit = mock.Mock()

    stale_threshold_seconds = 60

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        max_concurrent_queries,
        total_concurrent_queries,
        stale_threshold_seconds,
        metrics,
        ProduceResult(producer, result_topic.name, commit),
        commit,
    )

    subscription_identifier = SubscriptionIdentifier(PartitionId(0), uuid.uuid1())

    make_message = generate_message(EntityKey.EVENTS, subscription_identifier)
    message = next(make_message)
    strategy.submit(message)

    # No message will be produced
    strategy.poll()
    assert broker_storage.consume(Partition(result_topic, 0), 0) is None
    assert Increment("skipped_execution", 1, {"entity": "events"}) in metrics.calls
def test_subscription_task_encoder() -> None:
    encoder = SubscriptionScheduledTaskEncoder()

    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH events SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    subscription_id = uuid.UUID("91b46cb6224f11ecb2ddacde48001122")

    epoch = datetime(1970, 1, 1)

    tick_upper_offset = 5

    subscription_with_metadata = SubscriptionWithMetadata(
        EntityKey.EVENTS,
        Subscription(
            SubscriptionIdentifier(PartitionId(1), subscription_id), subscription_data
        ),
        tick_upper_offset,
    )

    task = ScheduledSubscriptionTask(timestamp=epoch, task=subscription_with_metadata)

    encoded = encoder.encode(task)

    assert encoded.key == b"1/91b46cb6224f11ecb2ddacde48001122"

    assert encoded.value == (
        b"{"
        b'"timestamp":"1970-01-01T00:00:00",'
        b'"entity":"events",'
        b'"task":{'
        b'"data":{"project_id":1,"time_window":60,"resolution":60,"query":"MATCH events SELECT count()"}},'
        b'"tick_upper_offset":5'
        b"}"
    )

    decoded = encoder.decode(encoded)

    assert decoded == task
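# Illustrative standalone sketch (standard library only): the Kafka message key
# asserted above is just the string form of the SubscriptionIdentifier, i.e.
# "<partition>/<uuid hex>", as the encoder test's key assertion indicates.
import uuid as _uuid

partition = 1
subscription_uuid = _uuid.UUID("91b46cb6224f11ecb2ddacde48001122")
key = f"{partition}/{subscription_uuid.hex}".encode("utf-8")
assert key == b"1/91b46cb6224f11ecb2ddacde48001122"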
def generate_message(
    entity_key: EntityKey,
    subscription_identifier: Optional[SubscriptionIdentifier] = None,
) -> Iterator[Message[KafkaPayload]]:
    codec = SubscriptionScheduledTaskEncoder()
    epoch = datetime(1970, 1, 1)
    i = 0

    if subscription_identifier is None:
        subscription_identifier = SubscriptionIdentifier(PartitionId(1), uuid.uuid1())

    data_dict = {}
    if entity_key in (EntityKey.METRICS_SETS, EntityKey.METRICS_COUNTERS):
        data_dict = {"organization": 1}

    entity_subscription = ENTITY_KEY_TO_SUBSCRIPTION_MAPPER[entity_key](
        data_dict=data_dict
    )

    while True:
        payload = codec.encode(
            ScheduledSubscriptionTask(
                epoch + timedelta(minutes=i),
                SubscriptionWithMetadata(
                    entity_key,
                    Subscription(
                        subscription_identifier,
                        SubscriptionData(
                            project_id=1,
                            time_window_sec=60,
                            resolution_sec=60,
                            query=f"MATCH ({entity_key.value}) SELECT count()",
                            entity_subscription=entity_subscription,
                        ),
                    ),
                    i + 1,
                ),
            )
        )
        yield Message(Partition(Topic("test"), 0), i, payload, epoch)
        i += 1
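# Usage sketch for generate_message above, mirroring how the execute/produce tests
# drive it: each call to next() yields a message whose payload is an encoded
# ScheduledSubscriptionTask, with the offset and tick_upper_offset incrementing and
# the scheduled timestamp advancing by one minute per message.
#
#     make_message = generate_message(EntityKey.EVENTS)
#     first = next(make_message)   # offset 0, scheduled at the epoch
#     second = next(make_message)  # offset 1, scheduled one minute later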
def delete_subscription(*, dataset: Dataset, partition: int, key: str) -> RespTuple:
    SubscriptionDeleter(dataset, PartitionId(partition)).delete(UUID(key))
    return "ok", 202, {"Content-Type": "text/plain"}
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={"group": consumer_group, "dataset": dataset_name},
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1) if delay_seconds is not None else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug(
        "Starting %r with %s workers...", executor, getattr(executor, "_max_workers", 0)
    )
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(
        redis_client, entity_key, PartitionId(partition_index)
    )
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                )
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2

    settings.TOPIC_PARTITION_COUNTS = {}
def setup_method(self, test_method, dataset_name="events") -> None:
    super().setup_method(test_method, dataset_name)
    self.now = datetime.utcnow().replace(minute=0, second=0, microsecond=0)
    self.partition_id = PartitionId(1)
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""
    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
def build_partition_id(self, data: SubscriptionData) -> PartitionId:
    return PartitionId(
        crc32(str(data.project_id).encode("utf-8")) % self.__topic.partitions_number
    )
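# A minimal standalone sketch of the routing used by build_partition_id: the
# project_id is hashed with zlib.crc32 and reduced modulo the topic's partition
# count, so a given project always maps to the same partition. The partition count
# of 64 below is an arbitrary example value, not taken from the snippet above.
from zlib import crc32


def example_partition_for_project(project_id: int, partitions_number: int = 64) -> int:
    # Hash the decimal string form of the project id, matching build_partition_id.
    return crc32(str(project_id).encode("utf-8")) % partitions_number


assert example_partition_for_project(1) == example_partition_for_project(1)
assert 0 <= example_partition_for_project(1234) < 64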
def test_produce_result() -> None:
    state.set_config("subscription_mode_events", "new")
    epoch = datetime(1970, 1, 1)
    scheduled_topic = Topic("scheduled-subscriptions-events")
    result_topic = Topic("events-subscriptions-results")
    clock = TestingClock()
    broker_storage: MemoryMessageStorage[KafkaPayload] = MemoryMessageStorage()
    broker: Broker[KafkaPayload] = Broker(broker_storage, clock)
    broker.create_topic(scheduled_topic, partitions=1)
    broker.create_topic(result_topic, partitions=1)
    producer = broker.get_producer()
    commit = mock.Mock()

    strategy = ProduceResult(producer, result_topic.name, commit)

    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid.uuid1()), subscription_data
    )

    request = subscription_data.build_request(
        get_dataset("events"), epoch, None, Timer("timer")
    )
    result: Result = {
        "meta": [{"type": "UInt64", "name": "count"}],
        "data": [{"count": 1}],
    }

    message = Message(
        Partition(scheduled_topic, 0),
        1,
        SubscriptionTaskResult(
            ScheduledSubscriptionTask(
                epoch,
                SubscriptionWithMetadata(EntityKey.EVENTS, subscription, 1),
            ),
            (request, result),
        ),
        epoch,
    )

    strategy.submit(message)

    produced_message = broker_storage.consume(Partition(result_topic, 0), 0)
    assert produced_message is not None
    assert produced_message.payload.key == str(subscription.identifier).encode("utf-8")
    assert broker_storage.consume(Partition(result_topic, 0), 1) is None
    assert commit.call_count == 0
    strategy.poll()
    assert commit.call_count == 1

    # Commit is throttled so if we immediately submit another message,
    # the commit count will not change
    strategy.submit(message)
    strategy.poll()
    assert commit.call_count == 1

    # Commit count immediately increases once we call join()
    strategy.join()
    assert commit.call_count == 2
def setup_method(self) -> None:
    self.now = datetime.utcnow().replace(minute=0, second=0, microsecond=0)
    self.partition_id = PartitionId(1)
    self.entity_key = EntityKey("events")
def test_executor_consumer() -> None:
    """
    End to end integration test
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )

    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )
    assigned = False

    def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
        nonlocal assigned
        assigned = True

    result_consumer.subscribe(
        [Topic(scheduled_result_topic_spec.topic_name)],
        on_assign=on_partitions_assigned,
    )

    attempts = 10

    while attempts > 0 and not assigned:
        result_consumer.poll(1.0)
        attempts -= 1

    # We need to wait for the consumer to receive partitions otherwise,
    # when we try to consume messages, we will not find anything.
    # Subscription is an async process.
    assert assigned == True, "Did not receive assignment within 10 attempts"

    consumer_group = str(uuid.uuid1().hex)
    auto_offset_reset = "latest"
    strict_offset_reset = False
    executor = build_executor_consumer(
        dataset_name,
        [entity_name],
        consumer_group,
        result_producer,
        2,
        2,
        auto_offset_reset,
        strict_offset_reset,
        TestingMetricsBackend(),
        None,
    )
    for i in range(1, 5):
        # Give time to the executor to subscribe
        time.sleep(1)
        executor._run_once()

    # Produce a scheduled task to the scheduled subscriptions topic
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    task = ScheduledSubscriptionTask(
        timestamp=datetime(1970, 1, 1),
        task=SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier(
                    PartitionId(1), uuid.UUID("91b46cb6224f11ecb2ddacde48001122")
                ),
                subscription_data,
            ),
            1,
        ),
    )

    encoder = SubscriptionScheduledTaskEncoder()
    encoded_task = encoder.encode(task)

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None
    tasks_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_topic_spec.topic)
    )

    scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
    tasks_producer.close()

    executor._run_once()
    executor.signal_shutdown()
    # Call run here so that the executor shuts down itself cleanly.
    executor.run()
    result = result_consumer.poll(5)
    assert result is not None, "Did not receive a result message"
    data = json.loads(result.payload.value)
    assert (
        data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
    ), "Invalid subscription id"

    result_producer.close()
def test_subscription_worker(subscription_data: SubscriptionData) -> None:
    broker: Broker[SubscriptionTaskResult] = Broker(MemoryMessageStorage(), TestingClock())

    result_topic = Topic("subscription-results")
    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        subscription_data,
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        from_pattern = FunctionCall(
            String(ConditionFunctions.GTE),
            (
                Column(None, String("timestamp")),
                Literal(Datetime(timestamp - subscription.data.time_window)),
            ),
        )
        to_pattern = FunctionCall(
            String(ConditionFunctions.LT),
            (Column(None, String("timestamp")), Literal(Datetime(timestamp))),
        )

        condition = request.query.get_condition()
        assert condition is not None

        conditions = get_first_level_and_conditions(condition)

        assert any([from_pattern.match(e) for e in conditions])
        assert any([to_pattern.match(e) for e in conditions])

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }
def delete_subscription(*, dataset: Dataset, partition: int, key: str):
    ensure_not_internal(dataset)
    SubscriptionDeleter(dataset, PartitionId(partition)).delete(UUID(key))
    return "ok", 202, {"Content-Type": "text/plain"}
def __init__(
    self,
    dataset: Dataset,
    entity_names: Sequence[str],
    partitions: int,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    producer: Producer[KafkaPayload],
    metrics: MetricsBackend,
    stale_threshold_seconds: Optional[int],
    result_topic: str,
    schedule_ttl: int,
    scheduling_mode: Optional[SchedulingWatermarkMode] = None,
) -> None:
    # TODO: self.__partitions might not be the same for each entity
    self.__partitions = partitions
    self.__entity_names = entity_names
    self.__metrics = metrics

    entity_keys = [EntityKey(entity_name) for entity_name in self.__entity_names]

    self.__schedulers = [
        {
            index: SubscriptionScheduler(
                entity_key,
                RedisSubscriptionDataStore(
                    redis_client, entity_key, PartitionId(index)
                ),
                partition_id=PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=self.__metrics,
            )
            for index in range(self.__partitions)
        }
        for entity_key in entity_keys
    ]

    # Just apply the max buffer size if they are configured differently
    # for each entity that is being run together
    self.__buffer_size = max(
        [
            settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
                entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE
            )
            for entity_key in entity_keys
        ]
    )

    self.__executor_factory = SubscriptionExecutorProcessingFactory(
        max_concurrent_queries,
        total_concurrent_queries,
        dataset,
        entity_names,
        producer,
        metrics,
        stale_threshold_seconds,
        result_topic,
    )

    if scheduling_mode is not None:
        self.__mode = scheduling_mode
    else:
        modes = {
            self._get_entity_watermark_mode(entity_key) for entity_key in entity_keys
        }

        mode = modes.pop()

        assert len(modes) == 0, "Entities provided do not share the same mode"

        self.__mode = mode
def test_metrics_subscription_task_result_encoder(
    subscription_cls: Type[EntitySubscription], aggregate: str, entity_key: EntityKey
) -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = subscription_cls(data_dict={"organization": 1})
    subscription_data = SubscriptionData(
        project_id=1,
        query=(
            f"""
            MATCH ({entity_key.value}) SELECT {aggregate}(value) AS value
            BY project_id, tags[3]
            WHERE org_id = 1 AND project_id IN array(1) AND metric_id = 7
            AND tags[3] IN array(1,2)
            """
        ),
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(
        get_dataset("metrics"), timestamp, None, Timer("timer")
    )
    result: Result = {
        "data": [
            {"project_id": 1, "tags[3]": 13, "value": 8},
            {"project_id": 1, "tags[3]": 4, "value": 46},
        ],
        "meta": [
            {"name": "project_id", "type": "UInt64"},
            {"name": "tags[3]", "type": "UInt64"},
            {"name": "value", "type": "Float64"},
        ],
    }
    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                entity_key,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )
    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier
    )
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == entity_key.value
def build_store(self, key: int = 1) -> RedisSubscriptionDataStore:
    return RedisSubscriptionDataStore(redis_client, self.entity_key, PartitionId(key))
def setup_method(self) -> None:
    self.now = datetime.utcnow().replace(minute=0, second=0, microsecond=0)
    self.partition_id = PartitionId(1)
    self.dataset = get_dataset("events")
def test_subscription_worker(broker: Broker[SubscriptionTaskResult]) -> None:
    result_topic = Topic("subscription-results")
    broker.create_topic(result_topic, partitions=1)

    frequency = timedelta(minutes=1)
    evaluations = 3

    subscription = Subscription(
        SubscriptionIdentifier(PartitionId(0), uuid1()),
        SubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=60),
            resolution=frequency,
        ),
    )

    store = DummySubscriptionDataStore()
    store.create(subscription.identifier.uuid, subscription.data)

    metrics = DummyMetricsBackend(strict=True)

    dataset = get_dataset("events")
    worker = SubscriptionWorker(
        dataset,
        ThreadPoolExecutor(),
        {0: SubscriptionScheduler(store, PartitionId(0), timedelta(), metrics)},
        broker.get_producer(),
        result_topic,
        metrics,
    )

    now = datetime(2000, 1, 1)

    tick = Tick(
        offsets=Interval(0, 1),
        timestamps=Interval(now - (frequency * evaluations), now),
    )

    result_futures = worker.process_message(
        Message(Partition(Topic("events"), 0), 0, tick, now)
    )

    assert result_futures is not None and len(result_futures) == evaluations

    # Publish the results.
    worker.flush_batch([result_futures])

    # Check to make sure the results were published.
    # NOTE: This does not cover the ``SubscriptionTaskResultCodec``!
    consumer = broker.get_consumer("group")
    consumer.subscribe([result_topic])

    for i in range(evaluations):
        timestamp = now - frequency * (evaluations - i)

        message = consumer.poll()
        assert message is not None
        assert message.partition.topic == result_topic

        task, future = result_futures[i]
        future_result = request, result = future.result()
        assert message.payload.task.timestamp == timestamp
        assert message.payload == SubscriptionTaskResult(task, future_result)

        # NOTE: The time series extension is folded back into the request
        # body, ideally this would reference the timeseries options in
        # isolation.
        assert (
            request.body.items()
            > {
                "from_date": (timestamp - subscription.data.time_window).isoformat(),
                "to_date": timestamp.isoformat(),
            }.items()
        )

        assert result == {
            "meta": [{"name": "count", "type": "UInt64"}],
            "data": [{"count": 0}],
        }