Example #1
    def run_test(
        self,
        subscriptions: Collection[Subscription],
        start: timedelta,
        end: timedelta,
        expected: Collection[ScheduledSubscriptionTask],
        sort_key: Optional[Callable[[ScheduledSubscriptionTask],
                                    Tuple[datetime, uuid.UUID]]] = None,
    ) -> None:
        tick = self.build_tick(start, end)

        store = RedisSubscriptionDataStore(
            redis_client,
            self.entity_key,
            self.partition_id,
        )
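        # Seed the partition's Redis-backed store with the subscriptions under test.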
        for subscription in subscriptions:
            store.create(subscription.identifier.uuid, subscription.data)

        scheduler = SubscriptionScheduler(
            EntityKey.EVENTS,
            store,
            self.partition_id,
            timedelta(minutes=1),
            DummyMetricsBackend(strict=True),
        )

        result = list(scheduler.find(tick))
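        # find() yields the tasks that come due within the tick built above.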
        if sort_key:
            result.sort(key=sort_key)

        assert result == expected
Example #2
    def test(self) -> None:
        creator = SubscriptionCreator(self.dataset, EntityKey.EVENTS)
        subscription = SubscriptionData(
            project_id=1,
            query="MATCH (events) SELECT count() AS count",
            time_window_sec=10 * 60,
            resolution_sec=60,
            entity_subscription=create_entity_subscription(),
        )
        identifier = creator.create(subscription, Timer("test"))
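        # The created subscription should be readable back from the partition's Redis store.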
        assert (cast(
            List[Tuple[UUID, SubscriptionData]],
            RedisSubscriptionDataStore(
                redis_client,
                self.entity_key,
                identifier.partition,
            ).all(),
        )[0][1] == subscription)

        SubscriptionDeleter(self.entity_key,
                            identifier.partition).delete(identifier.uuid)
        assert (RedisSubscriptionDataStore(
            redis_client,
            self.entity_key,
            identifier.partition,
        ).all() == [])
Example #3
    def run_test(
        self,
        subscriptions: Collection[Subscription],
        start: timedelta,
        end: timedelta,
        expected: Collection[ScheduledTask[Subscription]],
        sort_key=None,
    ) -> None:
        store = RedisSubscriptionDataStore(
            redis_client,
            self.dataset,
            self.partition_id,
        )
        for subscription in subscriptions:
            store.create(subscription.identifier.uuid, subscription.data)

        scheduler = SubscriptionScheduler(
            store,
            self.partition_id,
            timedelta(minutes=1),
            DummyMetricsBackend(strict=True),
        )

        result = list(scheduler.find(self.build_interval(start, end)))
        if sort_key:
            result.sort(key=sort_key)

        assert result == expected
Example #4
    def test(self) -> None:
        creator = SubscriptionCreator(self.dataset)
        subscription = LegacySubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=10),
            resolution=timedelta(minutes=1),
        )
        identifier = creator.create(subscription, Timer("test"))
        assert (
            cast(
                List[Tuple[UUID, SubscriptionData]],
                RedisSubscriptionDataStore(
                    redis_client, self.dataset, identifier.partition,
                ).all(),
            )[0][1]
            == subscription
        )

        SubscriptionDeleter(self.dataset, identifier.partition).delete(identifier.uuid)
        assert (
            RedisSubscriptionDataStore(
                redis_client, self.dataset, identifier.partition,
            ).all()
            == []
        )
Example #5
def create_subscription() -> None:
    store = RedisSubscriptionDataStore(redis_client, EntityKey.EVENTS,
                                       PartitionId(0))
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH (events) SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )
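For completeness, a minimal read-back sketch under the same assumptions as the helper above. That all() returns (UUID, SubscriptionData) pairs follows Example #2; list_subscriptions itself is hypothetical:
def list_subscriptions() -> None:
    # Hypothetical counterpart to create_subscription() above: list what is
    # stored for partition 0 of the events entity.
    store = RedisSubscriptionDataStore(redis_client, EntityKey.EVENTS,
                                       PartitionId(0))
    for subscription_id, data in store.all():
        print(subscription_id, data.project_id)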
Example #6
    def __init__(
        self,
        entity_key: EntityKey,
        mode: SchedulingWatermarkMode,
        schedule_ttl: int,
        stale_threshold_seconds: Optional[int],
        partitions: int,
        producer: Producer[KafkaPayload],
        scheduled_topic_spec: KafkaTopicSpec,
        metrics: MetricsBackend,
    ) -> None:
        self.__mode = mode
        self.__stale_threshold_seconds = stale_threshold_seconds
        self.__partitions = partitions
        self.__producer = producer
        self.__scheduled_topic_spec = scheduled_topic_spec
        self.__metrics = metrics

        self.__buffer_size = settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
            entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE)

        self.__schedulers = {
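            # One scheduler per partition, each reading from its own Redis store.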
            index: SubscriptionScheduler(
                entity_key,
                RedisSubscriptionDataStore(redis_client, entity_key,
                                           PartitionId(index)),
                partition_id=PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=self.__metrics,
            )
            for index in range(self.__partitions)
        }
Example #7
    def test(self):
        creator = SubscriptionCreator(self.dataset)
        subscription = SubscriptionData(
            project_id=1,
            conditions=[],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=10),
            resolution=timedelta(minutes=1),
        )
        identifier = creator.create(subscription, Timer("test"))
        assert (
            RedisSubscriptionDataStore(
                redis_client, self.dataset, identifier.partition,
            ).all()[0][1]
            == subscription
        )

        SubscriptionDeleter(self.dataset, identifier.partition).delete(identifier.uuid)
        assert (
            RedisSubscriptionDataStore(
                redis_client, self.dataset, identifier.partition,
            ).all()
            == []
        )
Example #8
    def test(self, subscription: SubscriptionData) -> None:
        creator = SubscriptionCreator(self.dataset, EntityKey.EVENTS)
        identifier = creator.create(subscription, self.timer)
        assert (cast(
            List[Tuple[UUID, SubscriptionData]],
            RedisSubscriptionDataStore(
                redis_client,
                self.entity_key,
                identifier.partition,
            ).all(),
        )[0][1] == subscription)
Example #9
    def test(self):
        creator = SubscriptionCreator(self.dataset)
        subscription = SubscriptionData(
            project_id=123,
            conditions=[["platform", "IN", ["a"]]],
            aggregations=[["count()", "", "count"]],
            time_window=timedelta(minutes=10),
            resolution=timedelta(minutes=1),
        )
        identifier = creator.create(subscription, self.timer)
        assert (
            RedisSubscriptionDataStore(
                redis_client, self.dataset, identifier.partition,
            ).all()[0][1]
            == subscription
        )
Example #10
    def create(self, data: SubscriptionData, timer: Timer) -> SubscriptionIdentifier:
        data.validate()
        self._test_request(data, timer)

        identifier = SubscriptionIdentifier(
            self.__partitioner.build_partition_id(data),
            uuid1(),
        )
        RedisSubscriptionDataStore(
            redis_client, self.entity_key, identifier.partition
        ).create(
            identifier.uuid,
            data,
        )
        return identifier
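A minimal call sketch for the create() method above, assuming a dataset and a subscription_data value built as in Example #2; the variable names are illustrative:
creator = SubscriptionCreator(dataset, EntityKey.EVENTS)
identifier = creator.create(subscription_data, Timer("example"))
# The returned identifier carries the partition id and UUID under which
# the data was stored.
print(identifier.partition, identifier.uuid)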
Example #11
    def create(self, data: SubscriptionData,
               timer: Timer) -> SubscriptionIdentifier:
        # We want to test the query out here to make sure it's valid and can run
        request = data.build_request(self.dataset, datetime.utcnow(), None,
                                     timer)
        parse_and_run_query(self.dataset, request, timer)
        identifier = SubscriptionIdentifier(
            self.__partitioner.build_partition_id(data),
            uuid1(),
        )
        RedisSubscriptionDataStore(redis_client, self.dataset,
                                   identifier.partition).create(
                                       identifier.uuid,
                                       data,
                                   )
        return identifier
Example #12
    def create(self, data: SubscriptionData, timer: Timer) -> SubscriptionIdentifier:
        # We want to test the query out here to make sure it's valid and can run
        # If there is a delegate subscription, we need to run both the SnQL and Legacy validator
        if isinstance(data, DelegateSubscriptionData):
            self._test_request(data.to_snql(), timer)
            self._test_request(data.to_legacy(), timer)
        else:
            self._test_request(data, timer)

        identifier = SubscriptionIdentifier(
            self.__partitioner.build_partition_id(data), uuid1(),
        )
        RedisSubscriptionDataStore(
            redis_client, self.dataset, identifier.partition
        ).create(
            identifier.uuid, data,
        )
        return identifier
Example #13
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()
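    # The stream loader knows the dataset's default and commit-log topic specs.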

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #14
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (Topic(commit_log_topic) if commit_log_topic is not None else
             Topic(commit_log_topic_spec.topic_name)),
            set(commit_log_groups),
        ),
        time_shift=(timedelta(seconds=delay_seconds * -1)
                    if delay_seconds is not None else None),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
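    # ThreadPoolExecutor keeps its worker count in the private _max_workers
    # attribute, hence the defensive getattr below.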
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(redis_client, dataset,
                                                       PartitionId(index)),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions if partitions is not None else
                            loader.get_default_topic_spec().partitions_number)
                    },
                    producer,
                    Topic(result_topic) if result_topic is not None else Topic(
                        result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #15
    def delete(self, subscription_id: UUID) -> None:
        RedisSubscriptionDataStore(redis_client, self.dataset,
                                   self.partition).delete(subscription_id)
Example #16
def test_scheduler_consumer() -> None:
    settings.TOPIC_PARTITION_COUNTS = {"events": 2}
    importlib.reload(scheduler_consumer)

    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.COMMIT_LOG])

    metrics_backend = TestingMetricsBackend()
    entity_name = "events"
    entity = get_entity(EntityKey(entity_name))
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    commit_log_topic = Topic("snuba-commit-log")

    mock_scheduler_producer = mock.Mock()

    from snuba.redis import redis_client
    from snuba.subscriptions.data import PartitionId, SubscriptionData
    from snuba.subscriptions.entity_subscription import EventsSubscription
    from snuba.subscriptions.store import RedisSubscriptionDataStore

    entity_key = EntityKey(entity_name)
    partition_index = 0

    store = RedisSubscriptionDataStore(redis_client, entity_key,
                                       PartitionId(partition_index))
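    # Store one subscription so the scheduler has something to schedule.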
    store.create(
        uuid.uuid4(),
        SubscriptionData(
            project_id=1,
            time_window_sec=60,
            resolution_sec=60,
            query="MATCH events SELECT count()",
            entity_subscription=EventsSubscription(data_dict={}),
        ),
    )

    builder = scheduler_consumer.SchedulerBuilder(
        entity_name,
        str(uuid.uuid1().hex),
        "events",
        mock_scheduler_producer,
        "latest",
        False,
        60 * 5,
        None,
        None,
        metrics_backend,
    )
    scheduler = builder.build_consumer()
    time.sleep(2)
    scheduler._run_once()
    scheduler._run_once()
    scheduler._run_once()

    epoch = datetime(1970, 1, 1)

    producer = KafkaProducer(
        build_kafka_producer_configuration(
            stream_loader.get_default_topic_spec().topic,
        )
    )

    for (partition, offset, orig_message_ts) in [
        (0, 0, epoch),
        (1, 0, epoch + timedelta(minutes=1)),
        (0, 1, epoch + timedelta(minutes=2)),
        (1, 1, epoch + timedelta(minutes=3)),
    ]:
        fut = producer.produce(
            commit_log_topic,
            payload=commit_codec.encode(
                Commit(
                    "events",
                    Partition(commit_log_topic, partition),
                    offset,
                    orig_message_ts,
                ),
            ),
        )
        fut.result()

    producer.close()

    for _ in range(5):
        scheduler._run_once()

    scheduler._shutdown()

    assert mock_scheduler_producer.produce.call_count == 2
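    # Two schedule messages: the stored 60s-resolution subscription comes due
    # twice as the tick watermark advances across the offsets committed above.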

    settings.TOPIC_PARTITION_COUNTS = {}
Example #17
    def build_store(self, key="1") -> RedisSubscriptionDataStore:
        return RedisSubscriptionDataStore(redis_client, self.dataset, key)
Example #18
    def build_store(self, key: int = 1) -> RedisSubscriptionDataStore:
        return RedisSubscriptionDataStore(redis_client, self.entity_key,
                                          PartitionId(key))
Example #19
    def __init__(
        self,
        dataset: Dataset,
        entity_names: Sequence[str],
        partitions: int,
        max_concurrent_queries: int,
        total_concurrent_queries: int,
        producer: Producer[KafkaPayload],
        metrics: MetricsBackend,
        stale_threshold_seconds: Optional[int],
        result_topic: str,
        schedule_ttl: int,
        scheduling_mode: Optional[SchedulingWatermarkMode] = None,
    ) -> None:
        # TODO: self.__partitions might not be the same for each entity
        self.__partitions = partitions
        self.__entity_names = entity_names
        self.__metrics = metrics

        entity_keys = [EntityKey(entity_name) for entity_name in self.__entity_names]

        self.__schedulers = [
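            # For each entity: one scheduler per partition, each with its own Redis store.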
            {
                index: SubscriptionScheduler(
                    entity_key,
                    RedisSubscriptionDataStore(
                        redis_client, entity_key, PartitionId(index)
                    ),
                    partition_id=PartitionId(index),
                    cache_ttl=timedelta(seconds=schedule_ttl),
                    metrics=self.__metrics,
                )
                for index in range(self.__partitions)
            }
            for entity_key in entity_keys
        ]

        # If the entities being run together are configured with different
        # buffer sizes, just apply the largest one.
        self.__buffer_size = max(
            [
                settings.SUBSCRIPTIONS_ENTITY_BUFFER_SIZE.get(
                    entity_key.value, settings.SUBSCRIPTIONS_DEFAULT_BUFFER_SIZE
                )
                for entity_key in entity_keys
            ]
        )

        self.__executor_factory = SubscriptionExecutorProcessingFactory(
            max_concurrent_queries,
            total_concurrent_queries,
            dataset,
            entity_names,
            producer,
            metrics,
            stale_threshold_seconds,
            result_topic,
        )

        if scheduling_mode is not None:
            self.__mode = scheduling_mode
        else:
            modes = {
                self._get_entity_watermark_mode(entity_key)
                for entity_key in entity_keys
            }

            mode = modes.pop()
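            # pop() removed one mode; anything left over means the entities disagree.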

            assert len(modes) == 0, "Entities provided do not share the same mode"

            self.__mode = mode