Example #1
    def __build_tick_consumer(self) -> CommitLogTickConsumer:
        consumer_configuration = build_kafka_consumer_configuration(
            self.__commit_log_topic_spec.topic,
            self.__consumer_group,
            auto_offset_reset=self.__auto_offset_reset,
            strict_offset_reset=self.__strict_offset_reset,
        )

        # Collect metrics from librdkafka if stats_collection_freq_ms is set
        # for this consumer group, falling back to the global default.
        stats_collection_frequency_ms = get_config(
            f"stats_collection_freq_ms_{self.__consumer_group}",
            get_config("stats_collection_freq_ms", 0),
        )

        if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

            def stats_callback(stats_json: str) -> None:
                stats = rapidjson.loads(stats_json)
                self.__metrics.gauge("librdkafka.total_queue_size",
                                     stats.get("replyq", 0))

            consumer_configuration.update({
                "statistics.interval.ms": stats_collection_frequency_ms,
                "stats_cb": stats_callback,
            })

        return CommitLogTickConsumer(
            KafkaConsumer(consumer_configuration),
            followed_consumer_group=self.__followed_consumer_group,
            time_shift=(
                timedelta(seconds=-self.__delay_seconds)
                if self.__delay_seconds is not None
                else None
            ),
        )
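
For context, the statistics hook used above is a standard librdkafka feature exposed by confluent-kafka-python rather than something specific to this codebase. The following is a minimal, standalone sketch of the same mechanism against a plain confluent_kafka.Consumer; the broker address, group id, and topic name are placeholders, not values taken from the example.

import json

from confluent_kafka import Consumer


def stats_callback(stats_json: str) -> None:
    # librdkafka emits its internal statistics as a JSON document; "replyq" is
    # the size of the internal reply queue, the same field gauged above.
    stats = json.loads(stats_json)
    print("librdkafka total queue size:", stats.get("replyq", 0))


consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "stats-demo",  # placeholder consumer group
        "statistics.interval.ms": 1000,  # ask librdkafka for stats every second
        "stats_cb": stats_callback,
    }
)
consumer.subscribe(["events"])  # placeholder topic

try:
    while True:
        consumer.poll(1.0)  # stats_callback is served from within poll()
except KeyboardInterrupt:
    pass
finally:
    consumer.close()
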
Example #2
    def __build_consumer(
        self, strategy_factory: ProcessingStrategyFactory[KafkaPayload]
    ) -> StreamProcessor[KafkaPayload]:
        configuration = build_kafka_consumer_configuration(
            self.storage.get_table_writer()
            .get_stream_loader()
            .get_default_topic_spec()
            .topic,
            bootstrap_servers=self.bootstrap_servers,
            group_id=self.group_id,
            auto_offset_reset=self.auto_offset_reset,
            queued_max_messages_kbytes=self.queued_max_messages_kbytes,
            queued_min_messages=self.queued_min_messages,
        )

        if self.commit_log_topic is None:
            consumer = KafkaConsumer(
                configuration,
                commit_retry_policy=self.__commit_retry_policy,
            )
        else:
            consumer = KafkaConsumerWithCommitLog(
                configuration,
                producer=self.producer,
                commit_log_topic=self.commit_log_topic,
                commit_retry_policy=self.__commit_retry_policy,
            )

        return StreamProcessor(consumer, self.raw_topic, strategy_factory)
Example #3
    def __build_consumer(
        self, strategy_factory: ProcessingStrategyFactory[KafkaPayload]
    ) -> StreamProcessor[KafkaPayload]:
        configuration = build_kafka_consumer_configuration(
            self.storage.get_table_writer()
            .get_stream_loader()
            .get_default_topic_spec()
            .topic,
            bootstrap_servers=self.bootstrap_servers,
            group_id=self.group_id,
            auto_offset_reset=self.auto_offset_reset,
            strict_offset_reset=self.strict_offset_reset,
            queued_max_messages_kbytes=self.queued_max_messages_kbytes,
            queued_min_messages=self.queued_min_messages,
        )

        if self.__cooperative_rebalancing:
            configuration["partition.assignment.strategy"] = "cooperative-sticky"

        stats_collection_frequency_ms = get_config(
            f"stats_collection_freq_ms_{self.group_id}",
            get_config("stats_collection_freq_ms", 0),
        )

        if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:
            configuration.update({
                "statistics.interval.ms": stats_collection_frequency_ms,
                "stats_cb": self.stats_callback,
            })

        if self.commit_log_topic is None:
            consumer = KafkaConsumer(
                configuration,
                commit_retry_policy=self.__commit_retry_policy,
            )
        else:
            consumer = KafkaConsumerWithCommitLog(
                configuration,
                producer=self.producer,
                commit_log_topic=self.commit_log_topic,
                commit_retry_policy=self.__commit_retry_policy,
            )

        return StreamProcessor(consumer, self.raw_topic, strategy_factory)
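
The cooperative-sticky assignment strategy set above is likewise a plain librdkafka consumer property. For reference, here is a minimal sketch of enabling incremental cooperative rebalancing on a bare confluent_kafka.Consumer (librdkafka/confluent-kafka 1.6 or newer); the broker, group, and topic below are placeholders.

from confluent_kafka import Consumer

consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "cooperative-demo",  # placeholder consumer group
        # With cooperative-sticky assignment, a rebalance only revokes and
        # reassigns the partitions that actually move, instead of stopping
        # every member of the group as eager rebalancing does.
        "partition.assignment.strategy": "cooperative-sticky",
    }
)
consumer.subscribe(["events"])  # placeholder topic
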
Example #4
def get_streaming_metrics_consumer(
    topic: str,
    commit_max_batch_size: int,
    commit_max_batch_time: int,
    max_batch_size: int,
    max_batch_time: float,
    processes: int,
    input_block_size: int,
    output_block_size: int,
    group_id: str,
    auto_offset_reset: str,
    factory_name: str,
    **options: Mapping[str, Union[str, int]],
) -> StreamProcessor:

    if factory_name == "multiprocess":
        processing_factory = MetricsConsumerStrategyFactory(
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
        )
    else:
        assert factory_name == "default"
        processing_factory = BatchConsumerStrategyFactory(
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time,
            commit_max_batch_size=commit_max_batch_size,
            commit_max_batch_time=commit_max_batch_time,
        )

    return StreamProcessor(
        KafkaConsumer(get_config(topic, group_id, auto_offset_reset)),
        Topic(topic),
        processing_factory,
    )
Example #5
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (
            storage.get_table_writer().get_stream_loader().get_commit_log_topic_spec()
            for storage in storages.values()
        )
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).

        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
Example #6
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(commit_log_topic_spec.topic_name)
            ),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=-delay_seconds) if delay_seconds is not None else None
        ),
    )

    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )
        ),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    (
                        Topic(result_topic)
                        if result_topic is not None
                        else Topic(result_topic_spec.topic_name)
                    ),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example #7
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:

    from arroyo import Topic, configure_metrics
    from arroyo.backends.kafka import KafkaConsumer
    from arroyo.processing import StreamProcessor
    from arroyo.processing.strategies.batching import BatchProcessingStrategyFactory

    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams.configuration_builder import (
        build_kafka_consumer_configuration,
    )

    setup_logging(log_level)
    setup_sentry()

    storage_key = StorageKey(storage_name)
    storage = get_writable_storage(storage_key)
    metrics_tags = {"group": consumer_group, "storage": storage_name}

    stream_loader = storage.get_table_writer().get_stream_loader()
    default_replacement_topic_spec = stream_loader.get_replacement_topic_spec()
    assert (
        default_replacement_topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."
    replacements_topic = replacements_topic or default_replacement_topic_spec.topic_name

    metrics = MetricsWrapper(environment.metrics, "replacer", tags=metrics_tags)

    configure_metrics(StreamMetricsAdapter(metrics))

    replacer = StreamProcessor(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                default_replacement_topic_spec.topic,
                bootstrap_servers=bootstrap_server,
                group_id=consumer_group,
                auto_offset_reset=auto_offset_reset,
                strict_offset_reset=not no_strict_offset_reset,
                queued_max_messages_kbytes=queued_max_messages_kbytes,
                queued_min_messages=queued_min_messages,
            ),
        ),
        Topic(replacements_topic),
        BatchProcessingStrategyFactory(
            worker=ReplacerWorker(storage, consumer_group, metrics=metrics),
            max_batch_size=max_batch_size,
            max_batch_time=max_batch_time_ms,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        replacer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    replacer.run()
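
Most of these examples install the same SIGINT/SIGTERM handler so the stream processor drains and commits cleanly instead of being killed mid-batch. Below is a stripped-down sketch of that pattern with a stand-in Processor class; only the signal_shutdown()/run() pair mirrors the arroyo StreamProcessor interface used above, everything else is illustrative.

import signal
import time
from typing import Any, Optional


class Processor:
    """Stand-in for arroyo's StreamProcessor: run() loops until shutdown."""

    def __init__(self) -> None:
        self.__shutdown_requested = False

    def signal_shutdown(self) -> None:
        # Flag checked by the processing loop; safe to set from a signal handler.
        self.__shutdown_requested = True

    def run(self) -> None:
        while not self.__shutdown_requested:
            time.sleep(0.1)  # placeholder for polling and processing a batch


processor = Processor()


def handler(signum: int, frame: Optional[Any]) -> None:
    processor.signal_shutdown()


signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)

processor.run()
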
Example #8
def build_scheduler_executor_consumer(
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    producer: Producer[KafkaPayload],
    auto_offset_reset: str,
    strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    metrics: MetricsBackend,
    scheduling_mode: Optional[SchedulingWatermarkMode],
) -> StreamProcessor[Tick]:
    dataset = get_dataset(dataset_name)

    # Only entities in the same dataset with the same partition count, commit log
    # topic and result topics may be run together.
    def get_topic_configuration_for_entity(
        entity_name: str,
    ) -> TopicConfig:
        storage = get_entity(EntityKey(entity_name)).get_writable_storage()
        assert storage is not None
        stream_loader = storage.get_table_writer().get_stream_loader()
        partition_count = stream_loader.get_default_topic_spec().partitions_number

        commit_log_topic_spec = stream_loader.get_commit_log_topic_spec()
        assert commit_log_topic_spec is not None

        result_topic_spec = stream_loader.get_subscription_result_topic_spec()
        assert result_topic_spec is not None
        return TopicConfig(partition_count, commit_log_topic_spec, result_topic_spec)

    entity_topic_configurations = [
        get_topic_configuration_for_entity(entity_name) for entity_name in entity_names
    ]
    entity_topic_configuration = entity_topic_configurations[0]
    for c in entity_topic_configurations[1:]:
        assert c == entity_topic_configuration

    partitions, commit_log_topic, result_topic = entity_topic_configuration

    tick_consumer = CommitLogTickConsumer(
        KafkaConsumer(
            build_kafka_consumer_configuration(
                commit_log_topic.topic,
                consumer_group,
                auto_offset_reset=auto_offset_reset,
                strict_offset_reset=strict_offset_reset,
            ),
        ),
        followed_consumer_group=followed_consumer_group,
        time_shift=(
            timedelta(seconds=delay_seconds * -1) if delay_seconds is not None else None
        ),
    )

    factory = CombinedSchedulerExecutorFactory(
        dataset,
        entity_names,
        partitions,
        max_concurrent_queries,
        total_concurrent_queries,
        producer,
        metrics,
        stale_threshold_seconds,
        result_topic.topic_name,
        schedule_ttl,
        scheduling_mode,
    )

    return StreamProcessor(
        tick_consumer,
        Topic(commit_log_topic.topic_name),
        factory,
    )
Example #9
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")
    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper()) for name in storage_names)
    }

    topics = {
        storage.get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    topic = Topic(topics.pop())
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (
                storage.get_table_writer()
                .get_stream_loader()
                .get_commit_log_topic_spec()
                for storage in storages.values()
            )
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    kafka_topic = (
        storages[storage_keys[0]]
        .get_table_writer()
        .get_stream_loader()
        .get_default_topic_spec()
        .topic
    )

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key]
                .get_table_writer()
                .get_stream_loader()
                .get_default_topic_spec()
                .topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )
    # Collect metrics from librdkafka if stats_collection_freq_ms is set for
    # this consumer group, falling back to the global default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size",
                          stats.get("replyq", 0))

        consumer_configuration.update({
            "statistics.interval.ms": stats_collection_frequency_ms,
            "stats_cb": stats_callback,
        })

    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumption that all storages are located on
        # the same Kafka cluster (validated above).

        commit_log_topic_spec = (
            storages[storage_keys[0]]
            .get_table_writer()
            .get_stream_loader()
            .get_commit_log_topic_spec()
        )
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(StreamsTopic(dead_letter_topic))
        )

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()
Example #10
def test_executor_consumer() -> None:
    """
    End to end integration test
    """
    state.set_config("subscription_mode_events", "new")
    admin_client = AdminClient(get_default_kafka_configuration())
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_SCHEDULED_EVENTS])
    create_topics(admin_client, [SnubaTopic.SUBSCRIPTION_RESULTS_EVENTS])

    dataset_name = "events"
    entity_name = "events"
    entity_key = EntityKey(entity_name)
    entity = get_entity(entity_key)
    storage = entity.get_writable_storage()
    assert storage is not None
    stream_loader = storage.get_table_writer().get_stream_loader()

    scheduled_result_topic_spec = stream_loader.get_subscription_result_topic_spec()
    assert scheduled_result_topic_spec is not None
    result_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_result_topic_spec.topic)
    )

    result_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            scheduled_result_topic_spec.topic,
            str(uuid.uuid1().hex),
            auto_offset_reset="latest",
            strict_offset_reset=False,
        )
    )
    assigned = False

    def on_partitions_assigned(partitions: Mapping[Partition, int]) -> None:
        nonlocal assigned
        assigned = True

    result_consumer.subscribe(
        [Topic(scheduled_result_topic_spec.topic_name)],
        on_assign=on_partitions_assigned,
    )

    attempts = 10
    while attempts > 0 and not assigned:
        result_consumer.poll(1.0)
        attempts -= 1

    # We need to wait for the consumer to receive partitions; otherwise, when
    # we try to consume messages, we will not find anything. Subscription is
    # an asynchronous process.
    assert assigned, "Did not receive assignment within 10 attempts"

    consumer_group = str(uuid.uuid1().hex)
    auto_offset_reset = "latest"
    strict_offset_reset = False
    executor = build_executor_consumer(
        dataset_name,
        [entity_name],
        consumer_group,
        result_producer,
        2,
        2,
        auto_offset_reset,
        strict_offset_reset,
        TestingMetricsBackend(),
        None,
    )
    for i in range(1, 5):
        # Give time to the executor to subscribe
        time.sleep(1)
        executor._run_once()

    # Produce a scheduled task to the scheduled subscriptions topic
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count()",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=EventsSubscription(data_dict={}),
    )

    task = ScheduledSubscriptionTask(
        timestamp=datetime(1970, 1, 1),
        task=SubscriptionWithMetadata(
            entity_key,
            Subscription(
                SubscriptionIdentifier(
                    PartitionId(1),
                    uuid.UUID("91b46cb6224f11ecb2ddacde48001122")),
                subscription_data,
            ),
            1,
        ),
    )

    encoder = SubscriptionScheduledTaskEncoder()
    encoded_task = encoder.encode(task)

    scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
    assert scheduled_topic_spec is not None
    tasks_producer = KafkaProducer(
        build_kafka_producer_configuration(scheduled_topic_spec.topic)
    )

    scheduled_topic = Topic(scheduled_topic_spec.topic_name)
    tasks_producer.produce(scheduled_topic, payload=encoded_task).result()
    tasks_producer.close()

    executor._run_once()
    executor.signal_shutdown()
    # Call run here so that the executor shuts down itself cleanly.
    executor.run()
    result = result_consumer.poll(5)
    assert result is not None, "Did not receive a result message"
    data = json.loads(result.payload.value)
    assert (
        data["payload"]["subscription_id"] == "1/91b46cb6224f11ecb2ddacde48001122"
    ), "Invalid subscription id"

    result_producer.close()
Example #11
def build_executor_consumer(
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    producer: Producer[KafkaPayload],
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    strict_offset_reset: Optional[bool],
    metrics: MetricsBackend,
    stale_threshold_seconds: Optional[int],
    cooperative_rebalancing: bool = False,
) -> StreamProcessor[KafkaPayload]:
    # Validate that a valid dataset/entity pair was passed in
    dataset = get_dataset(dataset_name)
    dataset_entity_names = [
        ENTITY_NAME_LOOKUP[e].value for e in dataset.get_all_entities()
    ]

    # Only entities in the same dataset with the same scheduled and result topics
    # may be run together

    def get_topics_for_entity(
        entity_name: str,
    ) -> Tuple[KafkaTopicSpec, KafkaTopicSpec]:
        assert (
            entity_name in dataset_entity_names
        ), f"Entity {entity_name} does not exist in dataset {dataset_name}"

        entity = get_entity(EntityKey(entity_name))
        storage = entity.get_writable_storage()

        assert (
            storage is not None
        ), f"Entity {entity_name} does not have a writable storage by default."

        stream_loader = storage.get_table_writer().get_stream_loader()

        scheduled_topic_spec = stream_loader.get_subscription_scheduled_topic_spec()
        assert scheduled_topic_spec is not None

        result_topic_spec = stream_loader.get_subscription_result_topic_spec()
        assert result_topic_spec is not None

        return scheduled_topic_spec, result_topic_spec

    scheduled_topic_spec, result_topic_spec = get_topics_for_entity(entity_names[0])

    for entity_name in entity_names[1:]:
        assert get_topics_for_entity(entity_name) == (
            scheduled_topic_spec,
            result_topic_spec,
        ), "All entities must have same scheduled and result topics"

    consumer_configuration = build_kafka_consumer_configuration(
        scheduled_topic_spec.topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=strict_offset_reset,
    )

    # Collect metrics from librdkafka if stats_collection_freq_ms is set for
    # this consumer group, falling back to the global default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size",
                          stats.get("replyq", 0))

        consumer_configuration.update({
            "statistics.interval.ms": stats_collection_frequency_ms,
            "stats_cb": stats_callback,
        })

    if cooperative_rebalancing:
        consumer_configuration["partition.assignment.strategy"] = "cooperative-sticky"

    return StreamProcessor(
        KafkaConsumer(consumer_configuration),
        Topic(scheduled_topic_spec.topic_name),
        SubscriptionExecutorProcessingFactory(
            max_concurrent_queries,
            total_concurrent_queries,
            dataset,
            entity_names,
            producer,
            metrics,
            stale_threshold_seconds,
            result_topic_spec.topic_name,
        ),
    )