Beispiel #1
0
def api(*, debug: bool, log_level: Optional[str], processes: int, threads: int) -> None:
    """Serve the Snuba web application on ``settings.PORT``.

    In debug mode the application object's own ``run`` method is used and
    only a single process/thread is accepted; otherwise serving is handed
    over to ``mywsgi``.
    """
    from snuba import settings

    if not debug:
        import mywsgi

        # In this mode the log level is only exported through the
        # environment; setup_logging is not called here.
        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        listen_address = f"0.0.0.0:{settings.PORT}"
        mywsgi.run(
            "snuba.web.wsgi:application",
            listen_address,
            processes=processes,
            threads=threads,
        )
        return

    # Debug mode: reject any request for more than one process or thread.
    if max(processes, threads) > 1:
        raise click.ClickException("processes/threads can only be 1 in debug")

    from snuba.web.views import application
    from werkzeug.serving import WSGIRequestHandler

    setup_logging(log_level)

    WSGIRequestHandler.protocol_version = "HTTP/1.1"
    application.run(port=settings.PORT, threaded=True, debug=debug)
Beispiel #2
0
def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    """Run the perf tool against a dataset.

    Logs an error and exits with status 1 unless ``local_dataset_mode()``
    reports a local dataset environment.
    """
    from snuba.perf import logger, run

    setup_logging(log_level)

    target_dataset = get_dataset(dataset_name)

    running_locally = local_dataset_mode()
    if not running_locally:
        logger.error(
            "The perf tool is only intended for local dataset environment.")
        sys.exit(1)

    run_options = {
        "repeat": repeat,
        "profile_process": profile_process,
        "profile_write": profile_write,
    }
    run(events_file, target_dataset, **run_options)
Beispiel #3
0
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    """Run ``snuba.optimize.run_optimize`` on a dataset's local table.

    The table name is taken from the dataset's table writer schema, and a
    ``ClickhousePool`` is opened against the given host/port with
    ``timeout`` as its send/receive timeout.
    """
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    # Midnight (UTC, naive) of the current day; handed to run_optimize as
    # its ``before`` cutoff.
    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    clickhouse = ClickhousePool(
        clickhouse_host,
        clickhouse_port,
        send_receive_timeout=timeout,
    )
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    # Pass the values as logger arguments so the message is only formatted
    # when the record is actually emitted.
    logger.info("Optimized %s partitions on %s", num_dropped, clickhouse_host)
Beispiel #4
0
def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    """Load a Postgres snapshot into ``dest_table`` via the dataset's bulk loader."""
    setup_logging(log_level)
    setup_sentry()

    log = logging.getLogger("snuba.load-snapshot")
    log.info(
        "Start bulk load process for dataset %s, from source %s",
        dataset_name,
        source,
    )
    target_dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    bulk_loader = enforce_table_writer(target_dataset).get_bulk_loader(
        snapshot, dest_table
    )

    # TODO: see whether we need to pass options to the writer
    bulk_writer = enforce_table_writer(target_dataset).get_bulk_writer(
        table_name=dest_table
    )
    buffered_writer = BufferedWriterWrapper(
        bulk_writer, settings.BULK_CLICKHOUSE_BUFFER
    )

    bulk_loader.load(buffered_writer)
Beispiel #5
0
def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    """Run the perf tool against a dataset.

    Logs an error and exits with status 1 if any storage of the dataset
    sits on a cluster that is not single node.
    """
    from snuba.perf import logger, run

    setup_logging(log_level)

    target_dataset = get_dataset(dataset_name)

    has_multi_node_storage = any(
        not storage.get_cluster().is_single_node()
        for storage in target_dataset.get_all_storages()
    )
    if has_multi_node_storage:
        logger.error(
            "The perf tool is only intended for single node environment.")
        sys.exit(1)

    run_options = {
        "repeat": repeat,
        "profile_process": profile_process,
        "profile_write": profile_write,
    }
    run(events_file, target_dataset, **run_options)
Beispiel #6
0
def migrate(*, log_level: Optional[str] = None) -> None:
    """Deprecated command: print a deprecation warning, then call ``run()``."""
    deprecation_notice = (
        "Warning: The migrate command is deprecated and will be removed soon\n"
    )
    click.echo(deprecation_notice)

    setup_logging(log_level)

    # ClickHouse connectivity is checked before anything is run.
    check_clickhouse_connections()

    run()
Beispiel #7
0
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    """Run ``snuba.optimize.run_optimize`` for one storage.

    When both ``clickhouse_host`` and ``clickhouse_port`` are given, a
    dedicated ``ClickhousePool`` is opened against that node using the
    cluster's credentials and database. Otherwise the cluster's own query
    connection is used, which is only accepted for single node clusters;
    a ``click.ClickException`` is raised for any other cluster.
    """
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    # Look the cluster up once and reuse it for credentials, database and
    # connection handling below.
    cluster = storage.get_cluster()

    (clickhouse_user, clickhouse_password) = cluster.get_credentials()

    # Midnight (UTC, naive) of the current day; handed to run_optimize as
    # its ``before`` cutoff.
    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = cluster.get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not cluster.is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = cluster.get_query_connection(ClickhouseClientSettings.OPTIMIZE)

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    # Pass the values as logger arguments so the message is only formatted
    # when the record is actually emitted. Note that ``clickhouse_host`` is
    # None here when the cluster's own connection was used.
    logger.info("Optimized %s partitions on %s", num_dropped, clickhouse_host)
Beispiel #8
0
 def api(*, debug: bool, log_level: Optional[str] = None) -> None:
     """Run the Snuba web application on 0.0.0.0 at ``settings.PORT``."""
     from snuba import settings
     from snuba.web.views import application
     from werkzeug.serving import WSGIRequestHandler

     setup_logging(log_level)

     WSGIRequestHandler.protocol_version = "HTTP/1.1"

     run_kwargs = {
         "host": "0.0.0.0",
         "port": settings.PORT,
         "threaded": True,
         "debug": debug,
     }
     application.run(**run_kwargs)
Beispiel #9
0
def migrate(force: bool, log_level: Optional[str] = None) -> None:
    """
    Runs all migrations. Blocking migrations will not be run unless --force is passed.
    """
    setup_logging(log_level)
    check_clickhouse_connections()
    migration_runner = Runner()

    try:
        migration_runner.run_all(force=force)
    except MigrationError as err:
        # Surface migration failures as a click error carrying the same text.
        raise click.ClickException(str(err))
    else:
        click.echo("Finished running migrations")
Beispiel #10
0
def cleanup(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    dry_run: bool,
    database: str,
    storage_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """

    setup_logging(log_level)

    from snuba.cleanup import run_cleanup, logger
    from snuba.clickhouse.native import ClickhousePool

    writable_storage = get_writable_storage(StorageKey(storage_name))

    # Look the cluster up once and reuse it for credentials and connection
    # handling below.
    cluster = writable_storage.get_cluster()

    clickhouse_user, clickhouse_password = cluster.get_credentials()

    table = writable_storage.get_table_writer().get_schema().get_local_table_name()

    # An explicit node is used only when host, port and database are all
    # given; otherwise the cluster's own query connection is used, which is
    # only accepted for single node clusters.
    if clickhouse_host and clickhouse_port and database:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
        )
    elif not cluster.is_single_node():
        raise click.ClickException(
            "Provide ClickHouse host and port for cleanup")
    else:
        connection = cluster.get_query_connection(ClickhouseClientSettings.CLEANUP)

    num_dropped = run_cleanup(connection, database, table, dry_run=dry_run)
    # Pass the values as logger arguments so the message is only formatted
    # when the record is actually emitted. Note that ``clickhouse_host`` may
    # be None here when the cluster's own connection was used.
    logger.info("Dropped %s partitions on %s", num_dropped, clickhouse_host)
Beispiel #11
0
def api(
    *,
    bind: Optional[str],
    debug: bool,
    log_level: Optional[str],
    processes: int,
    threads: int,
) -> None:
    """Serve the Snuba web application.

    ``bind`` must have the form ``<host>:<port>``; when it is empty the
    address comes from ``settings.HOST`` / ``settings.PORT``. A malformed
    ``bind`` (no colon, or a port that is not an integer) is reported as a
    ``click.ClickException``.

    In debug mode the application object's own ``run`` method is used and
    only a single process/thread is accepted; otherwise serving is handed
    over to ``mywsgi``.
    """
    from snuba import settings

    port: Union[int, str]
    if bind:
        # Split on the LAST colon so that hosts which themselves contain
        # colons (e.g. a bracketed IPv6 literal) keep their full host part.
        host, separator, raw_port = bind.rpartition(":")
        if not separator:
            raise click.ClickException("bind can only be in the format <host>:<port>")
        try:
            port = int(raw_port)
        except ValueError as err:
            # A non-numeric port is a usage error, not a crash.
            raise click.ClickException(
                "bind can only be in the format <host>:<port>"
            ) from err
    else:
        host, port = settings.HOST, settings.PORT

    if debug:
        if processes > 1 or threads > 1:
            raise click.ClickException("processes/threads can only be 1 in debug")

        from werkzeug.serving import WSGIRequestHandler

        from snuba.web.views import application

        setup_logging(log_level)

        WSGIRequestHandler.protocol_version = "HTTP/1.1"
        application.run(host=host, port=port, threaded=True, debug=debug)
    else:
        import mywsgi

        # In this mode the log level is only exported through the
        # environment; setup_logging is not called here.
        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        mywsgi.run(
            "snuba.web.wsgi:application",
            f"{host}:{port}",
            processes=processes,
            threads=threads,
        )
Beispiel #12
0
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """Load a Postgres snapshot into ``dest_table`` via a CDC storage's bulk loader."""
    setup_logging(log_level)
    setup_sentry()

    log = logging.getLogger("snuba.load-snapshot")
    log.info(
        "Start bulk load process for storage %s, from source %s",
        storage_name,
        source,
    )

    cdc_storage = get_cdc_storage(StorageKey(storage_name))
    writer_factory = cdc_storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    postgres_table = cdc_storage.get_postgres_table()
    row_processor = cdc_storage.get_row_processor()
    bulk_loader = writer_factory.get_bulk_loader(
        snapshot, postgres_table, dest_table, row_processor
    )

    # TODO: see whether we need to pass options to the writer
    # The same setting is used both as the batch writer's chunk size and as
    # the buffer size of the wrapper.
    buffer_size = settings.BULK_CLICKHOUSE_BUFFER
    batch_writer = writer_factory.get_batch_writer(
        environment.metrics,
        table_name=dest_table,
        chunk_size=buffer_size,
    )
    buffered_writer = BufferedWriterWrapper(
        batch_writer, buffer_size, JSONRowEncoder()
    )

    bulk_loader.load(buffered_writer)
Beispiel #13
0
def migrate(*,
            log_level: Optional[str] = None,
            dataset_name: Optional[str] = None) -> None:
    """Run ``snuba.migrate.run`` for one dataset, or for every name in ``DATASET_NAMES``.

    Logs an error and exits with status 1 unless ``local_dataset_mode()``
    is true.
    """
    from snuba.migrate import logger, run

    setup_logging(log_level)

    if not local_dataset_mode():
        logger.error("The migration tool can only work on local dataset mode.")
        sys.exit(1)

    if dataset_name:
        names_to_migrate = [dataset_name]
    else:
        names_to_migrate = DATASET_NAMES

    for name in names_to_migrate:
        current_dataset = get_dataset(name)
        logger.info("Migrating dataset %s", name)

        # A new client is created for every dataset, always against the
        # host/port from settings.
        connection = Client(
            host=settings.CLICKHOUSE_HOST,
            port=settings.CLICKHOUSE_PORT,
        )

        run(connection, current_dataset)
Beispiel #14
0
def run(
    group: str,
    migration_id: str,
    force: bool,
    fake: bool,
    dry_run: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Runs a single migration.
    --force must be passed in order to run blocking migrations.
    --fake marks a migration as completed without running anything.

    Migrations that are already in an in-progress or completed status will not be run.
    """
    setup_logging(log_level)
    # The connection check is skipped for dry runs.
    if not dry_run:
        check_clickhouse_connections()

    migration_runner = Runner()
    key = MigrationKey(MigrationGroup(group), migration_id)

    if dry_run:
        migration_runner.run_migration(key, dry_run=True)
        return

    fake_warning = "This will mark the migration as completed without actually running it. Your database may be in an invalid state. Are you sure?"

    try:
        if fake:
            # abort=True is passed so a negative answer does not fall through.
            click.confirm(fake_warning, abort=True)
        migration_runner.run_migration(key, force=force, fake=fake)
    except MigrationError as err:
        raise click.ClickException(str(err))
    else:
        click.echo(f"Finished running migration {key}")
Beispiel #15
0
def admin(
    *,
    debug: bool,
    log_level: Optional[str],
    processes: int,
    threads: int,
) -> None:
    """Serve the Snuba admin application on ``settings.ADMIN_HOST``/``ADMIN_PORT``.

    In debug mode the application object's own ``run`` method is used and
    only a single process/thread is accepted; otherwise serving is handed
    over to ``mywsgi``.
    """
    from snuba import settings

    host, port = settings.ADMIN_HOST, settings.ADMIN_PORT

    if not debug:
        import mywsgi

        # In this mode the log level is only exported through the
        # environment; setup_logging is not called here.
        if log_level:
            os.environ["LOG_LEVEL"] = log_level

        listen_address = f"{host}:{port}"
        mywsgi.run(
            "snuba.admin.wsgi:application",
            listen_address,
            processes=processes,
            threads=threads,
        )
        return

    # Debug mode: reject any request for more than one process or thread.
    if max(processes, threads) > 1:
        raise click.ClickException(
            "processes/threads can only be 1 in debug")

    from werkzeug.serving import WSGIRequestHandler

    from snuba.admin.views import application

    setup_logging(log_level)

    WSGIRequestHandler.protocol_version = "HTTP/1.1"
    application.run(host=host, port=port, threaded=True, debug=debug)
Beispiel #16
0
def cleanup(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    dry_run: bool,
    database: str,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Deletes stale partitions for ClickHouse tables
    """

    setup_logging(log_level)

    from snuba.cleanup import run_cleanup, logger
    from snuba.clickhouse.native import ClickhousePool

    dataset = get_dataset(dataset_name)
    # The table to clean is the local table of the dataset's writer schema.
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    clickhouse = ClickhousePool(clickhouse_host, clickhouse_port)
    num_dropped = run_cleanup(clickhouse, database, table, dry_run=dry_run)
    # Pass the values as logger arguments so the message is only formatted
    # when the record is actually emitted.
    logger.info("Dropped %s partitions on %s", num_dropped, clickhouse_host)
Beispiel #17
0
def reverse(
    group: str,
    migration_id: str,
    force: bool,
    fake: bool,
    dry_run: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Reverses a single migration.

    --force is required to reverse an already completed migration.
    --fake marks a migration as reversed without doing anything.
    """
    setup_logging(log_level)
    # The connection check is skipped for dry runs.
    if not dry_run:
        check_clickhouse_connections()

    migration_runner = Runner()
    key = MigrationKey(MigrationGroup(group), migration_id)

    if dry_run:
        migration_runner.reverse_migration(key, dry_run=True)
        return

    fake_warning = "This will mark the migration as not started without actually reversing it. Your database may be in an invalid state. Are you sure?"

    try:
        if fake:
            # abort=True is passed so a negative answer does not fall through.
            click.confirm(fake_warning, abort=True)
        migration_runner.reverse_migration(key, force=force, fake=fake)
    except MigrationError as err:
        raise click.ClickException(str(err))
    else:
        click.echo(f"Finished reversing migration {key}")
Beispiel #18
0
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    commit_log_topic: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    dead_letter_topic: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    """
    Consume a single Kafka topic and feed it to several writable storages.

    All storages must share the same default topic and the same Kafka
    bootstrap servers; a ValueError is raised otherwise. The processor runs
    until SIGINT or SIGTERM triggers its shutdown.
    """

    # 32 * 1e6 == 32,000,000. Unit is presumably bytes -- TODO confirm.
    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    # Block sizes only get a default when ``processes`` was given.
    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    logger.info("Consumer Starting")
    # Each name is upper-cased and looked up as an attribute of StorageKey.
    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    # Distinct default topic names across all requested storages.
    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec(
        ).topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    # NOTE: ``set.pop()`` raises KeyError when ``storage_names`` is empty.
    topic = Topic(topics.pop())
    # Anything left after the pop means more than one distinct topic.
    if topics:
        raise ValueError("only one topic is supported")

    commit_log: Optional[Topic]
    # An explicitly passed commit log topic wins over the storages' specs.
    if commit_log_topic:
        commit_log = Topic(commit_log_topic)
    else:
        # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
        # time. (It is less easily modified.) This also assumes the commit log
        # topic is on the same Kafka cluster as the input topic.
        commit_log_topics = {
            spec.topic_name
            for spec in (storage.get_table_writer().get_stream_loader(
            ).get_commit_log_topic_spec() for storage in storages.values())
            if spec is not None
        }

        if commit_log_topics:
            commit_log = Topic(commit_log_topics.pop())
        else:
            commit_log = None

        # Checked after the pop above, so this is only true when more than
        # one distinct commit log topic name was collected.
        if commit_log_topics:
            raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    # The consumer configuration is built from the first storage's default topic.
    kafka_topic = (storages[storage_keys[0]].get_table_writer().
                   get_stream_loader().get_default_topic_spec().topic)

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    if cooperative_rebalancing is True:
        consumer_configuration[
            "partition.assignment.strategy"] = "cooperative-sticky"

    # Every remaining storage must resolve to the same "bootstrap.servers"
    # value as the first one.
    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key].get_table_writer().get_stream_loader().
                get_default_topic_spec().topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    # The "storage" tag is the first storage key's value with "_m" appended.
    metrics = MetricsWrapper(
        environment.metrics,
        "consumer",
        tags={
            "group": consumer_group,
            "storage": "_".join([storage_keys[0].value, "m"]),
        },
    )
    # Collect metrics from librdkafka if we have stats_collection_freq_ms set
    # for the consumer group, or use the default.
    stats_collection_frequency_ms = get_config(
        f"stats_collection_freq_ms_{consumer_group}",
        get_config("stats_collection_freq_ms", 0),
    )

    if stats_collection_frequency_ms and stats_collection_frequency_ms > 0:

        def stats_callback(stats_json: str) -> None:
            # Report the "replyq" field of the stats payload as a gauge
            # (0 when the field is absent).
            stats = rapidjson.loads(stats_json)
            metrics.gauge("librdkafka.total_queue_size",
                          stats.get("replyq", 0))

        consumer_configuration.update({
            "statistics.interval.ms": stats_collection_frequency_ms,
            "stats_cb": stats_callback,
        })
    # Without a commit log topic a plain consumer is enough.
    if commit_log is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) all storages are
        # located on the same Kafka cluster (validated above.)

        # NOTE: the producer configuration always comes from the FIRST
        # storage's commit log topic spec, so the assertion below fails if
        # that storage has none, even when ``commit_log_topic`` was passed
        # explicitly.
        commit_log_topic_spec = (storages[storage_keys[0]].get_table_writer(
        ).get_stream_loader().get_commit_log_topic_spec())
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log,
        )

    # Dead letter producer/topic stay None unless a dead letter topic was given.
    dead_letter_producer: Optional[KafkaProducer] = None
    dead_letter_queue: Optional[Topic] = None
    if dead_letter_topic:
        dead_letter_queue = Topic(dead_letter_topic)

        dead_letter_producer = KafkaProducer(
            build_kafka_producer_configuration(
                StreamsTopic(dead_letter_topic)))

    configure_metrics(StreamMetricsAdapter(metrics))
    # ``max_batch_time_ms`` is divided by 1000.0, i.e. passed on in seconds.
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            max_batch_time_ms / 1000.0,
            parallel_collect=parallel_collect,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
            producer=dead_letter_producer,
            topic=dead_letter_queue,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        # Ask the processor to shut down when a signal arrives.
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)
    # Close the dead letter producer once the processor returns, if one exists.
    if dead_letter_producer:
        with closing(dead_letter_producer):
            processor.run()
    else:
        processor.run()
Beispiel #19
0
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    parallel_collect: bool,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
    profile_path: Optional[str] = None,
    cooperative_rebalancing: bool = False,
) -> None:
    """Build the base consumer for one storage and run it.

    SIGINT and SIGTERM trigger the consumer's ``signal_shutdown``.
    """

    setup_logging(log_level)
    setup_sentry()
    logger.info("Consumer Starting")
    storage_key = StorageKey(storage_name)

    metric_tags = {"group": consumer_group, "storage": storage_key.value}
    metrics = MetricsWrapper(environment.metrics, "consumer", tags=metric_tags)
    configure_metrics(StreamMetricsAdapter(metrics))

    def stats_callback(stats_json: str) -> None:
        # Report the "replyq" field of the stats payload as a gauge
        # (0 when the field is absent).
        queue_size = rapidjson.loads(stats_json).get("replyq", 0)
        metrics.gauge("librdkafka.total_queue_size", queue_size)

    kafka_params = KafkaParameters(
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        # The CLI flag is negative, the parameter is positive.
        strict_offset_reset=not no_strict_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )
    processing_params = ProcessingParameters(
        processes=processes,
        input_block_size=input_block_size,
        output_block_size=output_block_size,
    )

    builder = ConsumerBuilder(
        storage_key=storage_key,
        kafka_params=kafka_params,
        processing_params=processing_params,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        metrics=metrics,
        profile_path=profile_path,
        stats_callback=stats_callback,
        parallel_collect=parallel_collect,
        cooperative_rebalancing=cooperative_rebalancing,
    )
    stream_consumer = builder.build_base_consumer()

    def handler(signum: int, frame: Any) -> None:
        stream_consumer.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    stream_consumer.run()
def subscriptions_scheduler_executor(
    *,
    dataset_name: str,
    entity_names: Sequence[str],
    consumer_group: str,
    followed_consumer_group: str,
    max_concurrent_queries: int,
    total_concurrent_queries: int,
    auto_offset_reset: str,
    no_strict_offset_reset: bool,
    schedule_ttl: int,
    delay_seconds: Optional[int],
    stale_threshold_seconds: Optional[int],
    log_level: Optional[str],
    # TODO: Temporarily overrides the scheduling mode.
    # Required for single tenant since some partitions may be empty.
    # To be removed once transactions is no longer semantically partitioned.
    scheduling_mode: Optional[str],
) -> None:
    """
    Combined subscriptions scheduler and executor. Alternative to the separate scheduler and executor processes.
    """
    setup_logging(log_level)
    setup_sentry()

    metric_tags = {"dataset": dataset_name}
    metrics = MetricsWrapper(
        environment.metrics, "subscriptions.scheduler_executor", tags=metric_tags
    )
    configure_metrics(StreamMetricsAdapter(metrics))

    # The producer's topic configuration is taken from the first entity
    # only. The original author's note says all entities are checked to
    # share it before the consumer is built; that check is not in this block.
    first_entity = get_entity(EntityKey(entity_names[0]))
    writable_storage = first_entity.get_writable_storage()
    assert writable_storage is not None

    scheduled_topic_spec = (
        writable_storage.get_table_writer()
        .get_stream_loader()
        .get_subscription_scheduled_topic_spec()
    )
    assert scheduled_topic_spec is not None

    producer_configuration = build_kafka_producer_configuration(
        scheduled_topic_spec.topic,
        override_params={"partitioner": "consistent"},
    )
    producer = KafkaProducer(producer_configuration)

    # Only wrap the mode when one was actually supplied.
    if scheduling_mode is None:
        watermark_mode = None
    else:
        watermark_mode = SchedulingWatermarkMode(scheduling_mode)

    processor = build_scheduler_executor_consumer(
        dataset_name,
        entity_names,
        consumer_group,
        followed_consumer_group,
        producer,
        auto_offset_reset,
        not no_strict_offset_reset,
        schedule_ttl,
        delay_seconds,
        stale_threshold_seconds,
        max_concurrent_queries,
        total_concurrent_queries,
        metrics,
        watermark_mode,
    )

    def handler(signum: int, frame: Any) -> None:
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    with closing(producer), flush_querylog():
        processor.run()
Beispiel #21
0
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    writable_storage = dataset.get_default_entity().get_writable_storage()
    assert (
        writable_storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metric_tags = {"group": consumer_group, "dataset": dataset_name}
    metrics = MetricsWrapper(environment.metrics, "subscriptions", tags=metric_tags)

    # Consumer for the dataset's default topic, in the caller's group.
    data_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            loader.get_default_topic_spec().topic,
            consumer_group,
            auto_offset_reset=auto_offset_reset,
            bootstrap_servers=bootstrap_servers,
        )
    )
    # Consumer for the commit log; it gets a freshly generated group name
    # and always starts from the earliest offset.
    commit_log_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            commit_log_topic_spec.topic,
            f"subscriptions-commit-log-{uuid.uuid1().hex}",
            auto_offset_reset="earliest",
            bootstrap_servers=bootstrap_servers,
        )
    )
    # An explicitly passed commit log topic overrides the spec's topic name.
    if commit_log_topic is not None:
        followed_topic = Topic(commit_log_topic)
    else:
        followed_topic = Topic(commit_log_topic_spec.topic_name)

    synchronized_consumer = SynchronizedConsumer(
        data_consumer,
        commit_log_consumer,
        followed_topic,
        set(commit_log_groups),
    )

    # The delay is applied as a negative time shift.
    if delay_seconds is not None:
        time_shift = timedelta(seconds=delay_seconds * -1)
    else:
        time_shift = None

    tick_consumer = TickConsumer(synchronized_consumer, time_shift=time_shift)

    producer_configuration = build_kafka_producer_configuration(
        loader.get_default_topic_spec().topic,
        bootstrap_servers=bootstrap_servers,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )
    producer = ProducerEncodingWrapper(
        KafkaProducer(producer_configuration),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    # ``_max_workers`` is a private executor attribute, hence the getattr
    # with a fallback of 0.
    worker_count = getattr(executor, "_max_workers", 0)
    logger.debug("Starting %r with %s workers...", executor, worker_count)
    metrics.gauge("executor.workers", worker_count)

    with closing(tick_consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))

        if topic is not None:
            input_topic = Topic(topic)
        else:
            input_topic = Topic(loader.get_default_topic_spec().topic_name)

        if partitions is not None:
            partition_count = partitions
        else:
            partition_count = loader.get_default_topic_spec().partitions_number

        # One scheduler per partition index, keyed by that index.
        schedulers = {}
        for index in range(partition_count):
            data_store = RedisSubscriptionDataStore(
                redis_client, dataset, PartitionId(index)
            )
            schedulers[index] = SubscriptionScheduler(
                data_store,
                PartitionId(index),
                cache_ttl=timedelta(seconds=schedule_ttl),
                metrics=metrics,
            )

        if result_topic is not None:
            output_topic = Topic(result_topic)
        else:
            output_topic = Topic(result_topic_spec.topic_name)

        worker = SubscriptionWorker(
            dataset,
            executor,
            schedulers,
            producer,
            output_topic,
            metrics,
        )
        strategy_factory = BatchProcessingStrategyFactory(
            worker,
            max_batch_size,
            max_batch_time_ms,
        )
        batching_consumer = StreamProcessor(
            tick_consumer,
            input_topic,
            strategy_factory,
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
            signal.signal(shutdown_signal, handler)

        batching_consumer.run()
Beispiel #22
0
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    """
    Runs the replacer for a writable storage.

    Builds a Kafka consumer subscribed to the storage's replacement topic (or
    to ``replacements_topic`` when one is given), hands batches of messages to
    a ``ReplacerWorker`` and blocks in ``run()`` until SIGINT or SIGTERM
    requests a shutdown.
    """
    from snuba.replacer import ReplacerWorker
    from snuba.utils.streams import Topic
    from snuba.utils.streams.backends.kafka import (
        KafkaConsumer,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.processing import StreamProcessor
    from snuba.utils.streams.processing.strategies.batching import (
        BatchProcessingStrategyFactory,
    )

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(StorageKey(storage_name))

    topic_spec = (
        storage.get_table_writer().get_stream_loader().get_replacement_topic_spec()
    )
    assert (
        topic_spec is not None
    ), f"Storage {storage.get_storage_key().value} does not have a replacement topic."

    # An explicitly passed topic wins over the storage default.
    topic_name = replacements_topic if replacements_topic else topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags={"group": consumer_group, "storage": storage_name},
    )

    kafka_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            auto_offset_reset=auto_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
    )
    topic = Topic(topic_name)
    strategy_factory = BatchProcessingStrategyFactory(
        worker=ReplacerWorker(storage, metrics=metrics),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
    )

    processor = StreamProcessor(
        kafka_consumer,
        topic,
        strategy_factory,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum: int, frame: Any) -> None:
        # Ask the processor to stop instead of killing the process abruptly.
        processor.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    processor.run()
Beispiel #23
0
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    storage_name: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    """
    Announces that a snapshot has been loaded.

    Loads the snapshot descriptor found at ``source`` and publishes a
    snapshot-loaded message, carrying the descriptor's transaction data, on
    the control topic of the given CDC storage (or on ``control_topic`` when
    one is passed explicitly).
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for storage %s, from source %s",
        storage_name,
        source,
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    stream_loader = storage.get_table_writer().get_stream_loader()

    # An explicitly passed topic wins over the storage default.
    target_topic = control_topic or storage.get_default_control_topic()

    descriptor = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    ).get_descriptor()

    producer_config = build_kafka_producer_configuration(
        stream_loader.get_default_topic_spec().topic,
        bootstrap_servers=bootstrap_server,
        override_params={
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
    )
    producer = Producer(producer_config)

    payload = json.dumps(
        SnapshotLoaded(
            id=descriptor.id,
            transaction_info=TransactionData(
                xmin=descriptor.xmin,
                xmax=descriptor.xmax,
                xip_list=descriptor.xip_list,
            ),
        ).to_dict()
    )

    def on_delivery(error: KafkaError, message: Message) -> None:
        # Surface delivery failures; otherwise just record what was sent.
        if error is not None:
            raise error
        logger.info("Message sent %r", message.value())

    producer.produce(target_topic, value=payload, on_delivery=on_delivery)

    # Block until the message has actually been handed to the broker.
    producer.flush()
Beispiel #24
0
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Loads a Postgres snapshot into the table of a CDC storage.

    The snapshot found at ``source`` is read through the storage's bulk
    loader. When ``pre_processed`` is set the snapshot rows are written through
    a bulk writer as they are; otherwise they go through a buffered batch
    writer with JSON row encoding. ``show_progress`` attaches a progress bar
    sized from the snapshot's table file.
    """
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    # TODO: see whether we need to pass options to the writer
    loader = table_writer.get_bulk_loader(
        snapshot,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )

    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    # No progress reporting unless explicitly requested.
    progress_func: Optional[ProgressCallback] = None
    if show_progress:
        progress_bar = progressbar.ProgressBar(
            max_value=snapshot.get_table_file_size(storage.get_postgres_table())
        )
        progress_func = partial(progress_callback, progress_bar)

    table_descriptor = snapshot.get_descriptor().get_table(
        storage.get_postgres_table()
    )

    if not pre_processed:
        # Raw snapshot: rows are JSON-encoded and written in buffered batches.
        batch_writer = table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        )
        buffered_writer = BufferedWriterWrapper(
            batch_writer, settings.BULK_CLICKHOUSE_BUFFER, JSONRowEncoder(),
        )
        loader.load(
            buffered_writer, ignore_existing_data, progress_callback=progress_func
        )
        return

    # Pre-processed snapshot: hand it to the bulk writer using the column
    # layout and compression flag recorded in the table descriptor.
    bulk_writer = table_writer.get_bulk_writer(
        metrics=environment.metrics,
        encoding="gzip" if table_descriptor.zip else None,
        column_names=[column.name for column in table_descriptor.columns or []],
        table_name=dest_table,
    )
    loader.load_preprocessed(
        bulk_writer, ignore_existing_data, progress_callback=progress_func
    )
Beispiel #25
0
def consumer(
    *,
    raw_events_topic: Optional[str],
    replacements_topic: Optional[str],
    commit_log_topic: Optional[str],
    control_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    stateful_consumer: bool,
    rapidjson_deserialize: bool,
    rapidjson_serialize: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Runs the consumer for a storage.

    When ``dataset_name`` is given, the storage is resolved from the dataset's
    writable storage instead of ``storage_name``. With ``stateful_consumer``
    the consumer is driven by a ``ConsumerStateMachine`` listening on the
    control topic of a CDC storage; otherwise the base consumer produced by
    the ``ConsumerBuilder`` is run directly. In both cases SIGINT and SIGTERM
    trigger ``signal_shutdown()`` on whatever is running.
    """
    if not bootstrap_server:
        # Pick the per-dataset or per-storage broker mapping; both fall back
        # to the global default when they have no matching entry.
        broker_map, broker_key = (
            (settings.DEFAULT_DATASET_BROKERS, dataset_name)
            if dataset_name
            else (settings.DEFAULT_STORAGE_BROKERS, storage_name)
        )
        bootstrap_server = broker_map.get(broker_key, settings.DEFAULT_BROKERS)

    setup_logging(log_level)
    setup_sentry()

    # TODO: Remove this once dataset_name is no longer being passed
    if dataset_name:
        writable_storage = get_dataset(dataset_name).get_writable_storage()
        if not writable_storage:
            raise click.ClickException(
                f"Dataset {dataset_name} has no writable storage")

        # Reverse lookup: storage object -> the name it is registered under.
        name_by_storage = {
            registered: name for name, registered in WRITABLE_STORAGES.items()
        }
        storage_name = name_by_storage[writable_storage]

    consumer_builder = ConsumerBuilder(
        storage_name=storage_name,
        raw_topic=raw_events_topic,
        replacements_topic=replacements_topic,
        max_batch_size=max_batch_size,
        max_batch_time_ms=max_batch_time_ms,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
        rapidjson_deserialize=rapidjson_deserialize,
        rapidjson_serialize=rapidjson_serialize,
    )

    if stateful_consumer:
        storage = get_cdc_storage(storage_name)
        assert storage is not None, "Only CDC storages have a control topic thus are supported."
        runner = ConsumerStateMachine(
            consumer_builder=consumer_builder,
            topic=control_topic or storage.get_default_control_topic(),
            group_id=consumer_group,
            storage=storage,
        )
    else:
        runner = consumer_builder.build_base_consumer()

    def handler(signum, frame) -> None:
        # Both runner kinds expose signal_shutdown().
        runner.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    runner.run()
Beispiel #26
0
def confirm_load(
    *,
    control_topic: Optional[str],
    bootstrap_server: Sequence[str],
    dataset_name: str,
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    """
    Confirms the snapshot has been loaded by sending the
    snapshot-loaded message on the control topic.

    The dataset's writable storage must be a ``CdcStorage``; its default
    control topic is used unless ``control_topic`` is given. When no
    ``bootstrap_server`` is passed, the brokers configured for
    ``dataset_name`` in ``settings.DEFAULT_DATASET_BROKERS`` are used, falling
    back to ``settings.DEFAULT_BROKERS``.

    Raises whatever error the Kafka producer reports for the delivery of the
    message.
    """

    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.loaded-snapshot")
    logger.info(
        "Sending load completion message for dataset %s, from source %s",
        dataset_name,
        source,
    )

    dataset = get_dataset(dataset_name)

    storage = dataset.get_writable_storage()

    assert isinstance(
        storage, CdcStorage
    ), "Only CDC storages have a control topic thus are supported."

    control_topic = control_topic or storage.get_default_control_topic()

    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    descriptor = snapshot_source.get_descriptor()

    if not bootstrap_server:
        # The broker mapping is keyed by dataset *name*. Looking it up with
        # the dataset object never matches, so per-dataset broker overrides
        # would be silently ignored in favour of the global default.
        bootstrap_server = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name,
            settings.DEFAULT_BROKERS,
        )

    producer = Producer({
        "bootstrap.servers": ",".join(bootstrap_server),
        "partitioner": "consistent",
        "message.max.bytes": 50000000,  # 50MB, default is 1MB
    })

    msg = SnapshotLoaded(
        id=descriptor.id,
        transaction_info=TransactionData(
            xmin=descriptor.xmin,
            xmax=descriptor.xmax,
            xip_list=descriptor.xip_list,
        ),
    )
    json_string = json.dumps(msg.to_dict())

    def delivery_callback(error, message) -> None:
        # Invoked by the producer once the broker has acknowledged (or
        # rejected) the message.
        if error is not None:
            raise error
        else:
            logger.info("Message sent %r", message.value())

    producer.produce(
        control_topic,
        value=json_string,
        on_delivery=delivery_callback,
    )

    # Block until the message has actually been handed to the broker.
    producer.flush()
Beispiel #27
0
def replacer(
    *,
    replacements_topic: Optional[str],
    consumer_group: str,
    bootstrap_server: Sequence[str],
    dataset_name: Optional[str],
    storage_name: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    log_level: Optional[str] = None,
) -> None:
    """
    Runs the replacer for a writable storage.

    The storage is looked up by ``storage_name``, or taken from the dataset's
    writable storage when ``dataset_name`` is given. A ``BatchingConsumer``
    then reads the replacement topic and hands batches to a ``ReplacerWorker``
    backed by a dedicated ClickHouse connection pool, until SIGINT or SIGTERM
    requests a shutdown.
    """
    from snuba.clickhouse.native import ClickhousePool
    from snuba.replacer import ReplacerWorker
    from snuba.utils.codecs import PassthroughCodec
    from snuba.utils.streams.batching import BatchingConsumer
    from snuba.utils.streams.kafka import (
        KafkaConsumer,
        KafkaPayload,
        TransportError,
        build_kafka_consumer_configuration,
    )
    from snuba.utils.streams.types import Topic

    setup_logging(log_level)
    setup_sentry()

    storage = get_writable_storage(storage_name)

    # If dataset_name is provided, use the writable storage from that dataset.
    # This can be removed once we are passing storage_name instead of
    # dataset_name everywhere
    if dataset_name:
        storage = get_dataset(dataset_name).get_writable_storage()
        metrics_tags = {"group": consumer_group, "dataset": dataset_name}
    else:
        metrics_tags = {"group": consumer_group, "storage": storage_name}

    topic_spec = (
        storage.get_table_writer().get_stream_loader().get_replacement_topic_spec()
    )
    assert (topic_spec is not None
            ), f"Storage {type(storage)} does not have a replacement topic."

    # An explicitly passed topic wins over the storage default.
    topic_name = replacements_topic if replacements_topic else topic_spec.topic_name

    metrics = MetricsWrapper(
        environment.metrics,
        "replacer",
        tags=metrics_tags,
    )

    # Replacing existing rows requires reconstructing the entire tuple for each
    # event (via a SELECT), which is a Hard Thing (TM) for columnstores to do. With
    # the default settings it's common for ClickHouse to go over the default
    # max_memory_usage of 10GB per query. Lowering the max_block_size reduces
    # memory usage, and increasing the max_memory_usage gives the query more
    # breathing room. The uncompressed cache is disabled so the count() queries
    # don't use up production cache.
    clickhouse = ClickhousePool(
        settings.CLICKHOUSE_HOST,
        settings.CLICKHOUSE_PORT,
        client_settings={
            "max_block_size": settings.REPLACER_MAX_BLOCK_SIZE,
            "max_memory_usage": settings.REPLACER_MAX_MEMORY_USAGE,
            "use_uncompressed_cache": 0,
        },
    )

    codec: PassthroughCodec[KafkaPayload] = PassthroughCodec()
    kafka_consumer = KafkaConsumer(
        build_kafka_consumer_configuration(
            bootstrap_servers=bootstrap_server,
            group_id=consumer_group,
            auto_offset_reset=auto_offset_reset,
            queued_max_messages_kbytes=queued_max_messages_kbytes,
            queued_min_messages=queued_min_messages,
        ),
        codec=codec,
    )
    topic = Topic(topic_name)
    worker = ReplacerWorker(clickhouse, storage, metrics=metrics)

    batching_consumer = BatchingConsumer(
        kafka_consumer,
        topic,
        worker=worker,
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        recoverable_errors=[TransportError],
    )

    def handler(signum, frame) -> None:
        # Ask the consumer to stop instead of killing the process abruptly.
        batching_consumer.signal_shutdown()

    for shutdown_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(shutdown_signal, handler)

    batching_consumer.run()
Beispiel #28
0
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    migrate: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.

    With ``kafka`` set, waits for the Kafka cluster to answer (retrying once
    per second, up to 60 attempts) and then requests creation of every topic
    declared by the writable storages of the active datasets. With ``migrate``
    set, checks the ClickHouse connections and runs all migrations. Refuses to
    do anything unless ``force`` is passed.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        max_attempts = 60
        for attempt in range(max_attempts):
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)",
                             attempt)
                client = AdminClient({
                    "bootstrap.servers": ",".join(bootstrap_server),
                    "socket.timeout.ms": 1000,
                })
                # Listing topics is the actual connectivity probe.
                client.list_topics(timeout=1)
            except Exception as e:
                logger.error("Connection to Kafka failed (attempt %d)",
                             attempt,
                             exc_info=e)
                # Give up by re-raising once the last attempt has failed.
                if attempt + 1 == max_attempts:
                    raise
                time.sleep(1)
            else:
                break

        # Topic name -> creation request; a topic shared by several storages
        # is only requested once.
        topics = {}
        for dataset_name in ACTIVE_DATASET_NAMES:
            for entity in get_dataset(dataset_name).get_all_entities():
                writable_storage = entity.get_writable_storage()
                if not writable_storage:
                    continue

                stream_loader = (
                    writable_storage.get_table_writer().get_stream_loader()
                )
                for topic_spec in stream_loader.get_all_topic_specs():
                    topic_name = topic_spec.topic_name
                    if topic_name not in topics:
                        logger.debug("Adding topic %s to creation list",
                                     topic_name)
                        topics[topic_name] = NewTopic(
                            topic_name,
                            num_partitions=topic_spec.partitions_number,
                            replication_factor=topic_spec.replication_factor,
                        )

        logger.debug("Initiating topic creation")
        creation_results = client.create_topics(list(topics.values()),
                                                operation_timeout=1)
        for topic, future in creation_results.items():
            # A failure for one topic is logged and does not abort the others.
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    if migrate:
        check_clickhouse_connections()
        Runner().run_all(force=True)
Beispiel #29
0
def multistorage_consumer(
    storage_names: Sequence[str],
    consumer_group: str,
    max_batch_size: int,
    max_batch_time_ms: int,
    auto_offset_reset: str,
    queued_max_messages_kbytes: int,
    queued_min_messages: int,
    processes: Optional[int],
    input_block_size: Optional[int],
    output_block_size: Optional[int],
    log_level: Optional[str] = None,
) -> None:
    """
    Runs one stream processor that feeds several writable storages.

    Each entry of ``storage_names`` is resolved to a ``StorageKey`` member via
    its upper-cased name. All storages must share a single default topic, at
    most one commit log topic, and the same ``bootstrap.servers`` value in
    their consumer configuration. A single Kafka consumer (with commit log
    support when a commit log topic exists) is then run with a
    ``MultistorageConsumerProcessingStrategyFactory`` until SIGINT or SIGTERM
    requests a shutdown.

    Raises:
        ValueError: if the storages use more than one default topic, more
            than one commit log topic, or different ``bootstrap.servers``.
    """

    DEFAULT_BLOCK_SIZE = int(32 * 1e6)

    # Block sizes are only defaulted when ``processes`` is given; otherwise
    # they are passed through unchanged (possibly as None).
    if processes is not None:
        if input_block_size is None:
            input_block_size = DEFAULT_BLOCK_SIZE

        if output_block_size is None:
            output_block_size = DEFAULT_BLOCK_SIZE

    setup_logging(log_level)
    setup_sentry()

    # StorageKey member -> writable storage, for every requested name.
    storages = {
        key: get_writable_storage(key)
        for key in (getattr(StorageKey, name.upper())
                    for name in storage_names)
    }

    # Distinct default topic names across all storages.
    topics = {
        storage.get_table_writer().get_stream_loader().get_default_topic_spec(
        ).topic_name
        for storage in storages.values()
    }

    # XXX: The ``StreamProcessor`` only supports a single topic at this time,
    # but is easily modified. The topic routing in the processing strategy is a
    # bit trickier (but also shouldn't be too bad.)
    # NOTE(review): ``set.pop()`` raises KeyError on an empty set, so an empty
    # ``storage_names`` fails here -- confirm callers always pass at least one.
    topic = Topic(topics.pop())
    # Anything left after popping one name means the storages disagree.
    if topics:
        raise ValueError("only one topic is supported")

    # XXX: The ``CommitLogConsumer`` also only supports a single topic at this
    # time. (It is less easily modified.) This also assumes the commit log
    # topic is on the same Kafka cluster as the input topic.
    commit_log_topics = {
        spec.topic_name
        for spec in (storage.get_table_writer().get_stream_loader(
        ).get_commit_log_topic_spec() for storage in storages.values())
        if spec is not None
    }

    commit_log_topic: Optional[Topic]
    if commit_log_topics:
        commit_log_topic = Topic(commit_log_topics.pop())
    else:
        commit_log_topic = None

    # Same trick as above: a non-empty set after the pop means more than one.
    if commit_log_topics:
        raise ValueError("only one commit log topic is supported")

    # XXX: This requires that all storages are associated with the same Kafka
    # cluster so that they can be consumed by the same consumer instance.
    # Unfortunately, we don't have the concept of independently configurable
    # Kafka clusters in settings, only consumer configurations that are
    # associated with storages and/or global default configurations. To avoid
    # implementing yet another method of configuring Kafka clusters, this just
    # piggybacks on the existing configuration method(s), with the assumption
    # that most deployments are going to be using the default configuration.
    storage_keys = [*storages.keys()]

    # The first storage's topic provides the consumer configuration that is
    # actually used.
    kafka_topic = (storages[storage_keys[0]].get_table_writer().
                   get_stream_loader().get_default_topic_spec().topic)

    consumer_configuration = build_kafka_consumer_configuration(
        kafka_topic,
        consumer_group,
        auto_offset_reset=auto_offset_reset,
        queued_max_messages_kbytes=queued_max_messages_kbytes,
        queued_min_messages=queued_min_messages,
    )

    # Every other storage must resolve to the same brokers as the first one.
    for storage_key in storage_keys[1:]:
        if (build_kafka_consumer_configuration(
                storages[storage_key].get_table_writer().get_stream_loader().
                get_default_topic_spec().topic,
                consumer_group,
        )["bootstrap.servers"] != consumer_configuration["bootstrap.servers"]):
            raise ValueError(
                "storages cannot be located on different Kafka clusters")

    if commit_log_topic is None:
        consumer = KafkaConsumer(consumer_configuration)
    else:
        # XXX: This relies on the assumptions that a.) all storages are
        # located on the same Kafka cluster (validated above.)

        # The producer configuration is taken from the first storage's commit
        # log topic spec.
        commit_log_topic_spec = (storages[storage_keys[0]].get_table_writer(
        ).get_stream_loader().get_commit_log_topic_spec())
        assert commit_log_topic_spec is not None

        producer = ConfluentKafkaProducer(
            build_kafka_producer_configuration(commit_log_topic_spec.topic))
        consumer = KafkaConsumerWithCommitLog(
            consumer_configuration,
            producer=producer,
            commit_log_topic=commit_log_topic,
        )

    metrics = MetricsWrapper(environment.metrics, "consumer")

    configure_metrics(StreamMetricsAdapter(metrics))
    processor = StreamProcessor(
        consumer,
        topic,
        MultistorageConsumerProcessingStrategyFactory(
            [*storages.values()],
            max_batch_size,
            # presumably converts milliseconds to seconds -- TODO confirm the
            # unit the strategy factory expects
            max_batch_time_ms / 1000.0,
            processes=processes,
            input_block_size=input_block_size,
            output_block_size=output_block_size,
            metrics=metrics,
        ),
    )

    def handler(signum: int, frame: Any) -> None:
        # Ask the processor to stop instead of killing the process abruptly.
        processor.signal_shutdown()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    processor.run()
Beispiel #30
0
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
) -> None:
    """
    Evaluates subscribed queries for a dataset.

    Builds a ``TickConsumer`` over a ``SynchronizedConsumer`` that pairs the
    dataset's main topic with its commit log topic, creates one
    ``SubscriptionScheduler`` per partition index, and runs a
    ``BatchingConsumer`` whose ``SubscriptionWorker`` is given a producer and
    ``result_topic``. Blocks until SIGINT or SIGTERM requests a shutdown; the
    consumer and the producer are closed on the way out.

    ``topic``, ``commit_log_topic`` and ``partitions`` default to the values
    declared by the dataset's stream loader, and ``bootstrap_servers``
    defaults to the brokers configured for the dataset. ``result_topic`` has
    no fallback and must not be None.
    """

    assert result_topic is not None

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    # Fall back to the per-dataset brokers, then to the global default.
    if not bootstrap_servers:
        bootstrap_servers = settings.DEFAULT_DATASET_BROKERS.get(
            dataset_name, settings.DEFAULT_BROKERS
        )

    loader = enforce_table_writer(dataset).get_stream_loader()

    consumer = TickConsumer(
        SynchronizedConsumer(
            # Consumer for the main topic, in the caller's consumer group.
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                ),
                PassthroughCodec(),
            ),
            # Consumer for the commit log: it gets a freshly generated group
            # id on every run and always starts from the earliest offset.
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    bootstrap_servers,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                ),
                CommitCodec(),
            ),
            # NOTE(review): get_commit_log_topic_spec() is dereferenced without
            # a None check -- confirm every dataset used here declares a commit
            # log topic, or always pass ``commit_log_topic``.
            (
                Topic(commit_log_topic)
                if commit_log_topic is not None
                else Topic(loader.get_commit_log_topic_spec().topic_name)
            ),
            set(commit_log_groups),
        )
    )

    producer = KafkaProducer(
        {
            "bootstrap.servers": ",".join(bootstrap_servers),
            "partitioner": "consistent",
            "message.max.bytes": 50000000,  # 50MB, default is 1MB
        },
        SubscriptionResultCodec(),
    )

    with closing(consumer), closing(producer):
        batching_consumer = BatchingConsumer(
            consumer,
            (
                Topic(topic)
                if topic is not None
                else Topic(loader.get_default_topic_spec().topic_name)
            ),
            SubscriptionWorker(
                SubscriptionExecutor(
                    dataset,
                    # Thread pool size comes from settings.
                    ThreadPoolExecutor(
                        max_workers=settings.SUBSCRIPTIONS_MAX_CONCURRENT_QUERIES
                    ),
                ),
                # One scheduler per partition index, each with its own Redis
                # backed subscription store.
                {
                    index: SubscriptionScheduler(
                        RedisSubscriptionDataStore(
                            redis_client, dataset, PartitionId(index)
                        ),
                        PartitionId(index),
                        cache_ttl=timedelta(seconds=schedule_ttl),
                    )
                    for index in range(
                        partitions
                        if partitions is not None
                        else loader.get_default_topic_spec().partitions_number
                    )
                },
                producer,
                Topic(result_topic),
            ),
            max_batch_size,
            max_batch_time_ms,
            create_metrics(
                "snuba.subscriptions",
                tags={"group": consumer_group, "dataset": dataset_name},
            ),
        )

        def handler(signum, frame) -> None:
            # Ask the consumer to stop instead of killing the process abruptly.
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()