Example #1
    def drop(*, dataset: Dataset):
        for statement in dataset.get_dataset_schemas().get_drop_statements():
            clickhouse_rw.execute(statement.statement)

        ensure_table_exists(dataset, force=True)
        redis_client.flushdb()
        return ("ok", 200, {"Content-Type": "text/plain"})
Example #2
    def ensure_table_exists(dataset: Dataset, force: bool = False) -> None:
        if not force and _ensured.get(dataset, False):
            return

        assert local_dataset_mode(), "Cannot create table in distributed mode"

        from snuba import migrate

        # We cannot build distributed tables this way. So this only works in local
        # mode.
        for statement in dataset.get_dataset_schemas().get_create_statements():
            clickhouse_rw.execute(statement.statement)

        migrate.run(clickhouse_rw, dataset)

        _ensured[dataset] = True
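`ensure_table_exists` memoizes per dataset through a module-level `_ensured` mapping, and `force=True` bypasses the cache; re-running the DDL is safe because the CREATE statements use the IF NOT EXISTS clause, as noted in Example #4. A sketch of the assumed module-level state and the resulting call behaviour:

    # Assumed module-level cache backing ensure_table_exists(); the real module
    # defines an equivalent mapping near the top of the file. Keys are Dataset
    # instances, values are booleans.
    _ensured = {}

    # First call runs the CREATE statements and migrations; later calls are
    # no-ops until force=True bypasses the cache.
    ensure_table_exists(dataset)              # executes DDL + migrations
    ensure_table_exists(dataset)              # returns immediately
    ensure_table_exists(dataset, force=True)  # re-runs DDL (IF NOT EXISTS) and migrations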
Example #3
def run(events_file, dataset, repeat=1, profile_process=False, profile_write=False):
    """
    Measures the write performance of a dataset
    """

    from snuba.consumer import ConsumerWorker

    for storage in dataset.get_all_storages():
        for statement in storage.get_schemas().get_create_statements():
            clickhouse_rw.execute(statement.statement)

    writable_storage = dataset.get_writable_storage()

    consumer = ConsumerWorker(writable_storage, metrics=DummyMetricsBackend())

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))
    processed = []

    def process():
        with settings_override({"DISCARD_OLD_EVENTS": False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".process.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("process()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        process()
    time_write = time.time()
    if profile_write:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".write.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("write()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        write()
    time_finish = time.time()

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % str(num_events).rjust(10, " "))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))
Example #4
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        attempts = 0
        while True:
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)", attempts)
                client = AdminClient(
                    {
                        "bootstrap.servers": ",".join(bootstrap_server),
                        "socket.timeout.ms": 1000,
                    }
                )
                client.list_topics(timeout=1)
                break
            except Exception as e:
                logger.error(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=e
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        topics = {}
        for name in DATASET_NAMES:
            dataset = get_dataset(name)
            table_writer = dataset.get_table_writer()
            if table_writer:
                stream_loader = table_writer.get_stream_loader()
                for topic_spec in stream_loader.get_all_topic_specs():
                    if topic_spec.topic_name in topics:
                        continue
                    logger.debug(
                        "Adding topic %s to creation list", topic_spec.topic_name
                    )
                    topics[topic_spec.topic_name] = NewTopic(
                        topic_spec.topic_name,
                        num_partitions=topic_spec.partitions_number,
                        replication_factor=topic_spec.replication_factor,
                    )

        logger.debug("Initiating topic creation")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    attempts = 0
    while True:
        try:
            logger.debug("Attempting to connect to Clickhouse (attempt %d)", attempts)
            clickhouse_rw.execute("SELECT 1")
            break
        except Exception as e:
            logger.error(
                "Connection to Clickhouse failed (attempt %d)", attempts, exc_info=e
            )
            attempts += 1
            if attempts == 60:
                raise
            time.sleep(1)

    # We still need a better way to determine whether we are configured to use
    # replicated tables, distributed tables, etc.

    # Create the tables for every dataset.
    existing_tables = {row[0] for row in clickhouse_rw.execute("show tables")}
    for name in DATASET_NAMES:
        dataset = get_dataset(name)

        logger.debug("Creating tables for dataset %s", name)
        run_migrations = False
        for statement in dataset.get_dataset_schemas().get_create_statements():
            if statement.table_name not in existing_tables:
                # This is a hack to deal with updates to materialized views.
                # ClickHouse parses the SELECT statement that defines a
                # materialized view even if the view already exists and the
                # CREATE statement includes the IF NOT EXISTS clause.
                # When we add a column to a matview, running bootstrap would make
                # ClickHouse parse the statement that creates the view and fail,
                # because the new column does not exist yet on the underlying
                # table: the migration on that table has not run yet.
                # Migrations are per dataset, so they can only run after the
                # bootstrap of an entire dataset has run, which would make
                # bootstrap depend on migration and migration depend on bootstrap.
                # To break this dependency we skip the bootstrap DDL calls here
                # when the table/view already exists, so it is always safe to run
                # bootstrap first.
                logger.debug("Executing:\n%s", statement.statement)
                clickhouse_rw.execute(statement.statement)
            else:
                logger.debug("Skipping existing table %s", statement.table_name)
                run_migrations = True
        if run_migrations:
            logger.debug("Running missing migrations for dataset %s", name)
            run(clickhouse_rw, dataset)
        logger.info("Tables for dataset %s created.", name)