Example #1
    def ensure_table_exists(dataset, force=False):
        if not force and _ensured.get(dataset, False):
            return

        assert local_dataset_mode(), "Cannot create table in distributed mode"

        from snuba import migrate

        # We cannot build distributed tables this way. So this only works in local
        # mode.
        for statement in dataset.get_dataset_schemas().get_create_statements():
            clickhouse_rw.execute(statement)

        migrate.run(clickhouse_rw, dataset)

        _ensured[dataset] = True
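
A minimal usage sketch, assuming the helper above lives in a test module next to a module-level _ensured dict and that get_dataset is importable from the snuba dataset factory (the import path is an assumption):

from snuba.datasets.factory import get_dataset  # assumed import path

def setup_module():
    # The first call creates the tables and runs migrations; the second call
    # returns early because _ensured[dataset] is already True.
    dataset = get_dataset("events")
    ensure_table_exists(dataset)
    ensure_table_exists(dataset)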
Example #2
def migrate(log_level):
    from snuba.migrate import logger, run
    # TODO: this only supports one dataset so far. More work is needed for the others.
    dataset = get_dataset('events')
    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    if not local_dataset_mode():
        logger.error("The migration tool can only work on local dataset mode.")
        sys.exit(1)

    clickhouse = Client(
        host=settings.CLICKHOUSE_HOST,
        port=settings.CLICKHOUSE_PORT,
    )

    run(clickhouse, dataset)
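
In the snuba CLI these migrate() bodies are exposed as commands; a rough sketch of such wiring, assuming click (which Example #5 already uses). The command name and option below are illustrative, not the project's exact definitions:

import click

@click.command()
@click.option('--log-level', default='INFO', help='Logging level name, e.g. DEBUG.')
def migrate_command(log_level):
    # Delegates to the migrate() body shown above.
    migrate(log_level)

if __name__ == '__main__':
    migrate_command()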
Example #3
def migrate(log_level):
    from snuba.migrate import logger, run

    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    if settings.CLICKHOUSE_TABLE != 'dev':
        logger.error(
            "The migration tool is only intended for local development environment."
        )
        sys.exit(1)

    host, port = settings.CLICKHOUSE_SERVER.split(':')
    clickhouse = Client(
        host=host,
        port=port,
    )

    run(clickhouse, settings.CLICKHOUSE_TABLE)
Example #4
def migrate(*,
            log_level: Optional[str] = None,
            dataset_name: Optional[str] = None) -> None:
    from snuba.migrate import logger, run

    setup_logging(log_level)

    if not local_dataset_mode():
        logger.error("The migration tool can only work on local dataset mode.")
        sys.exit(1)

    dataset_names = [dataset_name] if dataset_name else DATASET_NAMES
    for name in dataset_names:
        dataset = get_dataset(name)
        logger.info("Migrating dataset %s", name)

        clickhouse = Client(
            host=settings.CLICKHOUSE_HOST,
            port=settings.CLICKHOUSE_PORT,
        )

        run(clickhouse, dataset)
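
Because this version takes keyword-only arguments, it can also be driven directly from Python; a small usage sketch (the dataset name "events" is just an example value):

# Migrate one dataset at DEBUG verbosity; omit dataset_name (or pass None)
# to migrate every entry in DATASET_NAMES.
migrate(log_level="DEBUG", dataset_name="events")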
Example #5
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        attempts = 0
        while True:
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)", attempts)
                client = AdminClient(
                    {
                        "bootstrap.servers": ",".join(bootstrap_server),
                        "socket.timeout.ms": 1000,
                    }
                )
                client.list_topics(timeout=1)
                break
            except Exception as e:
                logger.error(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=e
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        topics = {}
        for name in DATASET_NAMES:
            dataset = get_dataset(name)
            table_writer = dataset.get_table_writer()
            if table_writer:
                stream_loader = table_writer.get_stream_loader()
                for topic_spec in stream_loader.get_all_topic_specs():
                    if topic_spec.topic_name in topics:
                        continue
                    logger.debug(
                        "Adding topic %s to creation list", topic_spec.topic_name
                    )
                    topics[topic_spec.topic_name] = NewTopic(
                        topic_spec.topic_name,
                        num_partitions=topic_spec.partitions_number,
                        replication_factor=topic_spec.replication_factor,
                    )

        logger.debug("Initiating topic creation")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    attempts = 0
    while True:
        try:
            logger.debug("Attempting to connect to Clickhouse (attempt %d)", attempts)
            clickhouse_rw.execute("SELECT 1")
            break
        except Exception as e:
            logger.error(
                "Connection to Clickhouse failed (attempt %d)", attempts, exc_info=e
            )
            attempts += 1
            if attempts == 60:
                raise
            time.sleep(1)

    # Need to better figure out if we are configured to use replicated
    # tables or distributed tables, etc.

    # Create the tables for every dataset.
    existing_tables = {row[0] for row in clickhouse_rw.execute("show tables")}
    for name in DATASET_NAMES:
        dataset = get_dataset(name)

        logger.debug("Creating tables for dataset %s", name)
        run_migrations = False
        for statement in dataset.get_dataset_schemas().get_create_statements():
            if statement.table_name not in existing_tables:
                # This is a hack to deal with updates to Materialized views.
                # It seems that ClickHouse would parse the SELECT statement that defines a
                # materialized view even if the view already exists and the CREATE statement
                # includes the IF NOT EXISTS clause.
                # When we add a column to a matview, though, we will be in a state where, by
                # running bootstrap, ClickHouse will parse the SQL statement to try to create
                # the view and fail because the column does not exist yet on the underlying table,
                # since the migration on the underlying table has not run yet.
                # Migrations are per dataset so they can only run after the bootstrap of an
                # entire dataset has run. So we would have bootstrap depending on migration
                # and migration depending on bootstrap.
                # In order to break this dependency we skip bootstrap DDL calls here if the
                # table/view already exists, so it is always safe to run bootstrap first.
                logger.debug("Executing:\n%s", statement.statement)
                clickhouse_rw.execute(statement.statement)
            else:
                logger.debug("Skipping existing table %s", statement.table_name)
                run_migrations = True
        if run_migrations:
            logger.debug("Running missing migrations for dataset %s", name)
            run(clickhouse_rw, dataset)
        logger.info("Tables for dataset %s created.", name)
Example #6
    def ensure_table_exists():
        from snuba.clickhouse import get_table_definition, get_test_engine

        clickhouse_rw.execute(
            get_table_definition(
                name=settings.CLICKHOUSE_TABLE,
                engine=get_test_engine(),
            )
        )

    ensure_table_exists()

    if settings.CLICKHOUSE_TABLE == 'dev':
        from snuba import migrate
        migrate.run(clickhouse_rw, settings.CLICKHOUSE_TABLE)

    @application.route('/tests/insert', methods=['POST'])
    def write():
        from snuba.processor import process_message
        from snuba.writer import row_from_processed_event, write_rows

        body = json.loads(request.data)

        rows = []
        for event in body:
            _, processed = process_message(event)
            row = row_from_processed_event(processed)
            rows.append(row)

        ensure_table_exists()
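
A hedged sketch of exercising the /tests/insert route through Flask's test client; `application` is the Flask app the route is registered on, and the single-element payload is a placeholder rather than a real event:

import json

client = application.test_client()
client.post(
    '/tests/insert',
    data=json.dumps([{'event_id': '0' * 32}]),  # placeholder event payload
    content_type='application/json',
)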