def drop(*, dataset: Dataset):
    for statement in dataset.get_dataset_schemas().get_drop_statements():
        clickhouse_rw.execute(statement.statement)

    ensure_table_exists(dataset, force=True)
    redis_client.flushdb()
    return ("ok", 200, {"Content-Type": "text/plain"})
def ensure_table_exists(dataset: Dataset, force: bool = False) -> None:
    if not force and _ensured.get(dataset, False):
        return

    assert local_dataset_mode(), "Cannot create table in distributed mode"

    from snuba import migrate

    # We cannot build distributed tables this way. So this only works in local
    # mode.
    for statement in dataset.get_dataset_schemas().get_create_statements():
        clickhouse_rw.execute(statement.statement)

    migrate.run(clickhouse_rw, dataset)

    _ensured[dataset] = True
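# Hypothetical usage sketch: lazily create a dataset's local tables before the
# first write. The "events" dataset name and the factory import path are
# assumptions based on the bootstrap code further below, not something this
# module defines; ensure_table_exists() is a no-op on later calls thanks to
# the _ensured cache.
#
#     from snuba.datasets.factory import get_dataset
#
#     events = get_dataset("events")
#     ensure_table_exists(events)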
def run(events_file, dataset, repeat=1, profile_process=False, profile_write=False):
    """
    Measures the write performance of a dataset
    """
    from snuba.consumer import ConsumerWorker

    for storage in dataset.get_all_storages():
        for statement in storage.get_schemas().get_create_statements():
            clickhouse_rw.execute(statement.statement)

    writable_storage = dataset.get_writable_storage()

    consumer = ConsumerWorker(writable_storage, metrics=DummyMetricsBackend())

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))

    processed = []

    def process():
        with settings_override({"DISCARD_OLD_EVENTS": False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".process.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("process()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        process()

    time_write = time.time()
    if profile_write:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".write.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("write()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        write()

    time_finish = time.time()

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % str(num_events).rjust(10, " "))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))
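# The profiling branches above dump raw .pstats files and log their paths via
# the "Profile Data:" lines. A minimal sketch of inspecting one offline with
# the standard library; the file path below is hypothetical and should be
# replaced with the path printed by the log line.
#
#     import pstats
#
#     stats = pstats.Stats("/tmp/events.json.process.xxxxx.pstats")
#     # Show the 20 functions with the highest cumulative time.
#     stats.strip_dirs().sort_stats("cumulative").print_stats(20)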
def bootstrap(
    *,
    bootstrap_server: Sequence[str],
    kafka: bool,
    force: bool,
    log_level: Optional[str] = None,
) -> None:
    """
    Warning: Not intended to be used in production yet.
    """
    if not force:
        raise click.ClickException("Must use --force to run")

    setup_logging(log_level)

    logger = logging.getLogger("snuba.bootstrap")

    import time

    if kafka:
        logger.debug("Using Kafka with %r", bootstrap_server)
        from confluent_kafka.admin import AdminClient, NewTopic

        attempts = 0
        while True:
            try:
                logger.debug("Attempting to connect to Kafka (attempt %d)", attempts)
                client = AdminClient(
                    {
                        "bootstrap.servers": ",".join(bootstrap_server),
                        "socket.timeout.ms": 1000,
                    }
                )
                client.list_topics(timeout=1)
                break
            except Exception as e:
                logger.error(
                    "Connection to Kafka failed (attempt %d)", attempts, exc_info=e
                )
                attempts += 1
                if attempts == 60:
                    raise
                time.sleep(1)

        topics = {}
        for name in DATASET_NAMES:
            dataset = get_dataset(name)
            table_writer = dataset.get_table_writer()
            if table_writer:
                stream_loader = table_writer.get_stream_loader()
                for topic_spec in stream_loader.get_all_topic_specs():
                    if topic_spec.topic_name in topics:
                        continue
                    logger.debug(
                        "Adding topic %s to creation list", topic_spec.topic_name
                    )
                    topics[topic_spec.topic_name] = NewTopic(
                        topic_spec.topic_name,
                        num_partitions=topic_spec.partitions_number,
                        replication_factor=topic_spec.replication_factor,
                    )

        logger.debug("Initiating topic creation")
        for topic, future in client.create_topics(
            list(topics.values()), operation_timeout=1
        ).items():
            try:
                future.result()
                logger.info("Topic %s created", topic)
            except Exception as e:
                logger.error("Failed to create topic %s", topic, exc_info=e)

    attempts = 0
    while True:
        try:
            logger.debug("Attempting to connect to Clickhouse (attempt %d)", attempts)
            clickhouse_rw.execute("SELECT 1")
            break
        except Exception as e:
            logger.error(
                "Connection to Clickhouse failed (attempt %d)", attempts, exc_info=e
            )
            attempts += 1
            if attempts == 60:
                raise
            time.sleep(1)

    # Need to better figure out if we are configured to use replicated
    # tables or distributed tables, etc.

    # Create the tables for every dataset.
    existing_tables = {row[0] for row in clickhouse_rw.execute("show tables")}
    for name in DATASET_NAMES:
        dataset = get_dataset(name)
        logger.debug("Creating tables for dataset %s", name)
        run_migrations = False
        for statement in dataset.get_dataset_schemas().get_create_statements():
            if statement.table_name not in existing_tables:
                # This is a hack to deal with updates to materialized views.
                # It seems that ClickHouse parses the SELECT statement that defines a
                # materialized view even if the view already exists and the CREATE
                # statement includes the IF NOT EXISTS clause.
                # When we add a column to a matview, though, we will be in a state where,
                # by running bootstrap, ClickHouse will parse the SQL statement to try to
                # create the view and fail because the column does not exist yet on the
                # underlying table, since the migration on the underlying table has not
                # run yet.
                # Migrations are per dataset, so they can only run after the bootstrap of
                # an entire dataset has run. So we would have bootstrap depending on
                # migration and migration depending on bootstrap.
                # In order to break this dependency we skip bootstrap DDL calls here if the
                # table/view already exists, so it is always safe to run bootstrap first.
                logger.debug("Executing:\n%s", statement.statement)
                clickhouse_rw.execute(statement.statement)
            else:
                logger.debug("Skipping existing table %s", statement.table_name)
                run_migrations = True
        if run_migrations:
            logger.debug("Running missing migrations for dataset %s", name)
            run(clickhouse_rw, dataset)
        logger.info("Tables for dataset %s created.", name)