def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)
        ),
    )
    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)
        ),
    )
    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline,
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request("", query_body, query, request_settings, "ref"),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY),
        ],
    )
def test_clusters() -> None:
    importlib.reload(cluster)
    assert (
        get_storage(StorageKey("events")).get_cluster()
        == get_storage(StorageKey("errors")).get_cluster()
    )
    assert (
        get_storage(StorageKey("events")).get_cluster()
        != get_storage(StorageKey("transactions")).get_cluster()
    )
def test_cache_partition() -> None:
    assert (
        get_storage(StorageKey("transactions"))
        .get_cluster()
        .get_reader()
        .cache_partition_id
        == "host_2_cache"
    )
    assert (
        get_storage(StorageKey("errors"))
        .get_cluster()
        .get_reader()
        .cache_partition_id
        is None
    )
def test_clusters() -> None:
    assert (
        get_storage(StorageKey("events")).get_cluster()
        == get_storage(StorageKey("errors")).get_cluster()
    )
    assert (
        get_storage(StorageKey("events")).get_cluster()
        != get_storage(StorageKey("transactions")).get_cluster()
    )
def test_storage_selector() -> None: state.set_config("enable_events_readonly_table", True) storage = get_storage(StorageKey.EVENTS) storage_ro = get_storage(StorageKey.EVENTS_RO) query = Query({}, storage.get_schema().get_data_source()) storage_selector = EventsQueryStorageSelector(storage, storage_ro) assert (storage_selector.select_storage( query, HTTPRequestSettings(consistent=False)).storage == storage_ro) assert (storage_selector.select_storage( query, HTTPRequestSettings(consistent=True)).storage == storage)
def test_storage_selector() -> None: state.set_config("enable_events_readonly_table", True) storage = get_storage(StorageKey.ERRORS) storage_ro = get_storage(StorageKey.ERRORS_RO) query = Query(Entity(EntityKey.EVENTS, ColumnSet([])), selected_columns=[]) storage_selector = ErrorsQueryStorageSelector(mappers=errors_translators) assert (storage_selector.select_storage( query, HTTPRequestSettings(consistent=False)).storage == storage_ro) assert (storage_selector.select_storage( query, HTTPRequestSettings(consistent=True)).storage == storage)
def test_tags_hash_map(self) -> None:
    """
    Adds an event and ensures the tags_hash_map is properly populated
    including escaping.
    """
    self.event = get_raw_event()
    self.event["data"]["tags"].append(["test_tag1", "value1"])
    self.event["data"]["tags"].append(["test_tag=2", "value2"])  # Requires escaping
    self.write_unprocessed_events([self.event])

    clickhouse = (
        get_storage(StorageKey.EVENTS)
        .get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
    )

    hashed = clickhouse.execute(
        "SELECT cityHash64('test_tag1=value1'), cityHash64('test_tag\\\\=2=value2')"
    )
    tag1, tag2 = hashed[0]

    event = clickhouse.execute(
        (
            f"SELECT event_id FROM sentry_local WHERE has(_tags_hash_map, {tag1}) "
            f"AND has(_tags_hash_map, {tag2})"
        )
    )
    assert len(event) == 1
    assert event[0][0] == self.event["data"]["id"]
def _get_local_table_name(storage_key: StorageKey) -> str:
    try:
        schema = get_storage(storage_key).get_schema()
        assert isinstance(schema, TableSchema)
        return schema.get_table_name()
    except UndefinedClickhouseCluster:
        return "badcluster"
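# Illustrative sketch, not part of the original module: one way _get_local_table_name
# might be consumed, building a {storage name: local table name} map over the same
# STORAGES sequence used by get_storage_info below. The helper name
# _example_local_table_map is hypothetical, and Dict is assumed to be imported
# from typing.
def _example_local_table_map() -> Dict[str, str]:
    return {
        storage_key.value: _get_local_table_name(storage_key)
        for storage_key in STORAGES
    }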
def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((2, "insert", event)).encode("utf-8"), []
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset.get_writable_storage(),
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )
    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    clickhouse = (
        get_storage(StorageKey.EVENTS)
        .get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
    )

    assert clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
def test_send_message(
    self, value: str, expected: Optional[ProcessedMessage],
) -> None:
    storage = get_storage("groupedmessages")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        storage=storage,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        1,
        KafkaPayload(
            None,
            value.encode("utf-8"),
            [("table", "sentry_groupedmessage".encode())],
        ),
        datetime.now(),
    )

    ret = worker.process_message(message)
    assert ret == expected
def run_migrations() -> Iterator[None]:
    from snuba.migrations.runner import Runner

    Runner().run_all(force=True)

    yield

    for storage_key in STORAGES:
        storage = get_storage(storage_key)
        cluster = storage.get_cluster()
        database = cluster.get_database()

        schema = storage.get_schema()
        if isinstance(schema, WritableTableSchema):
            table_name = schema.get_local_table_name()
            nodes = [*cluster.get_local_nodes(), *cluster.get_distributed_nodes()]
            for node in nodes:
                connection = cluster.get_node_connection(
                    ClickhouseClientSettings.MIGRATE, node
                )
                connection.execute(
                    f"TRUNCATE TABLE IF EXISTS {database}.{table_name}"
                )

    redis_client.flushdb()
def __init__(self) -> None:
    # The raw table we write onto, and that potentially we could
    # query.
    writable_storage = get_writable_storage(StorageKey.OUTCOMES_RAW)

    # The materialized view we query aggregate data from.
    materialized_storage = get_storage(StorageKey.OUTCOMES_HOURLY)
    read_schema = materialized_storage.get_schema()

    super().__init__(
        storages=[writable_storage, materialized_storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                # TODO: Once we are ready to expose the raw data model and select
                # whether to use materialized storage or the raw one here, replace
                # this with a custom storage selector that decides when to use the
                # materialized data.
                storage=materialized_storage,
            ),
        ),
        abstract_column_set=read_schema.get_columns(),
        join_relationships={},
        writable_storage=writable_storage,
        validators=[EntityRequiredColumnValidator({"org_id"})],
        required_time_column="timestamp",
    )
def __init__(
    self,
    writable_storage_key: StorageKey,
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
) -> None:
    writable_storage = get_writable_storage(writable_storage_key)
    readable_storage = get_storage(readable_storage_key)

    super().__init__(
        storages=[writable_storage, readable_storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                readable_storage,
                mappers=TranslationMappers(
                    subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ],
                ).concat(mappers),
            )
        ),
        abstract_column_set=ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
                *value_schema,
            ]
        ),
        join_relationships={},
        writable_storage=writable_storage,
        validators=[EntityRequiredColumnValidator({"org_id", "project_id"})],
        required_time_column="timestamp",
    )
def __init__(self) -> None:
    storage = get_storage(StorageKey.OUTCOMES_RAW)
    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        abstract_column_set=storage.get_schema().get_columns(),
        writable_storage=None,
    )
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)
    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
def test_get_local_nodes() -> None:
    with patch.object(ClickhousePool, "execute") as execute:
        execute.return_value = [
            ("host_1", 9000, 1, 1),
            ("host_2", 9000, 2, 1),
        ]

        local_cluster = get_storage(StorageKey("events")).get_cluster()
        assert len(local_cluster.get_local_nodes()) == 1
        assert local_cluster.get_local_nodes()[0].host_name == "host_1"
        assert local_cluster.get_local_nodes()[0].port == 9000
        assert local_cluster.get_local_nodes()[0].shard is None
        assert local_cluster.get_local_nodes()[0].replica is None

        distributed_cluster = get_storage(StorageKey("transactions")).get_cluster()
        assert len(distributed_cluster.get_local_nodes()) == 2
        assert distributed_cluster.get_local_nodes()[0].host_name == "host_1"
        assert distributed_cluster.get_local_nodes()[1].host_name == "host_2"
def get_storage_info() -> Sequence[Storage]:
    return [
        {
            "storage_name": storage_key.value,
            "local_table_name": _get_local_table_name(storage_key),
            "local_nodes": _get_local_nodes(storage_key),
        }
        for storage_key in sorted(STORAGES, key=lambda storage_key: storage_key.value)
        if get_storage(storage_key).get_storage_set_key() not in DEV_STORAGE_SETS
        or settings.ENABLE_DEV_FEATURES
    ]
def _get_local_nodes(storage_key: StorageKey) -> Sequence[Node]:
    try:
        storage = get_storage(storage_key)
        return [
            {"host": node.host_name, "port": node.port}
            for node in storage.get_cluster().get_local_nodes()
        ]
    except (AssertionError, KeyError, UndefinedClickhouseCluster):
        # If cluster_name is not defined just return an empty list
        return []
def __init__(self) -> None:
    storage = get_storage(StorageKey.OUTCOMES_RAW)

    self.__time_group_columns = {"time": "timestamp"}
    self.__time_parse_columns = ("timestamp",)
    super().__init__(
        storages=[storage],
        query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        abstract_column_set=storage.get_schema().get_columns(),
        writable_storage=None,
    )
def get_ro_node_connection(
    clickhouse_host: str,
    clickhouse_port: int,
    storage_name: str,
    client_settings: ClickhouseClientSettings,
) -> ClickhousePool:
    storage_key = None
    try:
        storage_key = StorageKey(storage_name)
    except ValueError:
        raise InvalidStorageError(
            f"storage {storage_name} is not a valid storage name",
            extra_data={"storage_name": storage_name},
        )

    key = f"{storage_key}-{clickhouse_host}"
    if key in NODE_CONNECTIONS:
        return NODE_CONNECTIONS[key]

    storage = get_storage(storage_key)
    cluster = storage.get_cluster()

    if not is_valid_node(clickhouse_host, clickhouse_port, cluster):
        raise InvalidNodeError(
            f"host {clickhouse_host} and port {clickhouse_port} are not valid",
            extra_data={"host": clickhouse_host, "port": clickhouse_port},
        )

    database = cluster.get_database()

    assert client_settings in {
        ClickhouseClientSettings.QUERY,
        ClickhouseClientSettings.TRACING,
    }, "admin can only use QUERY or TRACING ClickhouseClientSettings"

    if client_settings == ClickhouseClientSettings.QUERY:
        username = settings.CLICKHOUSE_READONLY_USER
        password = settings.CLICKHOUSE_READONLY_PASSWORD
    else:
        username = settings.CLICKHOUSE_TRACE_USER
        password = settings.CLICKHOUSE_TRACE_PASSWORD

    connection = ClickhousePool(
        clickhouse_host,
        clickhouse_port,
        username,
        password,
        database,
        max_pool_size=2,
        client_settings=client_settings.value.settings,
    )
    NODE_CONNECTIONS[key] = connection
    return connection
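# Illustrative usage sketch, not part of the original module: fetching a read-only
# connection for the "errors" storage with the QUERY client settings and running a
# count over errors_local. The host, port, and helper name _example_ro_query are
# placeholders/assumptions; repeated calls with the same storage and host reuse the
# pooled connection cached in NODE_CONNECTIONS.
def _example_ro_query() -> None:
    connection = get_ro_node_connection(
        "localhost", 9000, "errors", ClickhouseClientSettings.QUERY
    )
    print(connection.execute("SELECT count() FROM errors_local"))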
def __init__(self) -> None:
    storage = get_storage(StorageKey.OUTCOMES_RAW)
    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        ),
        abstract_column_set=storage.get_schema().get_columns(),
        join_relationships={},
        writable_storage=None,
    )
def __init__(self) -> None: storage = get_storage("outcomes_raw") read_schema = storage.get_schemas().get_read_schema() self.__time_group_columns = {"time": "timestamp"} super().__init__( storages=[storage], query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage), abstract_column_set=read_schema.get_columns(), writable_storage=None, time_group_columns=self.__time_group_columns, time_parse_columns=("timestamp",), )
def __init__(self) -> None:
    storage = get_storage(StorageKey.OUTCOMES_RAW)
    super().__init__(
        storages=[storage],
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
        ),
        abstract_column_set=storage.get_schema().get_columns(),
        join_relationships={},
        writable_storage=None,
        validators=[EntityRequiredColumnValidator({"org_id"})],
        required_time_column="timestamp",
    )
def setup_method(self):
    from snuba.web.views import application

    assert application.testing is True

    self.app = application.test_client()
    self.app.post = partial(self.app.post, headers={"referer": "test"})

    self.storage = get_storage(StorageKey.EVENTS)
    self.replacer = replacer.ReplacerWorker(
        self.storage, DummyMetricsBackend(strict=True)
    )

    self.project_id = 1
    self.event = get_raw_event()
def test(self):
    dataset = get_dataset("events")
    storage = dataset.get_writable_storage()
    assert storage is not None
    table = storage.get_table_writer().get_schema().get_local_table_name()

    clickhouse = (
        get_storage(StorageKey.EVENTS)
        .get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
    )

    assert clickhouse.execute("SELECT COUNT() FROM %s" % table)[0][0] == 0

    perf.run("tests/perf-event.json", dataset)

    assert clickhouse.execute("SELECT COUNT() FROM %s" % table)[0][0] == 1
def __init__(
    self,
    writable_storage_key: Optional[StorageKey],
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
    abstract_column_set: Optional[ColumnSet] = None,
    validators: Optional[Sequence[QueryValidator]] = None,
) -> None:
    writable_storage = (
        get_writable_storage(writable_storage_key) if writable_storage_key else None
    )
    readable_storage = get_storage(readable_storage_key)
    storages = [readable_storage]
    if writable_storage:
        storages.append(writable_storage)

    if abstract_column_set is None:
        abstract_column_set = ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("bucketed_time", DateTime()),
                Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
                *value_schema,
            ]
        )

    if validators is None:
        validators = [
            EntityRequiredColumnValidator({"org_id", "project_id"}),
            GranularityValidator(minimum=10),
        ]

    super().__init__(
        storages=storages,
        query_pipeline_builder=SimplePipelineBuilder(
            query_plan_builder=SingleStorageQueryPlanBuilder(
                readable_storage,
                mappers=TranslationMappers(
                    subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ],
                ).concat(mappers),
            )
        ),
        abstract_column_set=abstract_column_set,
        join_relationships={},
        writable_storage=writable_storage,
        validators=validators,
        required_time_column="timestamp",
    )
def setup_method(self, test_method):
    super(TestReplacer, self).setup_method(test_method, "events")

    from snuba.web.views import application

    assert application.testing is True

    self.app = application.test_client()
    self.app.post = partial(self.app.post, headers={"referer": "test"})

    storage = get_storage("events")
    self.replacer = replacer.ReplacerWorker(
        self.clickhouse, storage, DummyMetricsBackend(strict=True)
    )

    self.project_id = 1
def __init__(self) -> None:
    storage = get_writable_storage(StorageKey.EVENTS)
    schema = storage.get_table_writer().get_schema()
    columns = schema.get_columns()
    ro_storage = get_storage(StorageKey.EVENTS_RO)

    super().__init__(
        storages=[storage],
        query_plan_builder=SelectedStorageQueryPlanBuilder(
            selector=EventsQueryStorageSelector(
                events_table=storage,
                events_ro_table=ro_storage,
            )
        ),
        abstract_column_set=columns,
        writable_storage=storage,
    )
def test_capture_trace() -> None:
    storage = get_storage(StorageKey.ERRORS)
    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )

    data = clickhouse.execute(
        "SELECT count() FROM errors_local",
        with_column_types=True,
        capture_trace=True,
    )
    assert data.results == []
    assert data.meta == []
    assert data.trace_output != ""
    assert data.profile is not None
    assert data.profile["elapsed"] > 0
    assert data.profile["bytes"] > 0
    assert data.profile["rows"] > 0
    assert data.profile["blocks"] > 0
def __init__(self) -> None:
    writable_storage = get_writable_storage("sessions_raw")
    materialized_storage = get_storage("sessions_hourly")
    read_schema = materialized_storage.get_schemas().get_read_schema()

    self.__time_group_columns = {"bucketed_started": "started"}
    super().__init__(
        storages=[writable_storage, materialized_storage],
        # TODO: Once we are ready to expose the raw data model and select whether
        # to use materialized storage or the raw one here, replace this with a
        # custom storage selector that decides when to use the materialized data.
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=materialized_storage,
        ),
        abstract_column_set=read_schema.get_columns(),
        writable_storage=writable_storage,
        time_group_columns=self.__time_group_columns,
        time_parse_columns=("started", "received"),
    )