def eventstream(dataset_name):
    dataset = get_dataset(dataset_name)
    ensure_table_exists(dataset)
    record = json.loads(http_request.data)

    version = record[0]
    if version != 2:
        raise RuntimeError("Unsupported protocol version: %s" % record)

    message = KafkaMessage(
        TopicPartition('topic', 0),
        0,
        http_request.data,
    )

    type_ = record[1]
    metrics = DummyMetricsBackend()
    if type_ == 'insert':
        from snuba.consumer import ConsumerWorker
        worker = ConsumerWorker(
            dataset, producer=None, replacements_topic=None, metrics=metrics
        )
    else:
        from snuba.replacer import ReplacerWorker
        worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

    processed = worker.process_message(message)
    if processed is not None:
        batch = [processed]
        worker.flush_batch(batch)

    return ('ok', 200, {'Content-Type': 'text/plain'})
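# Hedged sketch of the wire format eventstream() accepts, inferred from the
# parsing above: a JSON array whose first element is the protocol version
# (must be 2) and whose second is the message type. The payload shapes below
# are assumptions for illustration, not taken from the source.
example_insert = json.dumps([2, "insert", {"event_id": "abc123"}])
example_replacement = json.dumps([2, "start_delete_groups", {"project_id": 1}])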
def bulk_load(dataset, dest_table, source, log_level):
    import sentry_sdk
    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(
        level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s'
    )

    logger = logging.getLogger('snuba.load-snapshot')
    logger.info("Start bulk load process for dataset %s, from source %s", dataset, source)
    dataset = get_dataset(dataset)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
class TestHTTPBatchWriter:
    dataset = get_dataset("events")
    metrics = DummyMetricsBackend(strict=True)

    def test_empty_batch(self) -> None:
        enforce_table_writer(self.dataset).get_batch_writer(metrics=self.metrics).write([])

    def test_error_handling(self) -> None:
        table_writer = enforce_table_writer(self.dataset)

        with pytest.raises(ClickhouseWriterError) as error:
            table_writer.get_batch_writer(
                table_name="invalid", metrics=self.metrics
            ).write([rapidjson.dumps({"x": "y"}).encode("utf-8")])

        assert error.value.code == 60

        with pytest.raises(ClickhouseWriterError) as error:
            table_writer.get_batch_writer(metrics=self.metrics).write(
                [b"{}", rapidjson.dumps({"timestamp": "invalid"}).encode("utf-8")]
            )

        assert error.value.code == 41
        assert error.value.row == 2
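# Note on the asserted error codes (descriptive comment, not from the source):
# ClickHouse error 60 is UNKNOWN_TABLE (the "invalid" table does not exist) and
# 41 is CANNOT_PARSE_DATETIME ("invalid" is not a timestamp). `row == 2` points
# at the second payload in the batch, since b"{}" on its own is accepted.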
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
def setup_method(self, test_method, dataset_name: Optional[str] = None):
    self.dataset_name = dataset_name
    if dataset_name is not None:
        self.dataset = get_dataset(dataset_name)
    else:
        self.dataset = None
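# Hypothetical usage sketch (class names assumed, not from the source): a test
# class can pin a dataset for all of its tests by forwarding a dataset name to
# the setup_method above.
class TestEventsDataset(BaseApiTest):
    def setup_method(self, test_method):
        super().setup_method(test_method, "events")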
def test_find_projects(
    query_body: MutableMapping[str, Any], expected_projects: Set[int]
) -> None:
    events = get_dataset("events")
    query = identity_translate(parse_query(query_body, events))
    project_ids_ast = get_project_ids_in_query_ast(query, "project_id")
    assert project_ids_ast == expected_projects
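# Hypothetical parametrization sketch for test_find_projects (query body
# assumed for illustration): project ids stated in a top-level IN condition
# should be collected into the expected set.
example_case = (
    {"selected_columns": ["event_id"], "conditions": [["project_id", "IN", [2, 3]]]},
    {2, 3},
)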
def test_skip_execution_for_entity() -> None:
    state.set_config("subscription_mode_metrics_sets", "new")
    state.set_config("subscription_mode_metrics_counters", "new")

    # Skips execution if the entity name is not on the list
    dataset = get_dataset("metrics")
    entity_names = ["metrics_sets"]
    metrics = TestingMetricsBackend()
    next_step = mock.Mock()
    commit = mock.Mock()
    total_concurrent_queries = 4

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        4,
        total_concurrent_queries,
        None,
        metrics,
        next_step,
        commit,
    )

    metrics_sets_message = next(generate_message(EntityKey.METRICS_SETS))
    strategy.submit(metrics_sets_message)
    metrics_counters_message = next(generate_message(EntityKey.METRICS_COUNTERS))
    strategy.submit(metrics_counters_message)

    assert (
        Increment("skipped_execution", 1, {"entity": "metrics_sets"})
        not in metrics.calls
    )
    assert (
        Increment("skipped_execution", 1, {"entity": "metrics_counters"})
        in metrics.calls
    )
def test_format_expressions(query_body: str, expected_query: LogicalQuery) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    query = parse_snql_query(query_body, events)
    eq, reason = query.equals(expected_query)
    assert eq, reason
def test_entity_column_validation(
    query_body: str, expected_query: LogicalQuery, set_configs: Any
) -> None:
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    old_get_join = events_entity.get_join_relationship

    try:
        setattr(events_entity, "get_join_relationship", events_mock)
        query, _ = parse_snql_query(query_body, events)
        eq, reason = query.equals(expected_query)
        assert eq, reason
    finally:
        setattr(events_entity, "get_join_relationship", old_get_join)
def test_too_many_concurrent_queries() -> None:
    state.set_config("subscription_mode_events", "new")
    state.set_config("executor_queue_size_factor", 1)
    dataset = get_dataset("events")
    entity_names = ["events"]
    metrics = TestingMetricsBackend()
    next_step = mock.Mock()
    commit = mock.Mock()
    total_concurrent_queries = 4

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        4,
        total_concurrent_queries,
        None,
        metrics,
        next_step,
        commit,
    )

    make_message = generate_message(EntityKey.EVENTS)
    for _ in range(4):
        strategy.submit(next(make_message))

    with pytest.raises(MessageRejected):
        strategy.submit(next(make_message))

    strategy.close()
    strategy.join()
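# Why the fifth submit is rejected (reasoning inferred from the config above,
# not stated in the source): with executor_queue_size_factor = 1, the
# executor's internal queue presumably holds total_concurrent_queries * factor
# = 4 * 1 messages, so four in-flight messages fill it and the next submit
# raises MessageRejected to apply backpressure to the consumer.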
def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    from snuba.perf import run, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    if not all(
        storage.get_cluster().is_single_node()
        for storage in dataset.get_all_storages()
    ):
        logger.error("The perf tool is only intended for single-node environments.")
        sys.exit(1)

    run(
        events_file,
        dataset,
        repeat=repeat,
        profile_process=profile_process,
        profile_write=profile_write,
    )
def test_select_storage(
    query_body: MutableMapping[str, Any], is_subscription: bool, expected_table: str
) -> None:
    sessions = get_dataset("sessions")
    snql_query = json_to_snql(query_body, "sessions")
    query, snql_anonymized = parse_snql_query(str(snql_query), sessions)
    query_body = json.loads(snql_query.snuba())

    subscription_settings = (
        SubscriptionQuerySettings if is_subscription else HTTPQuerySettings
    )

    request = Request(
        id="a",
        original_body=query_body,
        query=query,
        snql_anonymized=snql_anonymized,
        query_settings=subscription_settings(referrer=""),
        attribution_info=AttributionInfo(
            get_app_id("default"), "blah", None, None, None
        ),
    )

    def query_runner(
        query: Query, settings: QuerySettings, reader: Reader
    ) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner
    ).execute()
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = """
        MATCH (events)
        SELECT event_id
        WHERE timestamp >= toDateTime('2019-09-18T10:00:00')
        AND timestamp >= toDateTime('2000-09-18T10:00:00')
        AND timestamp < toDateTime('2019-09-19T12:00:00')
        AND (timestamp < toDateTime('2019-09-18T12:00:00') OR project_id IN tuple(1))
        AND project_id IN tuple(1)
        """

    events = get_dataset("events")
    query, _ = parse_snql_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPQuerySettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )
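# Worked sketch of the bound selection the test asserts (illustration only,
# not Snuba's implementation): top-level ANDed conditions combine by taking
# the tightest bound on each side, and the "<" nested under an OR is ignored
# because it does not constrain every row the query can match.
lower_bounds = [datetime(2019, 9, 18, 10, 0), datetime(2000, 9, 18, 10, 0)]
upper_bounds = [datetime(2019, 9, 19, 12, 0)]  # OR'd 2019-09-18T12:00:00 excluded
assert max(lower_bounds).isoformat() == "2019-09-18T10:00:00"
assert min(upper_bounds).isoformat() == "2019-09-19T12:00:00"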
def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source(),
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(
        query,
        RequestSettings(False, False, False),
        {},
    )

    do_query(events, request, None)
def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for dataset %s, from source %s", dataset_name, source
    )
    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
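# Hypothetical invocation sketch (destination table and snapshot path assumed,
# not from the source): load a Postgres snapshot of the groupedmessage dataset
# into a local ClickHouse table.
bulk_load(
    dataset_name="groupedmessage",
    dest_table="groupedmessage_local",
    source="/var/lib/snuba/snapshot",
    log_level="info",
)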
def test_send_message(
    self,
    message: str,
    expected: Optional[ProcessedMessage],
) -> None:
    dataset = get_dataset("groupedmessage")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        dataset=dataset,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    ret = worker.process_message(
        KafkaMessage(
            TopicPartition('topic', 0),
            1,
            message.encode('utf-8'),
        )
    )

    assert ret == expected
def test_failures(
    query_body: MutableMapping[str, Any],
    expected_exception: Type[InvalidQueryException],
) -> None:
    with pytest.raises(expected_exception):
        events = get_dataset("events")
        parse_query(query_body, events)
def get_dataset_source(dataset_name):
    return (
        get_dataset(dataset_name)
        .get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )
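# Minimal usage sketch: resolve the read-side data source for a registered
# dataset, e.g. to inspect the table a dataset reads from in tests.
source = get_dataset_source("events")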
def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)
        ),
    )
    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)
        ),
    )

    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline,
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request(
                "",
                query_body,
                query,
                request_settings,
                "ref",
            ),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY),
        ],
    )
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity().get_all_storages()[0].get_schema().get_data_source(),
    )

    def do_query(
        clickhouse_query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # The original asserted `query == query`, a tautology caused by the
        # parameter shadowing the outer variable; assert against the outer
        # query so the check verifies the query was passed through unsplit.
        assert clickhouse_query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80, set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    clickhouse = ClickhousePool(
        clickhouse_host, clickhouse_port, send_receive_timeout=timeout
    )
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
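# Hypothetical invocation sketch (host, timeout, and database assumed; 9000 is
# ClickHouse's default native-protocol port): optimize partitions older than
# today for the events dataset's local table.
optimize(
    clickhouse_host="localhost",
    clickhouse_port=9000,
    database="default",
    dataset_name="events",
    timeout=10000,
    log_level="info",
)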
def test_failures(query_body: str, exception: Exception, message: str) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> Optional[JoinRelationship]:
        if relationship not in mapping:
            return None
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    with pytest.raises(exception, match=re.escape(message)):
        parse_snql_query(query_body, events)
def sdk_distribution(*, timer: Timer):
    dataset = get_dataset('events')
    request = validate_request_content(
        parse_request_body(http_request),
        RequestSchema(
            schemas.SDK_STATS_BASE_SCHEMA,
            SETTINGS_SCHEMA,
            schemas.SDK_STATS_EXTENSIONS_SCHEMA,
        ),
        timer,
        dataset,
    )

    request.query.set_aggregations([
        ['uniq', 'project_id', 'projects'],
        ['count()', None, 'count'],
    ])
    request.query.add_groupby(['sdk_name', 'rtime'])
    request.extensions['project'] = {
        'project': [],
    }

    ensure_table_exists(dataset)

    query_result = parse_and_run_query(dataset, request, timer)

    return (
        json.dumps(
            query_result.result,
            for_json=True,
            default=lambda obj: obj.isoformat() if isinstance(obj, datetime) else obj,
        ),
        query_result.status,
        {'Content-Type': 'application/json'},
    )
def test_failures(
    query_body: str,
    expected_exception,
) -> None:
    with pytest.raises(expected_exception):
        events = get_dataset("events")
        parse_snql_query(query_body, events)
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query), "timestamp")
    assert (
        from_date_ast is not None
        and isinstance(from_date_ast, datetime)
        and from_date_ast.isoformat() == "2019-09-18T10:00:00"
    )
    assert (
        to_date_ast is not None
        and isinstance(to_date_ast, datetime)
        and to_date_ast.isoformat() == "2019-09-19T12:00:00"
    )
def test_send_message(
    self,
    value: str,
    expected: Optional[ProcessedMessage],
) -> None:
    dataset = get_dataset("groupedmessage")
    snapshot_id = uuid1()
    transact_data = TransactionData(xmin=100, xmax=200, xip_list=[120, 130])

    worker = SnapshotAwareWorker(
        dataset=dataset,
        producer=FakeConfluentKafkaProducer(),
        snapshot_id=str(snapshot_id),
        transaction_data=transact_data,
        replacements_topic=None,
        metrics=DummyMetricsBackend(strict=True),
    )

    message: Message[KafkaPayload] = Message(
        Partition(Topic("topic"), 0),
        1,
        KafkaPayload(None, value.encode("utf-8")),
        datetime.now(),
    )

    ret = worker.process_message(message)
    assert ret == expected
def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()

    assert (
        DictClickhouseQuery(dataset, query, request_settings).format_sql()
        == expected_query
    )
def generate_event(self):
    self.dataset = get_dataset("events")
    event = get_event()
    event["project_id"] = self.project_id
    event = (
        enforce_table_writer(self.dataset)
        .get_stream_loader()
        .get_processor()
        .process_insert(event)
    )
    self.write_processed_records([event])
def drop(dataset_name):
    dataset = get_dataset(dataset_name)
    for statement in dataset.get_dataset_schemas().get_drop_statements():
        clickhouse_rw.execute(statement)

    ensure_table_exists(dataset, force=True)
    redis_client.flushdb()
    return ('ok', 200, {'Content-Type': 'text/plain'})
def test_data_source(
    query_body: MutableMapping[str, Any],
    expected_entity: EntityKey,
) -> None:
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)

    assert query.get_from_clause().key == expected_entity
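# Hypothetical parametrization sketch for test_data_source (query body and
# entity key assumed for illustration, not from the source): a Discover query
# filtered to transaction events would be expected to resolve to a
# transactions-backed entity.
example_case = (
    {"selected_columns": ["event_id"], "conditions": [["type", "=", "transaction"]]},
    EntityKey.DISCOVER_TRANSACTIONS,
)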