Example #1
    def eventstream(dataset_name):
        dataset = get_dataset(dataset_name)
        ensure_table_exists(dataset)
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message = KafkaMessage(
            TopicPartition('topic', 0),
            0,
            http_request.data,
        )

        type_ = record[1]
        metrics = DummyMetricsBackend()
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(dataset,
                                    producer=None,
                                    replacements_topic=None,
                                    metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #2
def bulk_load(dataset, dest_table, source, log_level):
    import sentry_sdk

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    logger = logging.getLogger('snuba.load-snapshot')
    logger.info("Start bulk load process for dataset %s, from source %s",
                dataset, source)
    dataset = get_dataset(dataset)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(
        snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
Example #3
class TestHTTPBatchWriter:
    dataset = get_dataset("events")
    metrics = DummyMetricsBackend(strict=True)

    def test_empty_batch(self) -> None:
        enforce_table_writer(
            self.dataset).get_batch_writer(metrics=self.metrics).write([])

    def test_error_handling(self) -> None:
        table_writer = enforce_table_writer(self.dataset)

        with pytest.raises(ClickhouseWriterError) as error:
            table_writer.get_batch_writer(table_name="invalid",
                                          metrics=self.metrics).write([
                                              rapidjson.dumps({
                                                  "x": "y"
                                              }).encode("utf-8")
                                          ])

        assert error.value.code == 60

        with pytest.raises(ClickhouseWriterError) as error:
            table_writer.get_batch_writer(metrics=self.metrics).write([
                b"{}",
                rapidjson.dumps({
                    "timestamp": "invalid"
                }).encode("utf-8")
            ])

        assert error.value.code == 41
        assert error.value.row == 2
Example #4
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
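The pytest.mark.parametrize decorator that supplies id_column, project_column, timestamp_column, query and expected_result is not part of this snippet. A minimal sketch of how one case might be wired up, assuming the legacy query-body format used elsewhere in these examples; the column names, the body and the expected flag are illustrative, not taken from the real test suite:

import pytest

@pytest.mark.parametrize(
    "id_column, project_column, timestamp_column, query, expected_result",
    [
        (
            "event_id",      # id_column
            "project_id",    # project_column
            "timestamp",     # timestamp_column
            # Hypothetical legacy query body with nothing for the splitter to act on.
            {"selected_columns": ["event_id"], "conditions": []},
            # Illustrative expectation: the column splitter is not expected to run here.
            False,
        ),
    ],
)
def test_col_split_conditions(
    id_column, project_column, timestamp_column, query, expected_result
) -> None:
    ...  # body as in the example above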
Example #5
    def setup_method(self, test_method, dataset_name: Optional[str] = None):
        self.dataset_name = dataset_name

        if dataset_name is not None:
            self.dataset = get_dataset(dataset_name)
        else:
            self.dataset = None
Example #6
def test_find_projects(
    query_body: MutableMapping[str, Any], expected_projects: Set[int]
) -> None:
    events = get_dataset("events")
    query = identity_translate(parse_query(query_body, events))
    project_ids_ast = get_project_ids_in_query_ast(query, "project_id")
    assert project_ids_ast == expected_projects
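As in the previous example, the parametrization feeding query_body and expected_projects is omitted. A hedged sketch of one case, assuming the legacy query-body format seen in the other examples; the concrete project ids are illustrative:

import pytest

@pytest.mark.parametrize(
    "query_body, expected_projects",
    [
        (
            {
                "selected_columns": ["event_id"],
                "conditions": [["project_id", "IN", [1, 2]]],
            },
            {1, 2},  # project ids expected to be found in the query AST
        ),
    ],
)
def test_find_projects(query_body, expected_projects) -> None:
    ...  # body as in the example above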
Example #7
def test_skip_execution_for_entity() -> None:
    state.set_config("subscription_mode_metrics_sets", "new")
    state.set_config("subscription_mode_metrics_counter", "new")

    # Skips execution if the entity name is not on the list
    dataset = get_dataset("metrics")
    entity_names = ["metrics_sets"]
    metrics = TestingMetricsBackend()
    next_step = mock.Mock()
    commit = mock.Mock()

    total_concurrent_queries = 4

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        4,
        total_concurrent_queries,
        None,
        metrics,
        next_step,
        commit,
    )

    metrics_sets_message = next(generate_message(EntityKey.METRICS_SETS))
    strategy.submit(metrics_sets_message)
    metrics_counters_message = next(
        generate_message(EntityKey.METRICS_COUNTERS))
    strategy.submit(metrics_counters_message)

    assert (Increment("skipped_execution", 1, {"entity": "metrics_sets"})
            not in metrics.calls)
    assert (Increment("skipped_execution", 1, {"entity": "metrics_counters"})
            in metrics.calls)
Example #8
def test_format_expressions(query_body: str, expected_query: LogicalQuery) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    query = parse_snql_query(query_body, events)

    eq, reason = query.equals(expected_query)
    assert eq, reason
Example #9
def test_entity_column_validation(query_body: str,
                                  expected_query: LogicalQuery,
                                  set_configs: Any) -> None:
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
    }

    def events_mock(relationship: str) -> JoinRelationship:
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    old_get_join = events_entity.get_join_relationship

    try:
        setattr(events_entity, "get_join_relationship", events_mock)
        query, _ = parse_snql_query(query_body, events)
        eq, reason = query.equals(expected_query)
        assert eq, reason
    finally:
        setattr(events_entity, "get_join_relationship", old_get_join)
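The try/finally block above restores the patched get_join_relationship by hand. pytest's built-in monkeypatch fixture undoes attribute patches automatically at teardown, which would achieve the same effect; a minimal sketch, reusing events_mock, the imports, and the fixtures from the example above:

def test_entity_column_validation(query_body, expected_query, set_configs, monkeypatch) -> None:
    events = get_dataset("events")
    events_entity = get_entity(EntityKey.EVENTS)
    # events_mock is the helper defined in the example above.
    # monkeypatch reverts this attribute swap when the test finishes.
    monkeypatch.setattr(events_entity, "get_join_relationship", events_mock)
    query, _ = parse_snql_query(query_body, events)
    eq, reason = query.equals(expected_query)
    assert eq, reason

Whether to restore by hand or via the fixture is a style choice: the explicit finally block makes the restore visible, while the fixture keeps it from being forgotten.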
Example #10
def test_too_many_concurrent_queries() -> None:
    state.set_config("subscription_mode_events", "new")
    state.set_config("executor_queue_size_factor", 1)
    dataset = get_dataset("events")
    entity_names = ["events"]
    metrics = TestingMetricsBackend()
    next_step = mock.Mock()
    commit = mock.Mock()

    total_concurrent_queries = 4

    strategy = ExecuteQuery(
        dataset,
        entity_names,
        4,
        total_concurrent_queries,
        None,
        metrics,
        next_step,
        commit,
    )

    make_message = generate_message(EntityKey.EVENTS)

    for _ in range(4):
        strategy.submit(next(make_message))

    with pytest.raises(MessageRejected):
        strategy.submit(next(make_message))

    strategy.close()
    strategy.join()
Example #11
def perf(
    *,
    events_file: Optional[str],
    repeat: int,
    profile_process: bool,
    profile_write: bool,
    dataset_name: str,
    log_level: Optional[str] = None,
) -> None:
    from snuba.perf import run, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)

    if not all(storage.get_cluster().is_single_node()
               for storage in dataset.get_all_storages()):
        logger.error(
            "The perf tool is only intended for single node environment.")
        sys.exit(1)

    run(
        events_file,
        dataset,
        repeat=repeat,
        profile_process=profile_process,
        profile_write=profile_write,
    )
Example #12
def test_select_storage(query_body: MutableMapping[str, Any],
                        is_subscription: bool, expected_table: str) -> None:
    sessions = get_dataset("sessions")
    snql_query = json_to_snql(query_body, "sessions")
    query, snql_anonymized = parse_snql_query(str(snql_query), sessions)
    query_body = json.loads(snql_query.snuba())
    subscription_settings = (SubscriptionQuerySettings
                             if is_subscription else HTTPQuerySettings)

    request = Request(
        id="a",
        original_body=query_body,
        query=query,
        snql_anonymized=snql_anonymized,
        query_settings=subscription_settings(referrer=""),
        attribution_info=AttributionInfo(get_app_id("default"), "blah", None,
                                         None, None),
    )

    def query_runner(query: Query, settings: QuerySettings,
                     reader: Reader) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    sessions.get_default_entity().get_query_pipeline_builder(
    ).build_execution_pipeline(request, query_runner).execute()
Example #13
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = """
        MATCH (events)
        SELECT event_id
        WHERE timestamp >= toDateTime('2019-09-18T10:00:00')
            AND timestamp >= toDateTime('2000-09-18T10:00:00')
            AND timestamp < toDateTime('2019-09-19T12:00:00')
            AND (timestamp < toDateTime('2019-09-18T12:00:00') OR project_id IN tuple(1))
            AND project_id IN tuple(1)
        """

    events = get_dataset("events")
    query, _ = parse_snql_query(body, events)
    processors = events.get_default_entity().get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPQuerySettings())

    from_date_ast, to_date_ast = get_time_range(identity_translate(query),
                                                "timestamp")
    assert (from_date_ast is not None and isinstance(from_date_ast, datetime)
            and from_date_ast.isoformat() == "2019-09-18T10:00:00")
    assert (to_date_ast is not None and isinstance(to_date_ast, datetime)
            and to_date_ast.isoformat() == "2019-09-19T12:00:00")
Example #14
def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source()
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(
        query,
        RequestSettings(False, False, False),
        {},
    )

    do_query(events, request, None)
Example #15
def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for dataset %s, from source %s",
                dataset_name, source)
    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(
        snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
Example #16
    def test_send_message(
        self,
        message: str,
        expected: Optional[ProcessedMessage],
    ) -> None:
        dataset = get_dataset("groupedmessage")
        snapshot_id = uuid1()
        transact_data = TransactionData(xmin=100,
                                        xmax=200,
                                        xip_list=[120, 130])

        worker = SnapshotAwareWorker(
            dataset=dataset,
            producer=FakeConfluentKafkaProducer(),
            snapshot_id=str(snapshot_id),
            transaction_data=transact_data,
            replacements_topic=None,
            metrics=DummyMetricsBackend(strict=True),
        )

        ret = worker.process_message(
            KafkaMessage(
                TopicPartition('topic', 0),
                1,
                message.encode('utf-8'),
            ))
        assert ret == expected
Example #17
def test_failures(
    query_body: MutableMapping[str, Any],
    expected_exception: Type[InvalidQueryException],
) -> None:
    with pytest.raises(expected_exception):
        events = get_dataset("events")
        parse_query(query_body, events)
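The cases that feed query_body and expected_exception are not shown here either. A sketch of the decorator shape only; the body below is a placeholder, and whether any particular malformed body raises InvalidQueryException (or which subclass) depends on the snuba version:

import pytest

@pytest.mark.parametrize(
    "query_body, expected_exception",
    [
        # Placeholder: substitute a query body known to be rejected by parse_query.
        ({"selected_columns": "not-a-list"}, InvalidQueryException),
    ],
)
def test_failures(query_body, expected_exception) -> None:
    ...  # body as in the example above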
Example #18
def get_dataset_source(dataset_name):
    return (
        get_dataset(dataset_name)
        .get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )
Example #19
def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)), )

    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)), )

    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request(
                "",
                query_body,
                query,
                request_settings,
                "ref",
            ),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY)
        ],
    )
Example #20
def test_no_split(dataset_name: str, id_column: str, project_column: str,
                  timestamp_column: str) -> None:
    events = get_dataset(dataset_name)
    query = ClickhouseQuery(
        events.get_default_entity().get_all_storages()
        [0].get_schema().get_data_source(), )

    def do_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        assert query == query
        return QueryResult({}, {})

    strategy = SimpleQueryPlanExecutionStrategy(
        ClickhouseCluster("localhost", 1024, "default", "", "default", 80,
                          set(), True),
        [],
        [
            ColumnSplitQueryStrategy(
                id_column=id_column,
                project_column=project_column,
                timestamp_column=timestamp_column,
            ),
            TimeSplitQueryStrategy(timestamp_col=timestamp_column),
        ],
    )

    strategy.execute(query, HTTPRequestSettings(), do_query)
Example #21
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0,
                                      minute=0,
                                      second=0,
                                      microsecond=0)
    clickhouse = ClickhousePool(clickhouse_host,
                                clickhouse_port,
                                send_receive_timeout=timeout)
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" %
                (num_dropped, clickhouse_host))
Example #22
def test_failures(query_body: str, exception: Exception, message: str) -> None:
    state.set_config("query_parsing_expand_aliases", 1)
    events = get_dataset("events")

    # TODO: Potentially remove this once entities have actual join relationships
    mapping = {
        "contains": (EntityKey.TRANSACTIONS, "event_id"),
        "assigned": (EntityKey.GROUPASSIGNEE, "group_id"),
        "bookmark": (EntityKey.GROUPEDMESSAGES, "first_release_id"),
        "activity": (EntityKey.SESSIONS, "org_id"),
    }

    def events_mock(relationship: str) -> Optional[JoinRelationship]:
        if relationship not in mapping:
            return None
        entity_key, rhs_column = mapping[relationship]
        return JoinRelationship(
            rhs_entity=entity_key,
            join_type=JoinType.INNER,
            columns=[("event_id", rhs_column)],
            equivalences=[],
        )

    events_entity = get_entity(EntityKey.EVENTS)
    setattr(events_entity, "get_join_relationship", events_mock)

    with pytest.raises(exception, match=re.escape(message)):
        parse_snql_query(query_body, events)
Example #23
def sdk_distribution(*, timer: Timer):
    dataset = get_dataset('events')
    request = validate_request_content(
        parse_request_body(http_request),
        RequestSchema(
            schemas.SDK_STATS_BASE_SCHEMA,
            SETTINGS_SCHEMA,
            schemas.SDK_STATS_EXTENSIONS_SCHEMA,
        ),
        timer,
        dataset,
    )

    request.query.set_aggregations([
        ['uniq', 'project_id', 'projects'],
        ['count()', None, 'count'],
    ])
    request.query.add_groupby(['sdk_name', 'rtime'])
    request.extensions['project'] = {
        'project': [],
    }

    ensure_table_exists(dataset)

    query_result = parse_and_run_query(dataset, request, timer)
    return (json.dumps(query_result.result,
                       for_json=True,
                       default=lambda obj: obj.isoformat()
                       if isinstance(obj, datetime) else obj),
            query_result.status, {
                'Content-Type': 'application/json'
            })
Example #24
def test_failures(
    query_body: str,
    expected_exception,
) -> None:
    with pytest.raises(expected_exception):
        events = get_dataset("events")
        parse_snql_query(query_body, events)
Example #25
def test_get_time_range() -> None:
    """
    Test finding the time range of a query.
    """
    body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"),
             ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    events = get_dataset("events")
    query = parse_query(body, events)
    processors = events.get_query_processors()
    for processor in processors:
        if isinstance(processor, TimeSeriesProcessor):
            processor.process_query(query, HTTPRequestSettings())

    from_date_ast, to_date_ast = get_time_range(ClickhouseQuery(query),
                                                "timestamp")
    assert (from_date_ast is not None and isinstance(from_date_ast, datetime)
            and from_date_ast.isoformat() == "2019-09-18T10:00:00")
    assert (to_date_ast is not None and isinstance(to_date_ast, datetime)
            and to_date_ast.isoformat() == "2019-09-19T12:00:00")
Example #26
    def test_send_message(
        self,
        value: str,
        expected: Optional[ProcessedMessage],
    ) -> None:
        dataset = get_dataset("groupedmessage")
        snapshot_id = uuid1()
        transact_data = TransactionData(xmin=100,
                                        xmax=200,
                                        xip_list=[120, 130])

        worker = SnapshotAwareWorker(
            dataset=dataset,
            producer=FakeConfluentKafkaProducer(),
            snapshot_id=str(snapshot_id),
            transaction_data=transact_data,
            replacements_topic=None,
            metrics=DummyMetricsBackend(strict=True),
        )

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            1,
            KafkaPayload(None, value.encode("utf-8")),
            datetime.now(),
        )

        ret = worker.process_message(message)
        assert ret == expected
Example #27
def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()

    assert (DictClickhouseQuery(
        dataset, query, request_settings).format_sql() == expected_query)
Example #28
    def generate_event(self):
        self.dataset = get_dataset("events")
        event = get_event()
        event["project_id"] = self.project_id
        event = (
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_processor()
            .process_insert(event)
        )
        self.write_processed_records([event])
Example #29
    def drop(dataset_name):
        dataset = get_dataset(dataset_name)
        for statement in dataset.get_dataset_schemas().get_drop_statements():
            clickhouse_rw.execute(statement)

        ensure_table_exists(dataset, force=True)
        redis_client.flushdb()
        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #30
def test_data_source(
    query_body: MutableMapping[str, Any],
    expected_entity: EntityKey,
) -> None:
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)

    assert query.get_from_clause().key == expected_entity