def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)), )

    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)), )

    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

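    # Run the delegated pipeline and wait (up to 5 seconds) for the callback,
    # which notifies the condition variable once both pipelines have reported
    # their results.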
    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request(
                "",
                query_body,
                query,
                request_settings,
                "ref",
            ),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY)
        ],
    )
Example #2
def test_clusters() -> None:
    importlib.reload(cluster)
    assert (get_storage(StorageKey("events")).get_cluster() == get_storage(
        StorageKey("errors")).get_cluster())

    assert (get_storage(StorageKey("events")).get_cluster() != get_storage(
        StorageKey("transactions")).get_cluster())
Example #3
def test_cache_partition() -> None:
    assert get_storage(
        StorageKey("transactions")
    ).get_cluster().get_reader().cache_partition_id == "host_2_cache"

    assert get_storage(
        StorageKey("errors")
    ).get_cluster().get_reader().cache_partition_id is None
Example #4
def test_clusters() -> None:
    assert (
        get_storage(StorageKey("events")).get_cluster()
        == get_storage(StorageKey("errors")).get_cluster()
    )

    assert (
        get_storage(StorageKey("events")).get_cluster()
        != get_storage(StorageKey("transactions")).get_cluster()
    )
Example #5
def test_storage_selector() -> None:
    state.set_config("enable_events_readonly_table", True)

    storage = get_storage(StorageKey.EVENTS)
    storage_ro = get_storage(StorageKey.EVENTS_RO)

    query = Query({}, storage.get_schema().get_data_source())

    storage_selector = EventsQueryStorageSelector(storage, storage_ro)
    assert (storage_selector.select_storage(
        query, HTTPRequestSettings(consistent=False)).storage == storage_ro)
    assert (storage_selector.select_storage(
        query, HTTPRequestSettings(consistent=True)).storage == storage)
Example #6
def test_storage_selector() -> None:
    state.set_config("enable_events_readonly_table", True)

    storage = get_storage(StorageKey.ERRORS)
    storage_ro = get_storage(StorageKey.ERRORS_RO)

    query = Query(Entity(EntityKey.EVENTS, ColumnSet([])), selected_columns=[])

    storage_selector = ErrorsQueryStorageSelector(mappers=errors_translators)
    assert (storage_selector.select_storage(
        query, HTTPRequestSettings(consistent=False)).storage == storage_ro)
    assert (storage_selector.select_storage(
        query, HTTPRequestSettings(consistent=True)).storage == storage)
Example #7
    def test_tags_hash_map(self) -> None:
        """
        Adds an event and ensures the tags_hash_map is properly populated
        including escaping.
        """
        self.event = get_raw_event()
        self.event["data"]["tags"].append(["test_tag1", "value1"])
        self.event["data"]["tags"].append(["test_tag=2", "value2"])  # Requires escaping
        self.write_unprocessed_events([self.event])

        clickhouse = (
            get_storage(StorageKey.EVENTS)
            .get_cluster()
            .get_query_connection(ClickhouseClientSettings.QUERY)
        )

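        # _tags_hash_map holds cityHash64 hashes of "key=value" strings; an
        # "=" inside a tag key is escaped with a backslash before hashing.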
        hashed = clickhouse.execute(
            "SELECT cityHash64('test_tag1=value1'), cityHash64('test_tag\\\\=2=value2')"
        )
        tag1, tag2 = hashed[0]

        event = clickhouse.execute(
            (
                f"SELECT event_id FROM sentry_local WHERE has(_tags_hash_map, {tag1}) "
                f"AND has(_tags_hash_map, {tag2})"
            )
        )
        assert len(event) == 1
        assert event[0][0] == self.event["data"]["id"]
Example #8
def _get_local_table_name(storage_key: StorageKey) -> str:
    try:
        schema = get_storage(storage_key).get_schema()
        assert isinstance(schema, TableSchema)
        return schema.get_table_name()
    except UndefinedClickhouseCluster:
        return "badcluster"
Example #9
    def test_offsets(self):
        event = self.event

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 456),
            123,
            KafkaPayload(None,
                         json.dumps((2, "insert", event)).encode("utf-8"),
                         []),  # event doesn't really matter
            datetime.now(),
        )

        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        clickhouse = (get_storage(
            StorageKey.EVENTS).get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY))

        assert clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" %
            self.table) == [(self.event["project_id"], self.event["event_id"],
                             123, 456)]
Example #10
    def test_send_message(
        self,
        value: str,
        expected: Optional[ProcessedMessage],
    ) -> None:
        storage = get_storage("groupedmessages")
        snapshot_id = uuid1()
        transact_data = TransactionData(xmin=100,
                                        xmax=200,
                                        xip_list=[120, 130])

        worker = SnapshotAwareWorker(
            storage=storage,
            producer=FakeConfluentKafkaProducer(),
            snapshot_id=str(snapshot_id),
            transaction_data=transact_data,
            replacements_topic=None,
            metrics=DummyMetricsBackend(strict=True),
        )

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            1,
            KafkaPayload(
                None,
                value.encode("utf-8"),
                [("table", "sentry_groupedmessage".encode())],
            ),
            datetime.now(),
        )

        ret = worker.process_message(message)
        assert ret == expected
Example #11
def run_migrations() -> Iterator[None]:
    from snuba.migrations.runner import Runner

    Runner().run_all(force=True)

    yield

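    # Teardown: truncate every writable table on all local and distributed
    # nodes, then flush redis so no state leaks between tests.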
    for storage_key in STORAGES:
        storage = get_storage(storage_key)
        cluster = storage.get_cluster()
        database = cluster.get_database()

        schema = storage.get_schema()
        if isinstance(schema, WritableTableSchema):
            table_name = schema.get_local_table_name()

            nodes = [
                *cluster.get_local_nodes(), *cluster.get_distributed_nodes()
            ]
            for node in nodes:
                connection = cluster.get_node_connection(
                    ClickhouseClientSettings.MIGRATE, node)
                connection.execute(
                    f"TRUNCATE TABLE IF EXISTS {database}.{table_name}")

    redis_client.flushdb()
Example #12
    def __init__(self) -> None:

        # The raw table we write onto, and that potentially we could
        # query.
        writable_storage = get_writable_storage(StorageKey.OUTCOMES_RAW)

        # The materialized view we query aggregate data from.
        materialized_storage = get_storage(StorageKey.OUTCOMES_HOURLY)
        read_schema = materialized_storage.get_schema()
        super().__init__(
            storages=[writable_storage, materialized_storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(
                    # TODO: Once we are ready to expose the raw data model and select whether to use
                    # materialized storage or the raw one here, replace this with a custom storage
                    # selector that decides when to use the materialized data.
                    storage=materialized_storage,
                ),
            ),
            abstract_column_set=read_schema.get_columns(),
            join_relationships={},
            writable_storage=writable_storage,
            validators=[EntityRequiredColumnValidator({"org_id"})],
            required_time_column="timestamp",
        )
Example #13
    def __init__(
        self,
        writable_storage_key: StorageKey,
        readable_storage_key: StorageKey,
        value_schema: Sequence[Column[SchemaModifiers]],
        mappers: TranslationMappers,
    ) -> None:
        writable_storage = get_writable_storage(writable_storage_key)
        readable_storage = get_storage(readable_storage_key)

        super().__init__(
            storages=[writable_storage, readable_storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(
                    readable_storage,
                    mappers=TranslationMappers(subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ], ).concat(mappers),
                )),
            abstract_column_set=ColumnSet([
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("tags", Nested([("key", UInt(64)),
                                       ("value", UInt(64))])),
                *value_schema,
            ]),
            join_relationships={},
            writable_storage=writable_storage,
            validators=[
                EntityRequiredColumnValidator({"org_id", "project_id"})
            ],
            required_time_column="timestamp",
        )
Example #14
    def __init__(self) -> None:
        storage = get_storage(StorageKey.OUTCOMES_RAW)

        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=storage.get_schema().get_columns(),
            writable_storage=None,
        )
Example #15
def optimize(
    *,
    clickhouse_host: Optional[str],
    clickhouse_port: Optional[int],
    storage_name: str,
    parallel: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)
    setup_sentry()

    storage: ReadableTableStorage

    storage_key = StorageKey(storage_name)
    storage = get_storage(storage_key)

    (clickhouse_user, clickhouse_password) = storage.get_cluster().get_credentials()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    database = storage.get_cluster().get_database()

    # TODO: In distributed mode, optimize currently must be run once for each node
    # with the host and port of that node provided via the CLI. In the future,
    # passing this information won't be necessary, and running this command once
    # will ensure that optimize is performed on all of the individual nodes for
    # that cluster.
    if clickhouse_host and clickhouse_port:
        connection = ClickhousePool(
            clickhouse_host,
            clickhouse_port,
            clickhouse_user,
            clickhouse_password,
            database,
            send_receive_timeout=ClickhouseClientSettings.OPTIMIZE.value.timeout,
        )
    elif not storage.get_cluster().is_single_node():
        raise click.ClickException("Provide Clickhouse host and port for optimize")
    else:
        connection = storage.get_cluster().get_query_connection(
            ClickhouseClientSettings.OPTIMIZE
        )

    num_dropped = run_optimize(
        connection,
        storage,
        database,
        before=today,
        parallel=parallel,
        clickhouse_host=clickhouse_host,
    )
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))
Example #16
def test_get_local_nodes() -> None:
    with patch.object(ClickhousePool, "execute") as execute:
        execute.return_value = [
            ("host_1", 9000, 1, 1),
            ("host_2", 9000, 2, 1),
        ]

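        # The single-node cluster reports one local node with no shard/replica
        # information, while the multi-node cluster reports both mocked rows.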
        local_cluster = get_storage(StorageKey("events")).get_cluster()
        assert len(local_cluster.get_local_nodes()) == 1
        assert local_cluster.get_local_nodes()[0].host_name == "host_1"
        assert local_cluster.get_local_nodes()[0].port == 9000
        assert local_cluster.get_local_nodes()[0].shard is None
        assert local_cluster.get_local_nodes()[0].replica is None

        distributed_cluster = get_storage(StorageKey("transactions")).get_cluster()
        assert len(distributed_cluster.get_local_nodes()) == 2
        assert distributed_cluster.get_local_nodes()[0].host_name == "host_1"
        assert distributed_cluster.get_local_nodes()[1].host_name == "host_2"
Example #17
def get_storage_info() -> Sequence[Storage]:
    return [{
        "storage_name": storage_key.value,
        "local_table_name": _get_local_table_name(storage_key),
        "local_nodes": _get_local_nodes(storage_key),
    } for storage_key in sorted(STORAGES,
                                key=lambda storage_key: storage_key.value)
            if get_storage(storage_key).get_storage_set_key() not in
            DEV_STORAGE_SETS or settings.ENABLE_DEV_FEATURES]
Example #18
def _get_local_nodes(storage_key: StorageKey) -> Sequence[Node]:
    try:
        storage = get_storage(storage_key)
        return [{
            "host": node.host_name,
            "port": node.port
        } for node in storage.get_cluster().get_local_nodes()]
    except (AssertionError, KeyError, UndefinedClickhouseCluster):
        # If cluster_name is not defined just return an empty list
        return []
Example #19
    def __init__(self) -> None:
        storage = get_storage(StorageKey.OUTCOMES_RAW)

        self.__time_group_columns = {"time": "timestamp"}
        self.__time_parse_columns = ("timestamp", )
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=storage.get_schema().get_columns(),
            writable_storage=None,
        )
Example #20
def get_ro_node_connection(
    clickhouse_host: str,
    clickhouse_port: int,
    storage_name: str,
    client_settings: ClickhouseClientSettings,
) -> ClickhousePool:
    storage_key = None
    try:
        storage_key = StorageKey(storage_name)
    except ValueError:
        raise InvalidStorageError(
            f"storage {storage_name} is not a valid storage name",
            extra_data={"storage_name": storage_name},
        )

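    # Reuse a cached connection for this storage/host pair if one already exists.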
    key = f"{storage_key}-{clickhouse_host}"
    if key in NODE_CONNECTIONS:
        return NODE_CONNECTIONS[key]

    storage = get_storage(storage_key)
    cluster = storage.get_cluster()

    if not is_valid_node(clickhouse_host, clickhouse_port, cluster):
        raise InvalidNodeError(
            f"host {clickhouse_host} and port {clickhouse_port} are not valid",
            extra_data={"host": clickhouse_host, "port": clickhouse_port},
        )

    database = cluster.get_database()

    assert client_settings in {
        ClickhouseClientSettings.QUERY,
        ClickhouseClientSettings.TRACING,
    }, "admin can only use QUERY or TRACING ClickhouseClientSettings"

    if client_settings == ClickhouseClientSettings.QUERY:
        username = settings.CLICKHOUSE_READONLY_USER
        password = settings.CLICKHOUSE_READONLY_PASSWORD
    else:
        username = settings.CLICKHOUSE_TRACE_USER
        password = settings.CLICKHOUSE_TRACE_PASSWORD

    connection = ClickhousePool(
        clickhouse_host,
        clickhouse_port,
        username,
        password,
        database,
        max_pool_size=2,
        client_settings=client_settings.value.settings,
    )
    NODE_CONNECTIONS[key] = connection
    return connection
Example #21
    def __init__(self) -> None:
        storage = get_storage(StorageKey.OUTCOMES_RAW)

        super().__init__(
            storages=[storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(
                    storage=storage), ),
            abstract_column_set=storage.get_schema().get_columns(),
            join_relationships={},
            writable_storage=None,
        )
Example #22
    def __init__(self) -> None:
        storage = get_storage("outcomes_raw")
        read_schema = storage.get_schemas().get_read_schema()

        self.__time_group_columns = {"time": "timestamp"}
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=read_schema.get_columns(),
            writable_storage=None,
            time_group_columns=self.__time_group_columns,
            time_parse_columns=("timestamp",),
        )
Example #23
    def __init__(self) -> None:
        storage = get_storage(StorageKey.OUTCOMES_RAW)

        super().__init__(
            storages=[storage],
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            ),
            abstract_column_set=storage.get_schema().get_columns(),
            join_relationships={},
            writable_storage=None,
            validators=[EntityRequiredColumnValidator({"org_id"})],
            required_time_column="timestamp",
        )
Example #24
    def setup_method(self):
        from snuba.web.views import application

        assert application.testing is True

        self.app = application.test_client()
        self.app.post = partial(self.app.post, headers={"referer": "test"})
        self.storage = get_storage(StorageKey.EVENTS)

        self.replacer = replacer.ReplacerWorker(
            self.storage, DummyMetricsBackend(strict=True))

        self.project_id = 1
        self.event = get_raw_event()
Example #25
    def test(self):
        dataset = get_dataset("events")
        storage = dataset.get_writable_storage()
        assert storage is not None
        table = storage.get_table_writer().get_schema().get_local_table_name()
        clickhouse = (get_storage(
            StorageKey.EVENTS).get_cluster().get_query_connection(
                ClickhouseClientSettings.QUERY))

        assert clickhouse.execute("SELECT COUNT() FROM %s" % table)[0][0] == 0

        perf.run("tests/perf-event.json", dataset)

        assert clickhouse.execute("SELECT COUNT() FROM %s" % table)[0][0] == 1
Example #26
    def __init__(
        self,
        writable_storage_key: Optional[StorageKey],
        readable_storage_key: StorageKey,
        value_schema: Sequence[Column[SchemaModifiers]],
        mappers: TranslationMappers,
        abstract_column_set: Optional[ColumnSet] = None,
        validators: Optional[Sequence[QueryValidator]] = None,
    ) -> None:
        writable_storage = (get_writable_storage(writable_storage_key)
                            if writable_storage_key else None)
        readable_storage = get_storage(readable_storage_key)
        storages = [readable_storage]
        if writable_storage:
            storages.append(writable_storage)

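        # Fall back to a default tag-keyed column set and standard validators
        # when the caller does not provide them.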
        if abstract_column_set is None:
            abstract_column_set = ColumnSet([
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("bucketed_time", DateTime()),
                Column("tags", Nested([("key", UInt(64)),
                                       ("value", UInt(64))])),
                *value_schema,
            ])

        if validators is None:
            validators = [
                EntityRequiredColumnValidator({"org_id", "project_id"}),
                GranularityValidator(minimum=10),
            ]

        super().__init__(
            storages=storages,
            query_pipeline_builder=SimplePipelineBuilder(
                query_plan_builder=SingleStorageQueryPlanBuilder(
                    readable_storage,
                    mappers=TranslationMappers(subscriptables=[
                        SubscriptableMapper(None, "tags", None, "tags"),
                    ], ).concat(mappers),
                )),
            abstract_column_set=abstract_column_set,
            join_relationships={},
            writable_storage=writable_storage,
            validators=validators,
            required_time_column="timestamp",
        )
Example #27
    def setup_method(self, test_method):
        super(TestReplacer, self).setup_method(test_method, "events")

        from snuba.web.views import application

        assert application.testing is True

        self.app = application.test_client()
        self.app.post = partial(self.app.post, headers={"referer": "test"})

        storage = get_storage("events")
        self.replacer = replacer.ReplacerWorker(
            self.clickhouse, storage, DummyMetricsBackend(strict=True))

        self.project_id = 1
Example #28
    def __init__(self) -> None:
        storage = get_writable_storage(StorageKey.EVENTS)
        schema = storage.get_table_writer().get_schema()
        columns = schema.get_columns()
        ro_storage = get_storage(StorageKey.EVENTS_RO)

        super().__init__(
            storages=[storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=EventsQueryStorageSelector(
                    events_table=storage,
                    events_ro_table=ro_storage,
                )),
            abstract_column_set=columns,
            writable_storage=storage,
        )
Example #29
def test_capture_trace() -> None:
    storage = get_storage(StorageKey.ERRORS)
    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.QUERY
    )

    data = clickhouse.execute(
        "SELECT count() FROM errors_local", with_column_types=True, capture_trace=True
    )
    assert data.results == []
    assert data.meta == []
    assert data.trace_output != ""
    assert data.profile is not None
    assert data.profile["elapsed"] > 0
    assert data.profile["bytes"] > 0
    assert data.profile["rows"] > 0
    assert data.profile["blocks"] > 0
Example #30
    def __init__(self) -> None:
        writable_storage = get_writable_storage("sessions_raw")
        materialized_storage = get_storage("sessions_hourly")
        read_schema = materialized_storage.get_schemas().get_read_schema()

        self.__time_group_columns = {"bucketed_started": "started"}
        super().__init__(
            storages=[writable_storage, materialized_storage],
            # TODO: Once we are ready to expose the raw data model and select whether to use
            # materialized storage or the raw one here, replace this with a custom storage
            # selector that decides when to use the materialized data.
            query_plan_builder=SingleStorageQueryPlanBuilder(
                storage=materialized_storage, ),
            abstract_column_set=read_schema.get_columns(),
            writable_storage=writable_storage,
            time_group_columns=self.__time_group_columns,
            time_parse_columns=("started", "received"),
        )