Ejemplo n.º 1
0
 def _get_engine_type(self, cluster: ClickhouseCluster, table_name: str) -> str:
     """Build the engine expression for ``table_name`` on ``cluster``.

     A single-node cluster gets a plain ``ReplacingMergeTree`` on the
     version column. Any other cluster gets a
     ``ReplicatedReplacingMergeTree`` whose path contains ``all`` when
     ``self._unsharded`` is ``True`` and the literal ``{shard}``
     placeholder otherwise.
     """
     # The database name is fetched unconditionally, even though the
     # single-node branch does not use it.
     db_name = cluster.get_database()

     if cluster.is_single_node():
         return f"ReplacingMergeTree({self.__version_column})"

     # Only the shard segment of the path differs between the two
     # replicated variants.
     shard_part = "all" if self._unsharded is True else "{shard}"
     replication_path = f"/clickhouse/tables/{self._storage_set_value}/{shard_part}/{db_name}/{table_name}"
     return f"ReplicatedReplacingMergeTree('{replication_path}', '{{replica}}', {self.__version_column})"
Ejemplo n.º 2
0
    def get_sql(self, cluster: ClickhouseCluster, table_name: str) -> str:
        """Return the ``Distributed(...)`` engine expression for ``cluster``.

        The expression names the cluster, the database and the local table,
        followed by the sharding key when one is set. ``table_name`` is not
        used by this implementation.

        Raises:
            AssertionError: if ``cluster`` is single-node or reports no
                ClickHouse cluster name.
        """
        dist_cluster_name = cluster.get_clickhouse_cluster_name()
        assert not cluster.is_single_node()
        assert dist_cluster_name is not None
        db_name = cluster.get_database()

        # The sharding key is an optional trailing engine argument.
        if self.__sharding_key:
            sharding_suffix = f", {self.__sharding_key}"
        else:
            sharding_suffix = ""

        return f"Distributed({dist_cluster_name}, {db_name}, {self.__local_table_name}{sharding_suffix})"
Ejemplo n.º 3
0
def test_no_split(dataset_name: str, id_column: str, project_column: str,
                  timestamp_column: str) -> None:
    """Execute a bare query through the column and time split strategies.

    The query is built only from the data source of the dataset's first
    storage. The runner returns an empty result, so the test passes as long
    as ``execute`` completes without raising.
    """
    dataset = get_dataset(dataset_name)
    first_storage = dataset.get_default_entity().get_all_storages()[0]
    clickhouse_query = ClickhouseQuery(
        first_storage.get_schema().get_data_source())

    def run_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader,
    ) -> QueryResult:
        # NOTE(review): this compares the runner's own argument with itself,
        # so it does not check that the query reached the runner unchanged.
        assert query == query
        return QueryResult({}, {})

    cluster = ClickhouseCluster("localhost", 1024, "default", "", "default",
                                80, set(), True)
    splitters = [
        ColumnSplitQueryStrategy(
            id_column=id_column,
            project_column=project_column,
            timestamp_column=timestamp_column,
        ),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    plan_strategy = SimpleQueryPlanExecutionStrategy(cluster, [], splitters)

    plan_strategy.execute(clickhouse_query, HTTPRequestSettings(), run_query)
Ejemplo n.º 4
0
 def _get_engine_type(self, cluster: ClickhouseCluster,
                      table_name: str) -> str:
     """Return the engine expression for ``table_name`` on ``cluster``.

     Multi-node clusters get a ``ReplicatedMergeTree`` built from the path
     returned by ``self._get_zookeeper_path``; single-node clusters get a
     plain ``MergeTree()``.
     """
     if not cluster.is_single_node():
         replication_path = self._get_zookeeper_path(cluster, table_name)
         return f"ReplicatedMergeTree({replication_path}, '{{replica}}')"

     return "MergeTree()"
Ejemplo n.º 5
0
 def _get_engine_type(self, cluster: ClickhouseCluster,
                      table_name: str) -> str:
     """Return the engine expression for ``table_name`` on ``cluster``.

     Single-node clusters get ``MergeTree()``. Otherwise a
     ``ReplicatedMergeTree`` is returned whose path contains ``all`` when
     ``self._unsharded`` is ``True`` and the literal ``{shard}``
     placeholder otherwise.
     """
     if cluster.is_single_node():
         return "MergeTree()"

     # Only the shard segment of the path differs between the two
     # replicated variants.
     shard_part = "all" if self._unsharded is True else "{shard}"
     replication_path = f"/clickhouse/tables/{self._storage_set_value}/{shard_part}/{table_name}"
     return f"ReplicatedMergeTree('{replication_path}', '{{replica}}')"
Ejemplo n.º 6
0
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Run a query through the split strategies with a canned runner.

    The runner answers with ``first_query_data`` or ``second_query_data``
    depending on which of the two key sets the selected columns match, and
    raises ``ValueError`` for any other selection.
    """

    def run_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        selected_cols = query.get_selected_columns()
        # The legacy column list and the AST column list must agree.
        assert selected_cols == [
            selected.expression.column_name
            for selected in query.get_selected_columns_from_ast() or []
            if isinstance(selected.expression, Column)
        ]
        # The first dataset is checked before the second.
        for canned_rows in (first_query_data, second_query_data):
            if selected_cols == list(canned_rows[0].keys()):
                return QueryResult({"data": canned_rows}, {})
        raise ValueError(f"Unexpected selected columns: {selected_cols}")

    dataset = get_dataset(dataset_name)
    second_columns = list(second_query_data[0].keys())
    query_body = {
        "selected_columns": second_columns,
        "conditions": [""],
        "orderby": "events.event_id",
        "sample": 10,
        "limit": 100,
        "offset": 50,
    }
    data_source = dataset.get_all_storages()[0].get_schema().get_data_source()
    selected_expressions = [
        SelectedExpression(name=col_name,
                           expression=Column(None, None, col_name))
        for col_name in second_columns
    ]
    clickhouse_query = ClickhouseQuery(
        LogicalQuery(
            query_body,
            data_source,
            selected_columns=selected_expressions,
        ))

    cluster = ClickhouseCluster("localhost", 1024, "default", "", "default",
                                80, set(), True)
    splitters = [
        ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    plan_strategy = SimpleQueryPlanExecutionStrategy(cluster, [], splitters)

    plan_strategy.execute(clickhouse_query, HTTPRequestSettings(), run_query)
Ejemplo n.º 7
0
 def _get_engine_type(self, cluster: ClickhouseCluster,
                      table_name: str) -> str:
     """Return the replacing-merge-tree engine expression for ``table_name``.

     Single-node clusters get ``ReplacingMergeTree``; other clusters get
     ``ReplicatedReplacingMergeTree`` built from the path returned by
     ``self._get_zookeeper_path``. The version column is added as the
     last engine argument only when it is set.
     """
     if cluster.is_single_node():
         version_arg = f"{self.__version_column}" if self.__version_column else ""
         return f"ReplacingMergeTree({version_arg})"

     replication_path = self._get_zookeeper_path(cluster, table_name)
     # The version column, when present, follows the replica argument.
     version_suffix = f", {self.__version_column}" if self.__version_column else ""
     return f"ReplicatedReplacingMergeTree({replication_path}, '{{replica}}'{version_suffix})"
Ejemplo n.º 8
0
    def _get_zookeeper_path(self, cluster: ClickhouseCluster,
                            table_name: str) -> str:
        """Return the single-quoted path for ``table_name`` on ``cluster``.

        The default path is built from the storage set value, a shard
        segment, the cluster's database and the table name. The shard
        segment is ``all`` when ``self._unsharded`` is ``True`` and the
        literal ``{shard}`` placeholder otherwise. If
        ``settings.CLICKHOUSE_ZOOKEEPER_OVERRIDE`` has an entry keyed by the
        default path, that entry is returned instead.
        """
        db_name = cluster.get_database()
        shard_part = "all" if self._unsharded is True else "{shard}"
        default_path = f"/clickhouse/tables/{self._storage_set_value}/{shard_part}/{db_name}/{table_name}"

        # Fall back to the default path when no override is configured.
        overrides = settings.CLICKHOUSE_ZOOKEEPER_OVERRIDE
        return f"'{overrides.get(default_path, default_path)}'"
Ejemplo n.º 9
0
def pytest_configure() -> None:
    """
    Set up the Sentry SDK and recreate the test databases.

    Refuses to run unless ``settings.TESTING`` is truthy. For every entry in
    ``settings.CLUSTERS`` the configured database is dropped (if it exists)
    and created again on each local and distributed node of that cluster.
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

    setup_sentry()

    for cluster_config in settings.CLUSTERS:
        # The two cluster-name settings are optional and default to None.
        cluster = ClickhouseCluster(
            host=cluster_config["host"],
            port=cluster_config["port"],
            user="******",
            password="",
            database="default",
            http_port=cluster_config["http_port"],
            storage_sets=cluster_config["storage_sets"],
            single_node=cluster_config["single_node"],
            cluster_name=(cluster_config["cluster_name"]
                          if "cluster_name" in cluster_config else None),
            distributed_cluster_name=(
                cluster_config["distributed_cluster_name"]
                if "distributed_cluster_name" in cluster_config else None),
        )

        database_name = cluster_config["database"]
        reset_statements = (
            f"DROP DATABASE IF EXISTS {database_name};",
            f"CREATE DATABASE {database_name};",
        )

        # Both node lists are collected before any connection is opened.
        for node in (*cluster.get_local_nodes(),
                     *cluster.get_distributed_nodes()):
            connection = cluster.get_node_connection(
                ClickhouseClientSettings.MIGRATE, node)
            for statement in reset_statements:
                connection.execute(statement)
Ejemplo n.º 10
0
def test_col_split(
    dataset_name: str,
    id_column: str,
    project_column: str,
    timestamp_column: str,
    first_query_data: Sequence[MutableMapping[str, Any]],
    second_query_data: Sequence[MutableMapping[str, Any]],
) -> None:
    """Run a query through the split strategies with a canned runner.

    The runner answers with ``first_query_data`` or ``second_query_data``
    depending on which of the two key sets the selected column names match,
    and raises ``ValueError`` for any other selection.
    """

    def run_query(
        query: ClickhouseQuery,
        query_settings: QuerySettings,
        reader: Reader,
    ) -> QueryResult:
        # Only plain Column expressions contribute a name.
        selected_col_names = [
            selected.expression.column_name
            for selected in query.get_selected_columns() or []
            if isinstance(selected.expression, Column)
        ]
        # The first dataset is checked before the second.
        for canned_rows in (first_query_data, second_query_data):
            if selected_col_names == list(canned_rows[0].keys()):
                return QueryResult({"data": canned_rows}, {})
        raise ValueError(
            f"Unexpected selected columns: {selected_col_names}")

    dataset = get_dataset(dataset_name)
    first_storage = dataset.get_default_entity().get_all_storages()[0]
    data_source = first_storage.get_schema().get_data_source()
    selected_expressions = [
        SelectedExpression(name=col_name,
                           expression=Column(None, None, col_name))
        for col_name in second_query_data[0].keys()
    ]
    clickhouse_query = ClickhouseQuery(
        data_source,
        selected_columns=selected_expressions,
    )

    cluster = ClickhouseCluster("localhost", 1024, "default", "", "default",
                                80, set(), True)
    splitters = [
        ColumnSplitQueryStrategy(id_column, project_column, timestamp_column),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    plan_strategy = SimpleQueryPlanExecutionStrategy(cluster, [], splitters)

    plan_strategy.execute(clickhouse_query, HTTPQuerySettings(), run_query)
Ejemplo n.º 11
0
def test_no_split(
    dataset_name: str, id_column: str, project_column: str, timestamp_column: str
) -> None:
    """Execute a simple query through the column and time split strategies.

    The runner returns an empty result, so the test passes as long as
    ``execute`` completes without raising.
    """
    dataset = get_dataset(dataset_name)
    query_body = {
        "selected_columns": ["event_id"],
        "conditions": [""],
        "orderby": "event_id",
        "sample": 10,
        "limit": 100,
        "offset": 50,
    }
    data_source = dataset.get_all_storages()[0].get_schema().get_data_source()
    clickhouse_query = ClickhouseQuery(LogicalQuery(query_body, data_source))

    def run_query(
        query: ClickhouseQuery,
        request_settings: RequestSettings,
        reader: Reader[SqlQuery],
    ) -> QueryResult:
        # NOTE(review): this compares the runner's own argument with itself,
        # so it does not check that the query reached the runner unchanged.
        assert query == query
        return QueryResult({}, {})

    cluster = ClickhouseCluster(
        "localhost", 1024, "default", "", "default", 80, set(), True
    )
    splitters = [
        ColumnSplitQueryStrategy(
            id_column=id_column,
            project_column=project_column,
            timestamp_column=timestamp_column,
        ),
        TimeSplitQueryStrategy(timestamp_col=timestamp_column),
    ]
    plan_strategy = SimpleQueryPlanExecutionStrategy(cluster, [], splitters)

    plan_strategy.execute(clickhouse_query, HTTPRequestSettings(), run_query)
Ejemplo n.º 12
0
import pytest

from snuba.clusters.cluster import ClickhouseCluster
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import table_engines

# Connection settings shared by both fixture clusters below.
_shared_cluster_kwargs = {
    "port": 9000,
    "user": "******",
    "password": "",
    "database": "default",
    "http_port": 8123,
}

# Fixture cluster configured with single_node=True and no cluster names.
single_node_cluster = ClickhouseCluster(
    host="host_1",
    storage_sets={"events"},
    single_node=True,
    **_shared_cluster_kwargs,
)

# Fixture cluster configured with single_node=False and explicit local and
# distributed cluster names.
multi_node_cluster = ClickhouseCluster(
    host="host_2",
    storage_sets={"events"},
    single_node=False,
    cluster_name="cluster_1",
    distributed_cluster_name="dist_hosts",
    **_shared_cluster_kwargs,
)

merge_test_cases = [
Ejemplo n.º 13
0
def is_valid_node(host: str, port: int, cluster: ClickhouseCluster) -> bool:
    """Return whether ``host``/``port`` identifies a node of ``cluster``.

    The candidates are the cluster's local nodes plus its query node; a
    candidate matches when both its ``host_name`` and its ``port`` equal the
    given values.
    """
    candidates = list(cluster.get_local_nodes())
    candidates.append(cluster.get_query_node())

    for candidate in candidates:
        if candidate.host_name == host and candidate.port == port:
            return True
    return False