Example #1
    def __init__(self):
        read_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', Nullable(UInt(64))),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(Nullable(String()))),
            ('event_id', Nullable(UUID())),
        ])

        read_schema = MergeTreeSchema(
            columns=read_columns,
            local_table_name='outcomes_raw_local',
            dist_table_name='outcomes_raw_dist',
            order_by='(org_id, project_id, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 16384})

        dataset_schemas = DatasetSchemas(read_schema=read_schema,
                                         write_schema=None,
                                         intermediary_schemas=[])

        super().__init__(dataset_schemas=dataset_schemas,
                         time_group_columns={
                             'time': 'timestamp',
                         },
                         time_parse_columns=('timestamp', ))
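
For orientation, a schema defined this way describes roughly the following ClickHouse table. The DDL below is a hand-written sketch derived from the parameters above, not output captured from Snuba, so the generated statement may differ in detail.

# Hand-written sketch of the DDL implied by read_schema above; Snuba
# generates the real statement itself.
DDL_SKETCH = """
CREATE TABLE outcomes_raw_local (
    org_id UInt64,
    project_id UInt64,
    key_id Nullable(UInt64),
    timestamp DateTime,
    outcome UInt8,
    reason LowCardinality(Nullable(String)),
    event_id Nullable(UUID)
) ENGINE = MergeTree()
PARTITION BY toMonday(timestamp)
ORDER BY (org_id, project_id, timestamp)
SETTINGS index_granularity = 16384
"""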
Example #2
    def __init__(self) -> None:
        read_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", Nullable(UInt(64))),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(Nullable(String()))),
            ("event_id", Nullable(UUID())),
        ])

        read_schema = MergeTreeSchema(
            columns=read_columns,
            local_table_name="outcomes_raw_local",
            dist_table_name="outcomes_raw_dist",
            order_by="(org_id, project_id, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 16384},
            migration_function=outcomes_raw_migrations,
        )

        dataset_schemas = DatasetSchemas(read_schema=read_schema,
                                         write_schema=None,
                                         intermediary_schemas=[])

        super().__init__(
            dataset_schemas=dataset_schemas,
            time_group_columns={"time": "timestamp"},
            time_parse_columns=("timestamp", ),
        )
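
This is the same schema as Example #1 plus a migration_function hook. The sketch below only illustrates the general idea; the real outcomes_raw_migrations is not shown in this example, and the assumption that such a function maps the currently deployed columns to a list of ALTER statements is exactly that, an assumption.

# Hypothetical sketch; the signature and body of the real
# outcomes_raw_migrations are assumptions, not taken from the source.
def outcomes_raw_migrations_sketch(clickhouse_table, current_columns):
    statements = []
    # e.g. backfill a column that shipped after the table was created
    if "event_id" not in current_columns:
        statements.append(
            "ALTER TABLE %s ADD COLUMN event_id Nullable(UUID)" % clickhouse_table
        )
    return statements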
Example #3
from snuba.clickhouse.columns import (
    ColumnSet,
    Nested,
    String,
    UInt,
)
from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinClause,
    JoinType,
    TableJoinNode,
)

table1 = MergeTreeSchema(
    columns=ColumnSet([
        ("t1c1", UInt(64)),
        ("t1c2", String()),
        ("t1c3", Nested([("t11c4", UInt(64))])),
    ]),
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
).get_data_source()

table2 = MergeTreeSchema(
    columns=ColumnSet([
        ("t2c1", UInt(64)),
        ("t2c2", String()),
        ("t2c3", Nested([("t21c4", UInt(64))])),
    ]),
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",
Example #4
    def __init__(self):
        write_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', Nullable(UInt(64))),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(Nullable(String()))),
            ('event_id', Nullable(UUID())),
        ])

        write_schema = MergeTreeSchema(
            columns=write_columns,
            # TODO: change to outcomes.raw_local when we add multi DB support
            local_table_name=WRITE_LOCAL_TABLE_NAME,
            dist_table_name=WRITE_DIST_TABLE_NAME,
            order_by='(org_id, project_id, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 16384})

        read_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', UInt(64)),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', LowCardinality(String())),
            ('times_seen', UInt(64)),
        ])

        read_schema = SummingMergeTreeSchema(
            columns=read_columns,
            local_table_name=READ_LOCAL_TABLE_NAME,
            dist_table_name=READ_DIST_TABLE_NAME,
            order_by='(org_id, project_id, key_id, outcome, reason, timestamp)',
            partition_by='(toMonday(timestamp))',
            settings={'index_granularity': 256})

        materialized_view_columns = ColumnSet([
            ('org_id', UInt(64)),
            ('project_id', UInt(64)),
            ('key_id', UInt(64)),
            ('timestamp', DateTime()),
            ('outcome', UInt(8)),
            ('reason', String()),
            ('times_seen', UInt(64)),
        ])

        # TODO: Find a better way to specify a query for a materialized view
        # The problem right now is that we have a way to define our columns in a ColumnSet abstraction but the query
        # doesn't use it.
        query = """
               SELECT
                   org_id,
                   project_id,
                   ifNull(key_id, 0) AS key_id,
                   toStartOfHour(timestamp) AS timestamp,
                   outcome,
                   ifNull(reason, 'none') AS reason,
                   count() AS times_seen
               FROM %(source_table_name)s
               GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
               """

        materialized_view = MaterializedViewSchema(
            local_materialized_view_name='outcomes_mv_hourly_local',
            dist_materialized_view_name='outcomes_mv_hourly_dist',
            columns=materialized_view_columns,
            query=query,
            local_source_table_name=WRITE_LOCAL_TABLE_NAME,
            local_destination_table_name=READ_LOCAL_TABLE_NAME,
            dist_source_table_name=WRITE_DIST_TABLE_NAME,
            dist_destination_table_name=READ_DIST_TABLE_NAME)

        dataset_schemas = DatasetSchemas(
            read_schema=read_schema,
            write_schema=write_schema,
            intermediary_schemas=[materialized_view])

        super(OutcomesDataset, self).__init__(
            dataset_schemas=dataset_schemas,
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        )
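
The MaterializedViewSchema above describes a ClickHouse materialized view that intercepts every insert into the raw table and writes hourly rollups into the SummingMergeTree table. Below is a hand-written sketch of the statement it corresponds to, reusing the query and the table-name constants from the example; Snuba renders the real DDL itself.

# Sketch only; Snuba renders the actual statement from the schema objects.
mv_ddl_sketch = (
    "CREATE MATERIALIZED VIEW outcomes_mv_hourly_local "
    "TO " + READ_LOCAL_TABLE_NAME + " AS "
    + query % {"source_table_name": WRITE_LOCAL_TABLE_NAME}
)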
Example #5
    ("project_id", UInt(64)),
    ("retention_days", UInt(16)),
    ("duration", UInt(32)),
    ("status", UInt(8)),
    ("errors", UInt(16)),
    ("received", DateTime()),
    ("started", DateTime()),
    ("release", LowCardinality(String())),
    ("environment", LowCardinality(String())),
])

raw_schema = MergeTreeSchema(
    columns=all_columns,
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    order_by="(org_id, project_id, release, environment, started)",
    partition_by="(toMonday(started))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("started", DateTime()),
    ("release", LowCardinality(String())),
    ("environment", LowCardinality(String())),
    (
        "duration_quantiles",
        AggregateFunction("quantilesIf(0.5, 0.9)", UInt(32), UInt(8)),
    ),
    # ...
])
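
duration_quantiles is an AggregateFunction column: it stores the intermediate state of quantilesIf(0.5, 0.9) over (UInt32, UInt8) arguments rather than finished values. In standard ClickHouse usage, state is written with the -State combinator and collapsed with -Merge. The SQL below is illustrative, not Snuba's real rollup query, and the aggregate table name is hypothetical.

# Illustrative ClickHouse SQL; the filter condition, grouping, and the
# aggregate table name are examples, not taken from the Snuba source.
write_sketch = (
    "SELECT quantilesIfState(0.5, 0.9)(duration, duration <> 0) "
    "AS duration_quantiles FROM " + WRITE_LOCAL_TABLE_NAME + " "
    "GROUP BY org_id, project_id, started, release, environment"
)
read_sketch = (
    "SELECT quantilesIfMerge(0.5, 0.9)(duration_quantiles) "
    "FROM sessions_hourly_local GROUP BY org_id, project_id"  # table name hypothetical
)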
Example #6
            ("cache_hit", UInt(8)),
            ("sample", Float(32)),
            ("max_threads", UInt(8)),
            ("num_days", UInt(32)),
            ("clickhouse_table", LowCardinality(String())),
            ("query_id", String()),
            ("is_duplicate", UInt(8)),
            ("consistent", UInt(8)),
        ]),
    ),
])

schema = MergeTreeSchema(
    columns=columns,
    local_table_name="querylog_local",
    dist_table_name="querylog_dist",
    order_by="(toStartOfDay(timestamp), request_id)",
    partition_by="(toMonday(timestamp))",
    sample_expr="request_id",
)

storage = WritableTableStorage(
    schemas=StorageSchemas(read_schema=schema, write_schema=schema),
    table_writer=TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=QuerylogProcessor(),
            default_topic=settings.QUERIES_TOPIC,
        ),
    ),
    query_processors=[],
)
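
sample_expr adds a SAMPLE BY clause to the generated table, and ClickHouse requires the sampling expression to be part of the sorting key, which is why request_id also appears in order_by here. A hand-written sketch of the engine clause this implies (illustrative, not Snuba output):

# Illustrative only; Snuba generates the real DDL from the schema above.
engine_sketch = """
ENGINE = MergeTree()
PARTITION BY toMonday(timestamp)
ORDER BY (toStartOfDay(timestamp), request_id)
SAMPLE BY request_id
"""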
Example #7
    def __init__(self) -> None:
        write_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", Nullable(UInt(64))),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(Nullable(String()))),
            ("event_id", Nullable(UUID())),
        ])

        write_schema = MergeTreeSchema(
            columns=write_columns,
            # TODO: change to outcomes.raw_local when we add multi DB support
            local_table_name=WRITE_LOCAL_TABLE_NAME,
            dist_table_name=WRITE_DIST_TABLE_NAME,
            order_by="(org_id, project_id, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 16384},
        )

        read_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", UInt(64)),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", LowCardinality(String())),
            ("times_seen", UInt(64)),
        ])

        read_schema = SummingMergeTreeSchema(
            columns=read_columns,
            local_table_name=READ_LOCAL_TABLE_NAME,
            dist_table_name=READ_DIST_TABLE_NAME,
            order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
            partition_by="(toMonday(timestamp))",
            settings={"index_granularity": 256},
        )

        materialized_view_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("key_id", UInt(64)),
            ("timestamp", DateTime()),
            ("outcome", UInt(8)),
            ("reason", String()),
            ("times_seen", UInt(64)),
        ])

        # TODO: Find a better way to specify a query for a materialized view
        # The problem right now is that we have a way to define our columns in a ColumnSet abstraction but the query
        # doesn't use it.
        query = """
               SELECT
                   org_id,
                   project_id,
                   ifNull(key_id, 0) AS key_id,
                   toStartOfHour(timestamp) AS timestamp,
                   outcome,
                   ifNull(reason, 'none') AS reason,
                   count() AS times_seen
               FROM %(source_table_name)s
               GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
               """

        materialized_view = MaterializedViewSchema(
            local_materialized_view_name="outcomes_mv_hourly_local",
            dist_materialized_view_name="outcomes_mv_hourly_dist",
            prewhere_candidates=["project_id", "org_id"],
            columns=materialized_view_columns,
            query=query,
            local_source_table_name=WRITE_LOCAL_TABLE_NAME,
            local_destination_table_name=READ_LOCAL_TABLE_NAME,
            dist_source_table_name=WRITE_DIST_TABLE_NAME,
            dist_destination_table_name=READ_DIST_TABLE_NAME,
        )

        dataset_schemas = DatasetSchemas(
            read_schema=read_schema,
            write_schema=write_schema,
            intermediary_schemas=[materialized_view],
        )

        table_writer = TableWriter(
            write_schema=write_schema,
            stream_loader=KafkaStreamLoader(
                processor=OutcomesProcessor(),
                default_topic="outcomes",
            ),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={"time": "timestamp"},
            time_parse_columns=("timestamp", ),
        )
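
One property of this layout worth keeping in mind: SummingMergeTree collapses rows sharing the full sorting key only when background merges run, so readers cannot assume one row per key. Queries against the hourly table should therefore still aggregate explicitly; the SQL below is illustrative, not from the Snuba source.

# Explicit sum() is required because row collapsing in SummingMergeTree
# happens only at (eventual) merge time.
read_sketch = (
    "SELECT org_id, project_id, outcome, sum(times_seen) AS times_seen "
    "FROM " + READ_DIST_TABLE_NAME + " "
    "GROUP BY org_id, project_id, outcome"
)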
Example #8
write_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", Nullable(UInt(64))),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(Nullable(String()))),
    ("event_id", Nullable(UUID())),
])

raw_schema = MergeTreeSchema(
    columns=write_columns,
    # TODO: change to outcomes.raw_local when we add multi DB support
    local_table_name=WRITE_LOCAL_TABLE_NAME,
    dist_table_name=WRITE_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
    order_by="(org_id, project_id, timestamp)",
    partition_by="(toMonday(timestamp))",
    settings={"index_granularity": "16384"},
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64)),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", LowCardinality(String())),
    ("times_seen", UInt(64)),
])
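
Note that key_id and reason are Nullable in write_columns but not in read_columns. The conversion happens at rollup time: the hourly materialized view query shown in the earlier outcomes examples defaults them with

    ifNull(key_id, 0) AS key_id,
    ifNull(reason, 'none') AS reason,

before the rows land in the non-Nullable aggregate table.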
Example #9
import pytest

from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinStructure,
    SchemaJoinedSource,
    SubJoinSource,
    JoinType,
)

table1 = MergeTreeSchema(
    columns=None,
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
)

table2 = MergeTreeSchema(
    columns=None,
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",
    partition_by="",
)

table3 = MergeTreeSchema(
    columns=None,
    local_table_name="table3",
Example #10
from snuba.clickhouse.columns import ColumnSet, Nested, String, UInt
from snuba.datasets.schemas.tables import MergeTreeSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinClause,
    JoinType,
    TableJoinNode,
)

table1 = MergeTreeSchema(
    columns=ColumnSet([
        ("t1c1", UInt(64)),
        ("t1c2", String()),
        ("t1c3", Nested([("t11c4", UInt(64))])),
    ]),
    local_table_name="table1",
    dist_table_name="table1",
    order_by="",
    partition_by="",
)

table2 = MergeTreeSchema(
    columns=ColumnSet([
        ("t2c1", UInt(64)),
        ("t2c2", String()),
        ("t2c3", Nested([("t21c4", UInt(64))])),
    ]),
    local_table_name="table2",
    dist_table_name="table2",
    order_by="",