Example #1
def run_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    before: Optional[datetime] = None,
    ignore_cutoff: bool = False,
    parallel: int = 1,
    clickhouse_host: Optional[str] = None,
) -> int:
    start = time.time()
    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    table = schema.get_local_table_name()
    database = storage.get_cluster().get_database()  # supersedes the database argument

    parts = get_partitions_to_optimize(clickhouse, storage, database, table,
                                       before)
    optimize_partition_runner(clickhouse, database, table, parts,
                              ignore_cutoff, parallel, clickhouse_host)
    metrics.timing(
        "optimized_all_parts",
        time.time() - start,
        tags=_get_metrics_tags(table, clickhouse_host),
    )
    return len(parts)
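For orientation, here is a minimal sketch of how run_optimize might be invoked. The pool constructor arguments and the get_storage lookup are assumptions that vary between Snuba versions, so treat this as illustrative rather than verbatim:

from snuba.clickhouse.native import ClickhousePool
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.factory import get_storage

# Hypothetical connection parameters; adjust host, port, and credentials.
clickhouse = ClickhousePool("localhost", 9000, "default", "", "default")
storage = get_storage(StorageKey.ERRORS)

# Optimizes every eligible partition and returns how many were processed.
num_parts = run_optimize(clickhouse, storage, "default", parallel=2)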
Example #2
def run_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    before: Optional[datetime] = None,
) -> int:
    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    table = schema.get_local_table_name()
    database = storage.get_cluster().get_database()  # supersedes the database argument

    parts = get_partitions_to_optimize(clickhouse, storage, database, table,
                                       before)
    optimize_partitions(clickhouse, database, table, parts)
    return len(parts)
Example #3
storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingColumnPromoter(
            mapping_specs={
                "tags": {
                    "environment": "environment",
                    "sentry:release": "release",
                    "sentry:dist": "dist",
                    "sentry:user": "******",
                },
                "contexts": {"trace.trace_id": "trace_id"},
            }
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        ArrayJoinKeyValueOptimizer("tags"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        EventsBooleanContextsProcessor(),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
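As a quick sanity check, a storage defined this way can be introspected through its public accessors: get_schema and get_local_table_name appear in Example #1, and get_query_processors is the matching accessor on ReadableTableStorage. A small illustrative sketch:

schema = storage.get_schema()
print(schema.get_local_table_name())  # "discover_local" per the schema in Example #4
for processor in storage.get_query_processors():
    # The processors run against queries in this registration order.
    print(type(processor).__name__)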
Example #4
])

schema = TableSchema(
    columns=columns,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=[
        "event_id",
        "release",
        "message",
        "transaction_name",
        "environment",
        "project_id",
    ],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(),
    ],
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="timestamp")],
)
Example #5
    dist_table_name=READ_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
)
materialized_view_schema = TableSchema(
    local_table_name=READ_LOCAL_MV_NAME,
    dist_table_name=READ_DIST_MV_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    columns=read_columns,
)

# The raw table we write to, and which we could potentially query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.SESSIONS_RAW,
        processor=SessionsProcessor(),
        default_topic_name="ingest-sessions",
    ),
)
# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"])],
)
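To make the write/read split concrete, here is a hedged sketch of how such a pair is typically consumed: rows arrive through the raw storage's stream loader, while queries read the pre-aggregated hourly view. The accessor names below are assumptions based on the classes used above:

# Writes flow into the raw table through the Kafka stream loader...
stream_loader = raw_storage.get_table_writer().get_stream_loader()

# ...while reads are served from the hourly materialized view.
print(materialized_storage.get_schema().get_local_table_name())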
Example #6
    Column("granularity", UInt(8)),
    Column("timestamp", DateTime()),
    Column("retention_days", UInt(16)),
    Column(
        "tags",
        Nested([("key", UInt(64)), ("indexed_value", UInt(64)),
                ("raw_value", String())]),
    ),
    Column("_raw_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))),
    Column("_indexed_tags_hash", Array(UInt(64),
                                       SchemaModifiers(readonly=True))),
]

sets_storage = ReadableTableStorage(
    storage_key=StorageKey.GENERIC_METRICS_SETS,
    storage_set_key=StorageSetKey.GENERIC_METRICS_SETS,
    schema=TableSchema(
        local_table_name="generic_metrics_sets_local",
        dist_table_name="generic_metrics_sets_dist",
        storage_set_key=StorageSetKey.GENERIC_METRICS_SETS,
        columns=ColumnSet([
            *aggregated_columns,
            Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
        ]),
    ),
    query_processors=[
        ArrayJoinKeyValueOptimizer("tags"),
        TableRateLimit(),
    ],
)
Example #7
def get_partitions_to_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    table: str,
    before: Optional[datetime] = None,
) -> Sequence[util.Part]:
    engine = clickhouse.execute(
        """
        SELECT engine
        FROM system.tables
        WHERE (database = %(database)s) AND (name = %(table)s)
        """,
        {
            "database": database,
            "table": table
        },
    )

    if not engine:
        logger.warning("Table %s.%s doesn't exist on %s:%s" %
                       (database, table, clickhouse.host, clickhouse.port))
        return []

    if engine[0][0].startswith("Replicated"):
        is_leader = clickhouse.execute(
            """
            SELECT is_leader
            FROM system.replicas
            WHERE (database = %(database)s) AND (table = %(table)s)
            """,
            {
                "database": database,
                "table": table
            },
        )

        # response: [(0,)] for non-leader or [(1,)] for leader
        if not (len(is_leader) == 1 and is_leader[0][0]):
            return []

    active_parts = clickhouse.execute(
        """
        SELECT
            partition,
            count() AS c
        FROM system.parts
        WHERE active
        AND database = %(database)s
        AND table = %(table)s
        GROUP BY partition
        HAVING c > 1
        ORDER BY c DESC, partition
        """,
        {
            "database": database,
            "table": table
        },
    )

    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    part_format = schema.get_part_format()
    assert part_format is not None

    parts = [
        util.decode_part_str(part, part_format) for part, count in active_parts
    ]

    if before:
        parts = [
            p for p in parts
            if (p.date + timedelta(days=6 - p.date.weekday())) < before
        ]

    return parts
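The cutoff filter at the end rounds each partition date forward to the Sunday that closes its week (weekday() is 0 for Monday, 6 for Sunday) and keeps only parts whose week ends strictly before the cutoff. A small worked example with hypothetical dates:

from datetime import datetime, timedelta

part_date = datetime(2022, 3, 2)          # a Wednesday, so weekday() == 2
week_end = part_date + timedelta(days=6 - part_date.weekday())
assert week_end == datetime(2022, 3, 6)   # the Sunday ending that week

# The comparison is strict: a part is optimized only once `before`
# has moved past the Sunday that closes its week.
assert week_end < datetime(2022, 3, 7)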
Example #8
materialized_view_schema = TableSchema(
    local_table_name="outcomes_mv_hourly_local",
    dist_table_name="outcomes_mv_hourly_dist",
    storage_set_key=StorageSetKey.OUTCOMES,
    columns=materialized_view_columns,
)

raw_storage = WritableTableStorage(
    storage_key=StorageKey.OUTCOMES_RAW,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=raw_schema,
    query_processors=[TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer()],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=OutcomesProcessor(),
        default_topic=Topic.OUTCOMES,
    ),
)

materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.OUTCOMES_HOURLY,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=read_schema,
    query_processors=[
        PrewhereProcessor(["project_id", "org_id"]),
        TableRateLimit()
    ],
    mandatory_condition_checkers=[OrgIdEnforcer()],
)
Example #9
    ("reason", String()),
    ("times_seen", UInt(64)),
])

materialized_view_schema = TableSchema(
    local_table_name="outcomes_mv_hourly_local",
    dist_table_name="outcomes_mv_hourly_dist",
    storage_set_key=StorageSetKey.OUTCOMES,
    prewhere_candidates=["project_id", "org_id"],
    columns=materialized_view_columns,
)

raw_storage = WritableTableStorage(
    storage_key=StorageKey.OUTCOMES_RAW,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.OUTCOMES_RAW,
        processor=OutcomesProcessor(),
        default_topic_name="outcomes",
    ),
)

materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.OUTCOMES_HOURLY,
    storage_set_key=StorageSetKey.OUTCOMES,
    schema=read_schema,
    query_processors=[PrewhereProcessor()],
)
Example #10
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storages import StorageKey

from snuba.datasets.storages.events_common import (
    all_columns,
    mandatory_conditions,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=all_columns,
    local_table_name="sentry_local",
    dist_table_name="sentry_dist_ro",
    storage_set_key=StorageSetKey.EVENTS_RO,
    mandatory_conditions=mandatory_conditions,
)

storage = ReadableTableStorage(
    storage_key=StorageKey.EVENTS_RO,
    storage_set_key=StorageSetKey.EVENTS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
)
Example #11
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=[],
)

schema2 = TableSchema(
    columns=columns2,
    local_table_name="discover_local",
    dist_table_name="discover_dist",
    storage_set_key=StorageSetKey.DISCOVER,
    mandatory_conditions=[],
)

Storage1 = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema1,
)

Storage2 = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema2,
)

merged_columns = ColumnSet([
    ("timestamp", DateTime()),
    ("mismatched1", String(Modifiers(nullable=True))),
    ("mismatched2", String(Modifiers(nullable=True))),
])
Example #12
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.transactions_common import (
    columns,
    mandatory_condition_checkers,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    mandatory_conditions=[],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.TRANSACTIONS_RO,
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
    mandatory_condition_checkers=mandatory_condition_checkers,
)
Example #13
    dist_materialized_view_name="outcomes_mv_hourly_dist",
    prewhere_candidates=["project_id", "org_id"],
    columns=materialized_view_columns,
    query=query,
    local_source_table_name=WRITE_LOCAL_TABLE_NAME,
    local_destination_table_name=READ_LOCAL_TABLE_NAME,
    dist_source_table_name=WRITE_DIST_TABLE_NAME,
    dist_destination_table_name=READ_DIST_TABLE_NAME,
)

raw_storage = WritableTableStorage(
    schemas=StorageSchemas(read_schema=raw_schema, write_schema=raw_schema),
    table_writer=TableWriter(
        write_schema=raw_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    ),
    query_processors=[],
)

materialized_storage = ReadableTableStorage(
    schemas=StorageSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[materialized_view_schema],
    ),
    query_processors=[PrewhereProcessor()],
)
Example #14
    Column("project_id", UInt(64)),
    Column("metric_id", UInt(64)),
    Column("granularity", UInt(32)),
    Column("timestamp", DateTime()),
    Column("retention_days", UInt(16)),
    Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
    Column("_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))),
]

sets_storage = ReadableTableStorage(
    storage_key=StorageKey.METRICS_SETS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_sets_local",
        dist_table_name="metrics_sets_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet([
            *aggregated_columns,
            Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
        ]),
    ),
    query_processors=[ArrayJoinKeyValueOptimizer("tags")],
)

counters_storage = ReadableTableStorage(
    storage_key=StorageKey.METRICS_COUNTERS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_counters_local",
        dist_table_name="metrics_counters_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet([
Example #15
)

storage = ReadableTableStorage(
    storage_key=StorageKey.DISCOVER,
    storage_set_key=StorageSetKey.DISCOVER,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
Example #16
        CounterAggregateProcessor(),
        default_topic=Topic.METRICS,
        dead_letter_queue_policy_creator=produce_policy_creator,
    ),
    write_format=WriteFormat.VALUES,
)

org_counters_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_METRICS_COUNTERS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_counters_v2_local",
        dist_table_name="metrics_counters_v2_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet([
            Column("org_id", UInt(64)),
            Column("project_id", UInt(64)),
            Column("metric_id", UInt(64)),
            Column("granularity", UInt(32)),
            Column("timestamp", DateTime()),
        ]),
    ),
    query_processors=[TableRateLimit()],
)

distributions_storage = WritableTableStorage(
    storage_key=StorageKey.METRICS_DISTRIBUTIONS,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
        local_table_name="metrics_distributions_v2_local",
        dist_table_name="metrics_distributions_v2_dist",