Example #1
    def test_accepted_services(self):
        v1 = ServiceVersionRequirement(service="postgresql",
                                       supported_version="==14.0.0")
        v2 = ServiceVersionRequirement(service="clickhouse",
                                       supported_version="==21.6.0")
        v3 = ServiceVersionRequirement(service="redis",
                                       supported_version="==6.2.6")

        self.assertEqual(v1.service, "postgresql")
        self.assertEqual(v2.service, "clickhouse")
        self.assertEqual(v3.service, "redis")

        self.assertEqual(type(v1.supported_version), SimpleSpec)
        self.assertEqual(type(v2.supported_version), SimpleSpec)
        self.assertEqual(type(v3.supported_version), SimpleSpec)

        self.assertEqual(str(v1.supported_version), "==14.0.0")
        self.assertEqual(str(v2.supported_version), "==21.6.0")
        self.assertEqual(str(v3.supported_version), "==6.2.6")

        with self.assertRaises(Exception) as ctx:
            ServiceVersionRequirement(service="kea",
                                      supported_version="==2.5.0")
        self.assertEqual(
            str(ctx.exception),
            "service kea cannot be used to specify a version requirement. service should be one of clickhouse, postgresql, redis",
        )
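The supported_version strings above are parsed into SimpleSpec objects. Below is a minimal sketch of the range semantics, assuming SimpleSpec comes from the semantic_version package (an assumption based on the syntax; this snippet does not show the import):

# Minimal sketch, assuming SimpleSpec is semantic_version.SimpleSpec.
from semantic_version import SimpleSpec, Version

spec = SimpleSpec(">=12.0.0,<12.1.2")  # comma-separated clauses are ANDed
assert Version("12.1.1") in spec       # within both bounds
assert Version("12.1.2") not in spec   # the upper bound is exclusive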
Example #2
class Migration(AsyncMigrationDefinition):

    description = (
        "Schema change to the events table ensuring our SAMPLE BY clause is compatible with ClickHouse >=21.7.0."
    )

    depends_on = "0001_events_sample_by"

    posthog_min_version = "1.30.0"
    posthog_max_version = "1.33.9"

    service_version_requirements = [
        ServiceVersionRequirement(service="clickhouse",
                                  supported_version=">=21.6.0"),
    ]

    @cached_property
    def operations(self):
        if self._events_table_engine() == "Distributed":
            # Note: this _should_ be impossible, but it's hard to guarantee.
            raise RuntimeError(
                "Cannot run the migration because the `events` table already uses the Distributed engine."
            )

        create_table_op = [
            AsyncMigrationOperation.simple_op(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"""
                CREATE TABLE IF NOT EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}' AS {EVENTS_TABLE_NAME}
                ENGINE = ReplacingMergeTree(_timestamp)
                PARTITION BY toYYYYMM(timestamp)
                ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid))
                SAMPLE BY cityHash64(distinct_id)
                """,
                rollback=
                f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            )
        ]

        old_partition_ops = []
        previous_partition = self._partitions[0] if self._partitions else None
        for partition in self._partitions[1:]:
            old_partition_ops.append(
                generate_insert_into_op(previous_partition, partition))
            previous_partition = partition

        detach_mv_ops = [
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
            ),
            AsyncMigrationOperation.simple_op(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=
                f"DETACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
                rollback=
                f"ATTACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            ),
        ]

        last_partition_op = [
            generate_insert_into_op(self._partitions[-1] if self._partitions else 0)
        ]

        def optimize_table_fn(query_id):
            default_timeout = ASYNC_MIGRATIONS_DEFAULT_TIMEOUT_SECONDS
            try:
                execute_op_clickhouse(
                    f"OPTIMIZE TABLE {EVENTS_TABLE_NAME} FINAL",
                    query_id,
                    settings={
                        "max_execution_time": default_timeout,
                        "send_timeout": default_timeout,
                        "receive_timeout": default_timeout,
                    },
                )
            except Exception:  # TODO: we should only swallow the timeout error here
                pass

        post_insert_ops = [
            AsyncMigrationOperation.simple_op(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=f"""
                    RENAME TABLE
                        {EVENTS_TABLE_NAME} to {BACKUP_TABLE_NAME},
                        {TEMPORARY_TABLE_NAME} to {EVENTS_TABLE_NAME}
                    ON CLUSTER '{CLICKHOUSE_CLUSTER}'
                """,
                rollback=f"""
                    RENAME TABLE
                        {EVENTS_TABLE_NAME} to {FAILED_EVENTS_TABLE_NAME},
                        {BACKUP_TABLE_NAME} to {EVENTS_TABLE_NAME}
                    ON CLUSTER '{CLICKHOUSE_CLUSTER}'
                """,
            ),
            AsyncMigrationOperation.simple_op(
                database=AnalyticsDBMS.CLICKHOUSE,
                sql=
                f"ATTACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
                rollback=
                f"DETACH TABLE {EVENTS_TABLE_NAME}_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            ),
            AsyncMigrationOperation(
                fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", True),
                rollback_fn=lambda _: setattr(
                    config, "COMPUTE_MATERIALIZED_COLUMNS_ENABLED", False),
            ),
            AsyncMigrationOperation(fn=optimize_table_fn),
        ]

        _operations = create_table_op + old_partition_ops + detach_mv_ops + last_partition_op + post_insert_ops
        return _operations

    def is_required(self):
        if settings.MULTI_TENANCY:
            return False

        res = sync_execute(f"SHOW CREATE TABLE {EVENTS_TABLE_NAME}")
        return (
            "ORDER BY (team_id, toDate(timestamp), event, cityHash64(distinct_id), cityHash64(uuid))"
            not in res[0][0])

    def precheck(self):
        events_failed_table_exists = sync_execute(
            f"EXISTS {FAILED_EVENTS_TABLE_NAME}")[0][0]
        if events_failed_table_exists:
            return (
                False,
                f"{FAILED_EVENTS_TABLE_NAME} already exists. We use this table as a backup if the migration fails. You can delete or rename it and restart the migration.",
            )

        events_table = "sharded_events" if CLICKHOUSE_REPLICATION else "events"
        result = sync_execute(f"""
        SELECT (free_space.size / greatest(event_table_size.size, 1)) FROM
            (SELECT 1 as jc, 'event_table_size', sum(bytes) as size FROM system.parts WHERE table = '{events_table}' AND database='{CLICKHOUSE_DATABASE}') event_table_size
        JOIN
            (SELECT 1 as jc, 'free_disk_space', free_space as size FROM system.disks WHERE name = 'default') free_space
        ON event_table_size.jc=free_space.jc
        """)
        event_size_to_free_space_ratio = result[0][0]

        # Require 1.5x the events table in free space to be available
        if event_size_to_free_space_ratio > 1.5:
            return (True, None)
        else:
            result = sync_execute(f"""
            SELECT formatReadableSize(free_space.size - (free_space.free_space - (1.5 * event_table_size.size ))) as required FROM
                (SELECT 1 as jc, 'event_table_size', sum(bytes) as size FROM system.parts WHERE table = '{events_table}' AND database='{CLICKHOUSE_DATABASE}') event_table_size
            JOIN
                (SELECT 1 as jc, 'free_disk_space', free_space, total_space as size FROM system.disks WHERE name = 'default') free_space
            ON event_table_size.jc=free_space.jc
            """)
            required_space = result[0][0]
            return (
                False,
                f"Upgrade your ClickHouse storage to at least {required_space}."
            )

    def healthcheck(self):
        result = sync_execute("SELECT free_space FROM system.disks")
        # fail if less than ~100 MB of free space remains
        if int(result[0][0]) < 100_000_000:
            return (False, "ClickHouse available storage below 100MB")

        return (True, None)

    @cached_property
    def _partitions(self):
        return sorted(
            row[0]
            for row in sync_execute(
                f"SELECT DISTINCT toUInt32(partition) FROM system.parts WHERE database = %(database)s AND table='{EVENTS_TABLE}'",
                {"database": CLICKHOUSE_DATABASE},
            )
        )

    def _events_table_engine(self) -> str:
        rows = sync_execute(
            "SELECT engine FROM system.tables WHERE database = %(database)s AND name = 'events'",
            {"database": CLICKHOUSE_DATABASE},
        )
        return rows[0][0]
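generate_insert_into_op is referenced in operations above but is not defined in this snippet. A hypothetical sketch of what it might look like, assuming it copies one toYYYYMM(timestamp) partition range into the temporary table and that simple_op's rollback argument is optional:

# Hypothetical sketch of generate_insert_into_op; the real definition is not
# shown in this snippet. Copies one toYYYYMM(timestamp) partition range from
# the old events table into the temporary table.
def generate_insert_into_op(partition_gte, partition_lt=None):
    lt_clause = f"AND toYYYYMM(timestamp) < {partition_lt}" if partition_lt else ""
    return AsyncMigrationOperation.simple_op(
        database=AnalyticsDBMS.CLICKHOUSE,
        sql=f"""
        INSERT INTO {TEMPORARY_TABLE_NAME}
        SELECT * FROM {EVENTS_TABLE_NAME}
        WHERE toYYYYMM(timestamp) >= {partition_gte} {lt_clause}
        """,
    )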
Example #3
class Migration(AsyncMigrationDefinition):

    description = "An example async migration."

    posthog_min_version = "1.29.0"
    posthog_max_version = "1.30.0"

    service_version_requirements = [
        ServiceVersionRequirement(service="clickhouse",
                                  supported_version=">=21.6.0,<21.7.0"),
    ]

    operations = [
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=PERSONS_DISTINCT_ID_TABLE_SQL().replace(
                PERSONS_DISTINCT_ID_TABLE, TEMPORARY_TABLE_NAME, 1),
            rollback=
            f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME} ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=
            f"DROP TABLE person_distinct_id_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            rollback=PERSONS_DISTINCT_ID_TABLE_MV_SQL,
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=
            f"DROP TABLE kafka_person_distinct_id ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
            rollback=KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL(),
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"""
                INSERT INTO {TEMPORARY_TABLE_NAME} (distinct_id, person_id, team_id, _sign, _timestamp, _offset)
                SELECT
                    distinct_id,
                    person_id,
                    team_id,
                    if(is_deleted==0, 1, -1) as _sign,
                    _timestamp,
                    _offset
                FROM {PERSONS_DISTINCT_ID_TABLE}
            """,
            rollback=f"DROP TABLE IF EXISTS {TEMPORARY_TABLE_NAME}",
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=f"""
                RENAME TABLE
                    {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} to {CLICKHOUSE_DATABASE}.person_distinct_id_async_migration_backup,
                    {CLICKHOUSE_DATABASE}.{TEMPORARY_TABLE_NAME} to {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE}
                ON CLUSTER '{CLICKHOUSE_CLUSTER}'
            """,
            rollback=f"""
                RENAME TABLE
                    {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} to {CLICKHOUSE_DATABASE}.{TEMPORARY_TABLE_NAME},
                    {CLICKHOUSE_DATABASE}.person_distinct_id_async_migration_backup to {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE}
                ON CLUSTER '{CLICKHOUSE_CLUSTER}'
            """,
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL(),
            rollback=
            f"DROP TABLE IF EXISTS kafka_person_distinct_id ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperation.simple_op(
            database=AnalyticsDBMS.CLICKHOUSE,
            sql=PERSONS_DISTINCT_ID_TABLE_MV_SQL,
            rollback=
            f"DROP TABLE IF EXISTS person_distinct_id_mv ON CLUSTER '{CLICKHOUSE_CLUSTER}'",
        ),
        AsyncMigrationOperation(fn=example_fn,
                                rollback_fn=example_rollback_fn),
    ]

    def healthcheck(self):
        result = sync_execute(
            "SELECT total_space, free_space FROM system.disks")
        total_space = result[0][0]
        free_space = result[0][1]
        if free_space > total_space / 3:
            return (True, None)
        else:
            return (False, "Upgrade your ClickHouse storage.")

    def progress(self, _):
        result = sync_execute(f"SELECT COUNT(1) FROM {TEMPORARY_TABLE_NAME}")
        result2 = sync_execute(
            f"SELECT COUNT(1) FROM {PERSONS_DISTINCT_ID_TABLE}")
        total_events_to_move = result2[0][0]
        total_events_moved = result[0][0]

        progress = 100 * total_events_moved / total_events_to_move
        return progress

    def is_required(self):
        res = sync_execute("SHOW CREATE TABLE person_distinct_id")
        return "ReplacingMergeTree" in res[0][0]
Example #4
    def test_ranges(self):
        v1 = ServiceVersionRequirement(service="postgresql",
                                       supported_version="==14.0.0")
        in_range, service_version = v1.is_service_in_accepted_version()
        self.assertEqual(in_range, False)
        self.assertEqual(str(service_version), "12.1.2")

        v2 = ServiceVersionRequirement(service="postgresql",
                                       supported_version="==12.1.2")
        in_range, _ = v2.is_service_in_accepted_version()
        self.assertEqual(in_range, True)

        v3 = ServiceVersionRequirement(service="postgresql",
                                       supported_version=">=12.0.0,<12.1.2")
        in_range, _ = v3.is_service_in_accepted_version()
        self.assertEqual(in_range, False)

        v4 = ServiceVersionRequirement(service="postgresql",
                                       supported_version=">=12.0.0,<=12.1.2")
        in_range, _ = v4.is_service_in_accepted_version()
        self.assertEqual(in_range, True)

        v5 = ServiceVersionRequirement(service="postgresql",
                                       supported_version=">=11.0.0,<=13.0.0")
        in_range, _ = v5.is_service_in_accepted_version()
        self.assertEqual(in_range, True)
Example #5
from posthog.settings.base_variables import DEBUG, IS_COLLECT_STATIC, TEST
from posthog.settings.utils import get_from_env, print_warning, str_to_bool
from posthog.version_requirement import ServiceVersionRequirement

SKIP_SERVICE_VERSION_REQUIREMENTS = get_from_env(
    "SKIP_SERVICE_VERSION_REQUIREMENTS", TEST or IS_COLLECT_STATIC or DEBUG, type_cast=str_to_bool
)

if SKIP_SERVICE_VERSION_REQUIREMENTS and not (TEST or DEBUG):
    print_warning(["Skipping service version requirements. This is dangerous and PostHog might not work as expected!"])

SERVICE_VERSION_REQUIREMENTS = [
    ServiceVersionRequirement(service="postgresql", supported_version=">=11.0.0,<=14.1.0",),
    ServiceVersionRequirement(service="redis", supported_version=">=5.0.0,<=6.3.0",),
    ServiceVersionRequirement(service="clickhouse", supported_version=">=21.6.0,<21.12.0"),
]
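A minimal sketch of how these requirements might be enforced at startup, using the (in_range, service_version) contract demonstrated in Example #4; this is not PostHog's actual startup code:

# Minimal sketch, not PostHog's actual startup code: warn about each service
# running outside its supported range, unless checks are skipped.
if not SKIP_SERVICE_VERSION_REQUIREMENTS:
    for requirement in SERVICE_VERSION_REQUIREMENTS:
        in_range, service_version = requirement.is_service_in_accepted_version()
        if not in_range:
            print_warning([
                f"{requirement.service} is at version {service_version}, outside "
                f"the supported range {requirement.supported_version}!"
            ])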