def attempt_migration_rollback(migration_instance: AsyncMigration):
    """
    Undo a migration by running its operations' rollbacks, newest first.

    Walks backwards from the current operation down to operation 0, executing
    each one with rollback=True. The first rollback that raises is reported via
    process_error (tagged with the failing operation index) and ends the walk.
    If every rollback succeeds, the migration is updated to status RolledBack
    with progress 0.
    """
    migration_instance.refresh_from_db()
    operations = get_async_migration_definition(migration_instance.name).operations

    # A completed migration has its index set one past the final operation, so
    # clamp to the last valid position; normally we start at the current op.
    last_valid_index = len(operations) - 1
    start_index = min(migration_instance.current_operation_index, last_valid_index)

    for op_index in reversed(range(start_index + 1)):
        try:
            execute_op(operations[op_index], str(UUIDT()), rollback=True)
        except Exception as e:
            # rollback=False is passed here while we are already rolling back.
            process_error(
                migration_instance=migration_instance,
                error=f"At operation {op_index} rollback failed with error:{str(e)}",
                rollback=False,
                alert=True,
                current_operation_index=op_index,
            )
            return

    update_async_migration(
        migration_instance=migration_instance,
        status=MigrationStatus.RolledBack,
        progress=0,
    )
def run_async_migration_next_op(migration_name: str, migration_instance: Optional[AsyncMigration] = None):
    """
    Execute the one operation the running migration currently points at.

    Returns a (run_next, success) tuple:
    - (False, False): no Running migration with this name exists, or the
      operation (or the bookkeeping update after it) raised
    - (False, True): the operation index is already past the last operation,
      so the migration was completed instead
    - (True, False): the operation ran and the operation index was advanced

    Terminology:
    - migration_instance: The migration object as stored in the DB
    - migration_definition: The actual migration class outlining the operations
      (e.g. async_migrations/examples/example.py)
    """
    if migration_instance:
        # Caller supplied the row; make sure we see its latest stored state.
        migration_instance.refresh_from_db()
    else:
        try:
            migration_instance = AsyncMigration.objects.get(name=migration_name, status=MigrationStatus.Running)
        except AsyncMigration.DoesNotExist:
            return (False, False)

    assert migration_instance is not None

    migration_definition = get_async_migration_definition(migration_name)
    if migration_instance.current_operation_index >= len(migration_definition.operations):
        # Nothing left to run.
        complete_migration(migration_instance)
        return (False, True)

    query_id = str(UUIDT())
    try:
        next_op = migration_definition.operations[migration_instance.current_operation_index]
        execute_op(next_op, query_id)
        update_async_migration(
            migration_instance=migration_instance,
            current_query_id=query_id,
            current_operation_index=migration_instance.current_operation_index + 1,
        )
    except Exception as e:
        failure = f"Exception was thrown while running operation {migration_instance.current_operation_index} : {str(e)}"
        process_error(migration_instance, failure, alert=True)
        return (False, False)

    update_migration_progress(migration_instance)
    return (True, False)
def test_process_error(self, _):
    # Two process_error calls on the same migration should leave it Errored,
    # stamp finished_at, and store one AsyncMigrationError row per call.
    migration = create_async_migration()
    for message in ("some error", "second error"):
        process_error(migration, message)

    migration.refresh_from_db()
    self.assertEqual(migration.status, MigrationStatus.Errored)

    one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
    self.assertGreater(migration.finished_at, one_hour_ago)

    # Oldest first, so the rows line up with the order of the calls above.
    recorded_errors = AsyncMigrationError.objects.filter(async_migration=migration).order_by("created_at")
    self.assertEqual(recorded_errors.count(), 2)
    self.assertEqual(recorded_errors[0].description, "some error")
    self.assertEqual(recorded_errors[1].description, "second error")
def check_async_migration_health() -> None:
    """
    Inspect the migration currently marked Running and react if it looks unhealthy.

    Does nothing when no migration is Running or when its Celery task is not in
    the STARTED state. If the task is STARTED but no worker reports it as
    active, the migration is either re-triggered (when
    ASYNC_MIGRATIONS_AUTO_CONTINUE is set) or an error is recorded. Otherwise
    the migration's healthcheck is run: a failure force-stops the migration, a
    pass refreshes its progress.
    """
    from posthog.models.async_migration import AsyncMigration, MigrationStatus

    try:
        migration_instance: AsyncMigration = AsyncMigration.objects.get(status=MigrationStatus.Running)
    except AsyncMigration.DoesNotExist:
        return

    # we only care about "supposedly running" tasks here
    # failures and successes are handled elsewhere
    # pending means we haven't picked up the task yet
    # retry is not possible as max_retries == 0
    celery_state = AsyncResult(migration_instance.celery_task_id).state
    if celery_state != states.STARTED:
        return

    # active() may come back empty/None; treat that as "no active tasks".
    tasks_by_node = app.control.inspect().active() or {}
    running_task_ids = [task["id"] for node_tasks in tasks_by_node.values() for task in node_tasks]

    # the worker crashed - this is how we find out and process the error
    if migration_instance.celery_task_id not in running_task_ids:
        if config.ASYNC_MIGRATIONS_AUTO_CONTINUE:
            trigger_migration(migration_instance, fresh_start=False)
        else:
            process_error(migration_instance, "Celery worker crashed while running migration.")
        return

    healthy, failure_reason = run_migration_healthcheck(migration_instance)
    if healthy:
        update_migration_progress(migration_instance)
    else:
        force_stop_migration(migration_instance, f"Healthcheck failed with error: {failure_reason}")
def start_async_migration(migration_name: str, ignore_posthog_version=False) -> bool:
    """
    Performs some basic checks to ensure the migration can indeed run, and then kickstarts the chain of operations

    Returns whether migration was successful. False is returned without running
    anything when one of checks 1-4 fails; failures of checks 6-9 are also
    recorded via process_error with status FailedAtStartup.

    Checks, in the order they are applied:
    1. A migration named `migration_name` exists
    2. We're not over the concurrent migrations limit
    3. The migration can be run with the current PostHog version
       (skipped when `ignore_posthog_version` is truthy)
    4. The migration is not already running
    5. The migration is required given the instance configuration
       (if it is not, it is completed immediately and True is returned)
    6. The service version requirements are met (e.g. X < ClickHouse version < Y)
    7. The migration's dependency has been completed
    8. The migration's precheck passes
    9. The migration's healthcheck passes
    """
    try:
        migration_instance = AsyncMigration.objects.get(name=migration_name)
    except AsyncMigration.DoesNotExist:
        # .get() raises instead of returning None, so a falsy check on the
        # result can never detect a missing migration; handle it here.
        return False

    over_concurrent_migrations_limit = len(get_all_running_async_migrations()) >= MAX_CONCURRENT_ASYNC_MIGRATIONS
    posthog_version_valid = ignore_posthog_version or is_posthog_version_compatible(
        migration_instance.posthog_min_version, migration_instance.posthog_max_version
    )

    if (
        over_concurrent_migrations_limit
        or not posthog_version_valid
        or migration_instance.status == MigrationStatus.Running
    ):
        return False

    migration_definition = get_async_migration_definition(migration_name)

    if not migration_definition.is_required():
        # Nothing to do on this instance: mark it complete, without the email.
        complete_migration(migration_instance, email=False)
        return True

    ok, error = check_service_version_requirements(migration_definition.service_version_requirements)
    if not ok:
        process_error(migration_instance, error, status=MigrationStatus.FailedAtStartup)
        return False

    ok, error = is_migration_dependency_fulfilled(migration_instance.name)
    if not ok:
        process_error(migration_instance, error, status=MigrationStatus.FailedAtStartup)
        return False

    ok, error = run_migration_precheck(migration_instance)
    if not ok:
        process_error(
            migration_instance,
            f"Migration precheck failed with error:{error}",
            status=MigrationStatus.FailedAtStartup,
        )
        return False

    ok, error = run_migration_healthcheck(migration_instance)
    if not ok:
        process_error(
            migration_instance,
            f"Migration healthcheck failed with error:{error}",
            status=MigrationStatus.FailedAtStartup,
        )
        return False

    mark_async_migration_as_running(migration_instance)

    return run_async_migration_operations(migration_name, migration_instance)