def test_failure_recovery_after_tick_success(external_repo_context, crash_location, crash_signal):
    """Check that a scheduler crash after run creation does not duplicate the run.

    Verify that if the scheduler crashes or is interrupted after a run is
    created, it will just re-launch the already-created run when it runs again.

    Args:
        external_repo_context: Passed to ``instance_with_schedules`` to build
            the instance and external repository used by the test.
        crash_location: Key of the debug crash flag handed to the scheduler
            subprocess (presumably names the point at which it should crash).
        crash_signal: Value paired with ``crash_location`` in the debug crash
            flags; also compared against ``get_terminate_signal()`` to decide
            which tick state to expect after the crash.
    """
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = pendulum.datetime(year=2019, month=2, day=27, hour=0, minute=0, second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule("simple_schedule")

        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # join() returns silently on timeout and leaves exitcode as None, which
            # would satisfy "!= 0" below without the scheduler ever having crashed.
            assert (
                scheduler_process.exitcode is not None
            ), "scheduler subprocess did not exit within 60 seconds"
            assert scheduler_process.exitcode != 0

            # As above there's a possible race condition here if the scheduler crashes
            # and launches the same run twice if we crash right after the launch and re-run
            # before the run actually starts
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(
                instance.get_runs()[0], initial_datetime, pendulum.datetime(2019, 2, 26)
            )

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1

            # After a terminate signal the tick is expected to still be STARTED with
            # no run recorded; otherwise it is expected to be SUCCESS with the run id.
            if crash_signal == get_terminate_signal():
                expected_status = JobTickStatus.STARTED
                expected_run_id = None
            else:
                expected_status = JobTickStatus.SUCCESS
                expected_run_id = instance.get_runs()[0].run_id

            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                expected_status,
                expected_run_id,
            )

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            # A timed-out join leaves exitcode as None, which already fails this check.
            assert scheduler_process.exitcode == 0

            # Still exactly one run and one tick: nothing was launched a second time.
            assert instance.get_runs_count() == 1
            validate_run_started(
                instance.get_runs()[0], initial_datetime, pendulum.datetime(2019, 2, 26)
            )

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def test_failure_recovery_after_tick_success(instance, external_repo, crash_location, crash_signal):
    """Check that a scheduler crash after run creation does not duplicate the run.

    A scheduler subprocess is started with debug crash flags so that it exits
    abnormally, then a second subprocess is started one minute later without
    crash flags. The test asserts that exactly one run and one tick exist after
    each step, and that the tick ends in ``TickStatus.SUCCESS`` pointing at
    that single run.

    Args:
        instance: Instance under test; its ref is handed to the subprocesses.
        external_repo: Repository providing the ``"simple_schedule"`` schedule.
        crash_location: Key of the debug crash flag handed to the scheduler
            subprocess (presumably names the point at which it should crash).
        crash_signal: Value paired with ``crash_location`` in the debug crash
            flags; also compared against ``get_terminate_signal()`` to decide
            which run ids the tick is expected to hold after the crash.
    """
    initial_datetime = create_pendulum_time(year=2019, month=2, day=27, hour=0, minute=0, second=0)
    frozen_datetime = initial_datetime.add()
    external_schedule = external_repo.get_external_schedule("simple_schedule")

    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        # join() returns silently on timeout and leaves exitcode as None, which
        # would satisfy "!= 0" below without the scheduler ever having crashed.
        assert (
            scheduler_process.exitcode is not None
        ), "scheduler subprocess did not exit within 60 seconds"
        assert scheduler_process.exitcode != 0

        # As above there's a possible race condition here if the scheduler crashes
        # and launches the same run twice if we crash right after the launch and re-run
        # before the run actually starts
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 1
        validate_run_exists(
            instance.get_runs()[0], initial_datetime, create_pendulum_time(2019, 2, 26)
        )

        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1

        # After a terminate signal the tick is expected to hold no run ids;
        # otherwise it is expected to hold the ids of the existing runs.
        if crash_signal == get_terminate_signal():
            run_ids = []
        else:
            run_ids = [run.run_id for run in instance.get_runs()]

        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.STARTED,
            run_ids,
        )

    frozen_datetime = frozen_datetime.add(minutes=1)
    with pendulum.test(frozen_datetime):
        # Running again just marks the tick as success since the run has already started
        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        # A timed-out join leaves exitcode as None, which already fails this check.
        assert scheduler_process.exitcode == 0

        # Still exactly one run and one tick: nothing was launched a second time.
        assert instance.get_runs_count() == 1
        validate_run_exists(
            instance.get_runs()[0], initial_datetime, create_pendulum_time(2019, 2, 26)
        )

        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )