def test_failure_recovery_after_tick_success(external_repo_context,
                                             crash_location, crash_signal):
    """Check that a scheduler crash after run creation yields no duplicate run.

    The scheduler is first run in a subprocess with a debug crash flag set
    for ``simple_schedule`` and is expected to exit abnormally. It is then
    run again one minute later without crash flags; afterwards there must
    still be exactly one run and one tick, and the tick must be SUCCESS.

    Args:
        external_repo_context: passed to ``instance_with_schedules`` to
            obtain the instance and external repository under test.
        crash_location: key of the debug crash flag set for the schedule.
        crash_signal: value of the debug crash flag; compared against
            ``get_terminate_signal()`` to pick the expected tick state
            right after the crash.
    """
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = pendulum.datetime(year=2019,
                                             month=2,
                                             day=27,
                                             hour=0,
                                             minute=0,
                                             second=0)
        # add() is called without an offset here, so the frozen time is
        # expected to equal initial_datetime.
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # A timed-out join leaves exitcode as None, which would satisfy
            # the `!= 0` check below without the process ever crashing.
            assert scheduler_process.exitcode is not None, (
                "scheduler subprocess did not exit within 60 seconds")
            assert scheduler_process.exitcode != 0

            # As above there's a possible race condition here if the scheduler crashes
            # and launches the same run twice if we crash right after the launch and re-run
            # before the run actually starts
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 pendulum.datetime(2019, 2, 26))

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1

            # With the terminate signal the tick is expected to still be
            # STARTED with no run recorded; otherwise it is expected to be
            # SUCCESS and to reference the single run.
            if crash_signal == get_terminate_signal():
                expected_status = JobTickStatus.STARTED
                expected_run_id = None
            else:
                expected_status = JobTickStatus.SUCCESS
                expected_run_id = instance.get_runs()[0].run_id

            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                expected_status,
                expected_run_id,
            )

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            # exitcode is None on timeout, so this also fails if the
            # subprocess is still running.
            assert scheduler_process.exitcode == 0

            # Still exactly one run: the second pass must not create another.
            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 pendulum.datetime(2019, 2, 26))

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
# Example #2
# 0
def test_failure_recovery_after_tick_success(instance, external_repo,
                                             crash_location, crash_signal):
    """Check that a scheduler crash after run creation yields no duplicate run.

    The scheduler is first run in a subprocess with a debug crash flag set
    for ``simple_schedule`` and is expected to exit abnormally, leaving one
    run and one tick in the STARTED state. It is then run again one minute
    later without crash flags; afterwards there must still be exactly one
    run and one tick, and the tick must be SUCCESS and reference that run.

    Args:
        instance: instance used to start the schedule and query runs/ticks.
        external_repo: repository providing the ``simple_schedule`` schedule.
        crash_location: key of the debug crash flag set for the schedule.
        crash_signal: value of the debug crash flag; compared against
            ``get_terminate_signal()`` to pick the run ids expected on the
            tick right after the crash.
    """
    initial_datetime = create_pendulum_time(year=2019,
                                            month=2,
                                            day=27,
                                            hour=0,
                                            minute=0,
                                            second=0)
    # add() is called without an offset here, so the frozen time is expected
    # to equal initial_datetime.
    frozen_datetime = initial_datetime.add()
    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        debug_crash_flags = {
            external_schedule.name: {
                crash_location: crash_signal
            }
        }

        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        # A timed-out join leaves exitcode as None, which would satisfy the
        # `!= 0` check below without the process ever crashing.
        assert scheduler_process.exitcode is not None, (
            "scheduler subprocess did not exit within 60 seconds")
        assert scheduler_process.exitcode != 0

        # As above there's a possible race condition here if the scheduler crashes
        # and launches the same run twice if we crash right after the launch and re-run
        # before the run actually starts
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 1
        validate_run_exists(instance.get_runs()[0], initial_datetime,
                            create_pendulum_time(2019, 2, 26))

        ticks = instance.get_ticks(external_schedule.get_external_origin_id(),
                                   external_schedule.selector_id)
        assert len(ticks) == 1

        # With the terminate signal the tick is expected to carry no run ids;
        # otherwise it is expected to list every run on the instance.
        if crash_signal == get_terminate_signal():
            run_ids = []
        else:
            run_ids = [run.run_id for run in instance.get_runs()]

        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.STARTED,
            run_ids,
        )

    frozen_datetime = frozen_datetime.add(minutes=1)
    with pendulum.test(frozen_datetime):
        # Running again just marks the tick as success since the run has already started
        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        # exitcode is None on timeout, so this also fails if the subprocess
        # is still running.
        assert scheduler_process.exitcode == 0

        # Still exactly one run: the second pass must not create another.
        assert instance.get_runs_count() == 1
        validate_run_exists(instance.get_runs()[0], initial_datetime,
                            create_pendulum_time(2019, 2, 26))

        ticks = instance.get_ticks(external_schedule.get_external_origin_id(),
                                   external_schedule.selector_id)
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )