Example #1
    def get_date_range_partitions(current_time=None):
        check.opt_inst_param(current_time, "current_time", datetime.datetime)
        tz = timezone if timezone else "UTC"
        _start = (
            to_timezone(start, tz)
            if isinstance(start, PendulumDateTime)
            else pendulum.instance(start, tz=tz)
        )

        if end:
            _end = end
        elif current_time:
            _end = current_time
        else:
            _end = pendulum.now(tz)

        # coerce to the definition timezone
        if isinstance(_end, PendulumDateTime):
            _end = to_timezone(_end, tz)
        else:
            _end = pendulum.instance(_end, tz=tz)

        period = pendulum.period(_start, _end)
        date_names = [
            Partition(value=current, name=current.strftime(fmt))
            for current in period.range(delta_range, delta_amount)
        ]

        # We don't include the last element here by default since we only want
        # fully completed intervals, and the _end time is in the middle of the interval
        # represented by the last element of date_names
        if inclusive:
            return date_names

        return date_names[:-1]
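Note: get_date_range_partitions closes over variables defined in an enclosing factory (start, end, timezone, fmt, delta_range, delta_amount, inclusive). A minimal standalone sketch of the underlying pendulum idiom, assuming pendulum 2.x; the names below are illustrative, not dagster APIs:

import pendulum

start = pendulum.datetime(2019, 1, 1, tz="UTC")
end = pendulum.datetime(2019, 1, 5, tz="UTC")

# period.range(unit, amount) yields start, start + 1 day, ..., end inclusive;
# dropping the last element keeps only fully completed intervals, as above.
names = [d.strftime("%Y-%m-%d") for d in pendulum.period(start, end).range("days", 1)]
print(names[:-1])  # ['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04']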
Example #2
def test_cursor_sensor(external_repo_context):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
        instance,
        workspace,
        external_repo,
    ):
        with pendulum.test(freeze_datetime):
            skip_sensor = external_repo.get_external_sensor("skip_cursor_sensor")
            run_sensor = external_repo.get_external_sensor("run_cursor_sensor")
            instance.start_sensor(skip_sensor)
            instance.start_sensor(run_sensor)
            evaluate_sensors(instance, workspace)

            skip_ticks = instance.get_job_ticks(skip_sensor.get_external_origin_id())
            assert len(skip_ticks) == 1
            validate_tick(
                skip_ticks[0],
                skip_sensor,
                freeze_datetime,
                JobTickStatus.SKIPPED,
            )
            assert skip_ticks[0].cursor == "1"

            run_ticks = instance.get_job_ticks(run_sensor.get_external_origin_id())
            assert len(run_ticks) == 1
            validate_tick(
                run_ticks[0],
                run_sensor,
                freeze_datetime,
                JobTickStatus.SUCCESS,
            )
            assert run_ticks[0].cursor == "1"

        freeze_datetime = freeze_datetime.add(seconds=60)
        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, workspace)

            skip_ticks = instance.get_job_ticks(skip_sensor.get_external_origin_id())
            assert len(skip_ticks) == 2
            validate_tick(
                skip_ticks[0],
                skip_sensor,
                freeze_datetime,
                JobTickStatus.SKIPPED,
            )
            assert skip_ticks[0].cursor == "2"

            run_ticks = instance.get_job_ticks(run_sensor.get_external_origin_id())
            assert len(run_ticks) == 2
            validate_tick(
                run_ticks[0],
                run_sensor,
                freeze_datetime,
                JobTickStatus.SUCCESS,
            )
            assert run_ticks[0].cursor == "2"
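All of these tests rely on pendulum.test to freeze the clock inside the with block, so evaluate_sensors observes a deterministic "now". A minimal sketch of the pattern, assuming pendulum 2.x (pendulum.test was removed in pendulum 3):

import pendulum

frozen = pendulum.datetime(2019, 2, 27, tz="UTC")
with pendulum.test(frozen):
    # every pendulum.now() call inside the block returns the frozen instant
    assert pendulum.now("UTC") == frozen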
Example #3
def test_custom_interval_sensor_with_offset(external_repo_context,
                                            monkeypatch):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"),
        "US/Central")

    sleeps = []

    def fake_sleep(s):
        sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):

            # 60 second custom interval
            external_sensor = external_repo.get_external_sensor(
                "custom_interval_sensor")

            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create a tick
            evaluate_sensors(instance, grpc_server_registry)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1

            # calling for another iteration should not generate another tick because time has not
            # advanced
            evaluate_sensors(instance, grpc_server_registry)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1

            # call the sensor_iteration_loop, which should loop, and call the monkeypatched sleep
            # to advance 30 seconds
            list(
                execute_sensor_iteration_loop(
                    instance,
                    grpc_server_registry,
                    get_default_daemon_logger("SensorDaemon"),
                    daemon_shutdown_event=None,
                    until=freeze_datetime.add(seconds=65).timestamp(),
                ))

            assert pendulum.now() == freeze_datetime.add(seconds=65)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert sum(sleeps) == 65
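The fake_sleep monkeypatch lets the daemon loop "sleep" by advancing pendulum's frozen clock instead of blocking, which is why sum(sleeps) == 65 even though no real time passes. A condensed sketch of the trick (pendulum 2.x):

import pendulum

def fake_sleep(seconds):
    # advance the mocked clock rather than blocking the process
    pendulum.set_test_now(pendulum.now().add(seconds=seconds))

with pendulum.test(pendulum.datetime(2021, 1, 1, tz="UTC")):
    before = pendulum.now()
    fake_sleep(30)
    assert (pendulum.now() - before).in_seconds() == 30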
Example #4
def test_sensor_start_stop(external_repo_context):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor(
                "always_on_sensor")
            external_origin_id = external_sensor.get_external_origin_id()
            instance.start_sensor(external_sensor)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(external_origin_id)
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            ticks = instance.get_job_ticks(external_origin_id)
            assert len(ticks) == 1
            validate_tick(ticks[0], external_sensor, freeze_datetime,
                          JobTickStatus.SUCCESS, [run.run_id])

            freeze_datetime = freeze_datetime.add(seconds=15)

        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, grpc_server_registry)
            # no new ticks, no new runs, we are below the 30 second min interval
            assert instance.get_runs_count() == 1
            ticks = instance.get_job_ticks(external_origin_id)
            assert len(ticks) == 1

            # stop / start
            instance.stop_sensor(external_origin_id)
            instance.start_sensor(external_sensor)

            evaluate_sensors(instance, grpc_server_registry)
            # no new ticks, no new runs, we are below the 30 second min interval
            assert instance.get_runs_count() == 1
            ticks = instance.get_job_ticks(external_origin_id)
            assert len(ticks) == 1

            freeze_datetime = freeze_datetime.add(seconds=16)

        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, grpc_server_registry)
            # should have new tick, new run, we are after the 30 second min interval
            assert instance.get_runs_count() == 2
            ticks = instance.get_job_ticks(external_origin_id)
            assert len(ticks) == 2
Example #5
def test_custom_interval_sensor(external_repo_context):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"),
        "US/Central")
    with instance_with_sensors(external_repo_context) as (instance,
                                                          external_repo):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor(
                "custom_interval_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            list(
                execute_sensor_iteration(
                    instance, get_default_daemon_logger("SensorDaemon")))
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(ticks[0], external_sensor, freeze_datetime,
                          JobTickStatus.SKIPPED)

            freeze_datetime = freeze_datetime.add(seconds=30)

        with pendulum.test(freeze_datetime):
            list(
                execute_sensor_iteration(
                    instance, get_default_daemon_logger("SensorDaemon")))
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            # no additional tick created after 30 seconds
            assert len(ticks) == 1

            freeze_datetime = freeze_datetime.add(seconds=30)

        with pendulum.test(freeze_datetime):
            list(
                execute_sensor_iteration(
                    instance, get_default_daemon_logger("SensorDaemon")))
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2

            expected_datetime = create_pendulum_time(year=2019,
                                                     month=2,
                                                     day=28,
                                                     hour=0,
                                                     minute=1)
            validate_tick(ticks[0], external_sensor, expected_datetime,
                          JobTickStatus.SKIPPED)
Example #6
def test_bad_load_sensor_repository(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=27,
                             hour=23,
                             minute=59,
                             second=59,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor(
                "simple_sensor")

            valid_origin = external_sensor.get_external_origin()

            # Swap out a new repository name
            invalid_repo_origin = ExternalJobOrigin(
                ExternalRepositoryOrigin(
                    valid_origin.external_repository_origin.
                    repository_location_origin,
                    "invalid_repo_name",
                ),
                valid_origin.job_name,
            )

            instance.add_job_state(
                JobState(invalid_repo_origin, JobType.SENSOR,
                         JobStatus.RUNNING))

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(invalid_repo_origin.get_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(invalid_repo_origin.get_id())
            assert len(ticks) == 0

            captured = capfd.readouterr()
            assert "Sensor daemon caught an error for sensor simple_sensor" in captured.out
            assert (
                "Could not find repository invalid_repo_name in location test_location to run sensor simple_sensor"
                in captured.out)
Example #7
    def get_schedule_range_partitions(current_time=None):
        check.opt_inst_param(current_time, "current_time", datetime.datetime)
        tz = timezone if timezone else "UTC"
        _start = (
            to_timezone(start, tz)
            if isinstance(start, PendulumDateTime)
            else pendulum.instance(start, tz=tz)
        )

        if end:
            _end = end
        elif current_time:
            _end = current_time
        else:
            _end = pendulum.now(tz)

        # coerce to the definition timezone
        if isinstance(_end, PendulumDateTime):
            _end = to_timezone(_end, tz)
        else:
            _end = pendulum.instance(_end, tz=tz)

        end_timestamp = _end.timestamp()

        partitions = []
        for next_time in schedule_execution_time_iterator(_start.timestamp(), cron_schedule, tz):

            partition_time = execution_time_to_partition_fn(next_time)

            if partition_time.timestamp() > end_timestamp:
                break

            if partition_time.timestamp() < _start.timestamp():
                continue

            partitions.append(Partition(value=partition_time, name=partition_time.strftime(fmt)))

        return partitions if inclusive else partitions[:-1]
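get_schedule_range_partitions walks candidate execution times from schedule_execution_time_iterator (defined in Example #20 below) and converts each into a partition. The cron iteration underneath is driven by croniter; a minimal sketch of that API:

import datetime
from croniter import croniter

it = croniter("0 9 * * *", datetime.datetime(2019, 1, 1))  # daily at 09:00
first = it.get_next(datetime.datetime)   # 2019-01-01 09:00:00
second = it.get_next(datetime.datetime)  # 2019-01-02 09:00:00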
Example #8
def _get_existing_run_for_request(instance, external_schedule, schedule_time, run_request):
    tags = merge_dicts(
        PipelineRun.tags_for_schedule(external_schedule),
        {
            SCHEDULED_EXECUTION_TIME_TAG: to_timezone(schedule_time, "UTC").isoformat(),
        },
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key
    runs_filter = PipelineRunsFilter(tags=tags)
    existing_runs = instance.get_runs(runs_filter)
    if not len(existing_runs):
        return None
    return existing_runs[0]
Example #9
def test_error_sensor(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=27,
                             hour=23,
                             minute=59,
                             second=59,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor("error_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.FAILURE,
                [],
                "Error occurred during the execution of evaluation_fn for sensor error_sensor",
            )

            captured = capfd.readouterr()
            assert (
                "Failed to resolve sensor for error_sensor : ") in captured.out

            assert (
                "Error occurred during the execution of evaluation_fn for sensor error_sensor"
            ) in captured.out
Example #10
def test_partitions_for_hourly_schedule_decorators_without_timezone():
    with instance_for_test() as instance:
        with pendulum.test(
            to_timezone(create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern")
        ):

            context_without_time = build_schedule_context(instance)

            start_date = datetime(year=2019, month=1, day=1)

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
            )
            def hourly_foo_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_foo_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
                relativedelta(hours=1),
            )

            execution_data = hourly_foo_schedule.get_execution_data(context_without_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=2, day=26, hour=23, tz="UTC"
                ).isoformat()
            }

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="UTC"
            )
            context_with_valid_time = build_schedule_context(instance, valid_time)

            execution_data = hourly_foo_schedule.get_execution_data(context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=1, day=27, hour=0, tz="UTC"
                ).isoformat()
            }
Example #11
def test_error_sensor_daemon(external_repo_context, monkeypatch):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"),
        "US/Central")

    sleeps = []

    def fake_sleep(s):
        sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    with instance_with_sensors(
            external_repo_context,
            overrides={
                "run_launcher": {
                    "module": "dagster.core.test_utils",
                    "class": "ExplodingRunLauncher",
                },
            },
    ) as (instance, workspace, _external_repo):

        @contextmanager
        def _gen_workspace(_instance):
            yield workspace

        with pendulum.test(freeze_datetime):
            instance.add_job_state(
                JobState(_get_unloadable_sensor_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            sensor_daemon = SensorDaemon.create_from_instance(instance)
            daemon_shutdown_event = threading.Event()
            sensor_daemon.run_loop(
                "my_uuid",
                daemon_shutdown_event,
                _gen_workspace,
                heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
                error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
                until=freeze_datetime.add(seconds=65),
            )

            heartbeats = instance.get_daemon_heartbeats()
            heartbeat = heartbeats["SENSOR"]
            assert heartbeat
            assert heartbeat.errors
            assert len(heartbeat.errors) == DAEMON_HEARTBEAT_ERROR_LIMIT
Example #12
def test_launch_failure(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=27,
                             hour=23,
                             minute=59,
                             second=59,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(
            external_repo_context,
            overrides={
                "run_launcher": {
                    "module": "dagster.core.test_utils",
                    "class": "ExplodingRunLauncher",
                },
            },
    ) as (instance, grpc_server_registry, external_repo):
        with pendulum.test(freeze_datetime):

            external_sensor = external_repo.get_external_sensor(
                "always_on_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(ticks[0], external_sensor, freeze_datetime,
                          JobTickStatus.SUCCESS, [run.run_id])

            captured = capfd.readouterr()
            assert ("Run {run_id} created successfully but failed to launch:".
                    format(run_id=run.run_id)) in captured.out

            assert "The entire purpose of this is to throw on launch" in captured.out
Example #13
    def resolve_evaluationResult(self, graphene_info):
        if self._job_state.status != JobStatus.RUNNING:
            return None

        if self._job_state.job_type != JobType.SCHEDULE:
            return None

        repository_origin = self._job_state.origin.external_repository_origin
        if not graphene_info.context.has_repository_location(
                repository_origin.repository_location_origin.location_name):
            return None

        repository_location = graphene_info.context.get_repository_location(
            repository_origin.repository_location_origin.location_name)
        if not repository_location.has_repository(
                repository_origin.repository_name):
            return None

        repository = repository_location.get_repository(
            repository_origin.repository_name)
        external_schedule = repository.get_external_schedule(
            self._job_state.name)
        timezone_str = external_schedule.execution_timezone
        if not timezone_str:
            timezone_str = "UTC"

        next_tick_datetime = next(
            external_schedule.execution_time_iterator(self._timestamp))
        schedule_time = to_timezone(pendulum.instance(next_tick_datetime),
                                    timezone_str)
        try:
            schedule_data = repository_location.get_external_schedule_execution_data(
                instance=graphene_info.context.instance,
                repository_handle=repository.handle,
                schedule_name=external_schedule.name,
                scheduled_execution_time=schedule_time,
            )
        except Exception:  # pylint: disable=broad-except
            schedule_data = serializable_error_info_from_exc_info(
                sys.exc_info())

        return GrapheneTickEvaluation(schedule_data)
Example #14
def _test_backfill_in_subprocess(instance_ref, debug_crash_flags):
    execution_datetime = to_timezone(
        create_pendulum_time(
            year=2021,
            month=2,
            day=17,
        ),
        "US/Central",
    )
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(execution_datetime), ProcessGrpcServerRegistry(
                    wait_for_processes_on_exit=True) as grpc_server_registry:
                list(
                    execute_backfill_iteration(
                        instance,
                        grpc_server_registry,
                        get_default_daemon_logger("BackfillDaemon"),
                        debug_crash_flags=debug_crash_flags,
                    ))
        finally:
            cleanup_test_instance(instance)
Example #15
def test_large_sensor(external_repo_context):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
        instance,
        workspace,
        external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor("large_sensor")
            instance.start_sensor(external_sensor)
            evaluate_sensors(instance, workspace)
            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.SUCCESS,
            )
Example #16
    def resolve_evaluationResult(self, graphene_info):
        if self._job_state.status != JobStatus.RUNNING:
            return None

        if self._job_state.job_type != JobType.SCHEDULE:
            return None

        repository_origin = self._job_state.origin.external_repository_origin
        if not graphene_info.context.has_repository_location(
                repository_origin.repository_location_origin.location_name):
            return None

        repository_location = graphene_info.context.get_repository_location(
            repository_origin.repository_location_origin.location_name)
        if not repository_location.has_repository(
                repository_origin.repository_name):
            return None

        repository = repository_location.get_repository(
            repository_origin.repository_name)
        external_schedule = repository.get_external_schedule(
            self._job_state.name)
        timezone_str = external_schedule.execution_timezone
        if not timezone_str:
            timezone_str = pendulum.now().timezone.name

        next_tick_datetime = next(
            external_schedule.execution_time_iterator(self._timestamp))
        schedule_time = to_timezone(pendulum.instance(next_tick_datetime),
                                    timezone_str)
        schedule_data = repository_location.get_external_schedule_execution_data(
            instance=graphene_info.context.instance,
            repository_handle=repository.handle,
            schedule_name=external_schedule.name,
            scheduled_execution_time=schedule_time,
        )
        return GrapheneTickEvaluation(schedule_data)
Example #17
def test_execute_during_dst_transition_spring_forward(external_repo_context):
    # Verify that a daily schedule that is supposed to execute at a time that is skipped
    # by the DST transition does not execute for that day
    with instance_with_schedules(external_repo_context) as (
            instance,
            external_repo,
    ):
        # Day before DST
        freeze_datetime = to_timezone(
            create_pendulum_time(2019, 3, 9, 0, 0, 0, tz="US/Central"),
            "US/Pacific")

        with pendulum.test(freeze_datetime):
            external_schedule = external_repo.get_external_schedule(
                "daily_dst_transition_schedule_skipped_time")
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 0

        freeze_datetime = freeze_datetime.add(days=3)

        with pendulum.test(freeze_datetime):
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            expected_datetimes_utc = [
                to_timezone(
                    create_pendulum_time(2019,
                                         3,
                                         11,
                                         2,
                                         30,
                                         0,
                                         tz="US/Central"), "UTC"),
                to_timezone(
                    create_pendulum_time(2019,
                                         3,
                                         10,
                                         3,
                                         00,
                                         0,
                                         tz="US/Central"), "UTC"),
                to_timezone(
                    create_pendulum_time(2019, 3, 9, 2, 30, 0,
                                         tz="US/Central"), "UTC"),
            ]

            expected_partition_times = [
                create_pendulum_time(2019, 3, 10, tz="US/Central"),
                create_pendulum_time(2019, 3, 9, tz="US/Central"),
                create_pendulum_time(2019, 3, 8, tz="US/Central"),
            ]

            partition_set_def = the_repo.get_partition_set_def(
                "daily_dst_transition_schedule_skipped_time_partitions")
            partition_names = partition_set_def.get_partition_names()

            assert "2019-03-08" in partition_names
            assert "2019-03-09" in partition_names
            assert "2019-03-10" in partition_names

            for i in range(3):
                validate_tick(
                    ticks[i],
                    external_schedule,
                    expected_datetimes_utc[i],
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[i].run_id],
                )

                validate_run_started(
                    instance.get_runs()[i],
                    expected_datetimes_utc[i],
                    partition_time=expected_partition_times[i],
                )

            # Verify idempotence
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))
            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
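The skipped execution verified above follows from how pendulum resolves nonexistent wall times: 2:30 AM on 2019-03-10 does not exist in US/Central (clocks jump from 2:00 to 3:00), and pendulum normalizes such times forward past the gap. A one-line sketch, assuming pendulum 2.x defaults:

import pendulum

d = pendulum.datetime(2019, 3, 10, 2, 30, tz="US/Central")
print(d)  # shifted past the gap, e.g. 2019-03-10T03:30:00-05:00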
Example #18
def test_execute_during_dst_transition_fall_back(external_repo_context):
    with instance_with_schedules(external_repo_context) as (
            instance,
            external_repo,
    ):
        # A schedule that runs daily during a time that occurs twice during a fall DST transition
        # only executes once for that day
        freeze_datetime = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"),
            "US/Pacific")

        with pendulum.test(freeze_datetime):
            external_schedule = external_repo.get_external_schedule(
                "daily_dst_transition_schedule_doubled_time")
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 0

        freeze_datetime = freeze_datetime.add(days=3)

        with pendulum.test(freeze_datetime):
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            expected_datetimes_utc = [
                create_pendulum_time(2019, 11, 4, 7, 30, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 3, 7, 30, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 2, 6, 30, 0, tz="UTC"),
            ]

            expected_partition_times = [
                create_pendulum_time(2019, 11, 3, tz="US/Central"),
                create_pendulum_time(2019, 11, 2, tz="US/Central"),
                create_pendulum_time(2019, 11, 1, tz="US/Central"),
            ]

            for i in range(3):
                validate_tick(
                    ticks[i],
                    external_schedule,
                    expected_datetimes_utc[i],
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[i].run_id],
                )

                validate_run_started(
                    instance.get_runs()[i],
                    expected_datetimes_utc[i],
                    partition_time=expected_partition_times[i],
                )

            # Verify idempotence
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))
            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
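Conversely, during fall-back the 1:00-2:00 AM hour occurs twice, and the expected UTC times above reflect the offset changing from CDT (UTC-5) to CST (UTC-6) while the 1:30 AM Central execution time stays fixed. A small sketch (pendulum 2.x):

import pendulum

before = pendulum.datetime(2019, 11, 2, 1, 30, tz="US/Central")
after = pendulum.datetime(2019, 11, 4, 1, 30, tz="US/Central")
assert before.utcoffset().total_seconds() == -5 * 3600  # CDT, pre-transition
assert after.utcoffset().total_seconds() == -6 * 3600   # CST, post-transition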
Example #19
def test_simple_sensor(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=27,
                             hour=23,
                             minute=59,
                             second=59,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor(
                "simple_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.SKIPPED,
            )

            captured = capfd.readouterr()
            assert (
                captured.out ==
                """2019-02-27 17:59:59 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 17:59:59 - SensorDaemon - INFO - Sensor returned false for simple_sensor, skipping
""")

            freeze_datetime = freeze_datetime.add(seconds=30)

        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, grpc_server_registry)
            wait_for_all_runs_to_start(instance)
            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            validate_run_started(run)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2

            expected_datetime = create_pendulum_time(year=2019,
                                                     month=2,
                                                     day=28,
                                                     hour=0,
                                                     minute=0,
                                                     second=29)
            validate_tick(
                ticks[0],
                external_sensor,
                expected_datetime,
                JobTickStatus.SUCCESS,
                [run.run_id],
            )

            captured = capfd.readouterr()
            assert (
                captured.out ==
                """2019-02-27 18:00:29 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 18:00:29 - SensorDaemon - INFO - Launching run for simple_sensor
2019-02-27 18:00:29 - SensorDaemon - INFO - Completed launch of run {run_id} for simple_sensor
""".format(run_id=run.run_id))
Example #20
def schedule_execution_time_iterator(start_timestamp, cron_schedule,
                                     execution_timezone):
    check.float_param(start_timestamp, "start_timestamp")
    check.str_param(cron_schedule, "cron_schedule")
    check.opt_str_param(execution_timezone, "execution_timezone")
    timezone_str = execution_timezone if execution_timezone else "UTC"

    start_datetime = pendulum.from_timestamp(start_timestamp, tz=timezone_str)

    date_iter = croniter(cron_schedule, start_datetime)

    # Go back one iteration so that the next iteration is the first time that is >= start_datetime
    # and matches the cron schedule
    next_date = to_timezone(
        pendulum.instance(date_iter.get_prev(datetime.datetime)), timezone_str)

    cron_parts = cron_schedule.split(" ")

    check.invariant(len(cron_parts) == 5)

    is_numeric = [part.isnumeric() for part in cron_parts]

    delta_fn = None

    # Special-case common intervals (hourly/daily/weekly/monthly) since croniter iteration can be
    # much slower than adding a fixed interval
    if cron_schedule.endswith(" * *") and all(is_numeric[0:3]):  # monthly
        delta_fn = lambda d, num: d.add(months=num)
        should_hour_change = False
    elif (all(is_numeric[0:2]) and is_numeric[4] and cron_parts[2] == "*"
          and cron_parts[3] == "*"):  # weekly
        delta_fn = lambda d, num: d.add(weeks=num)
        should_hour_change = False
    elif all(is_numeric[0:2]) and cron_schedule.endswith(" * * *"):  # daily
        delta_fn = lambda d, num: d.add(days=num)
        should_hour_change = False
    elif is_numeric[0] and cron_schedule.endswith(" * * * *"):  # hourly
        delta_fn = lambda d, num: d.add(hours=num)
        should_hour_change = True

    while True:
        if delta_fn:
            curr_hour = next_date.hour

            next_date_cand = delta_fn(next_date, 1)
            new_hour = next_date_cand.hour

            if not should_hour_change and new_hour != curr_hour:
                # If the hour changes during a daily/weekly/monthly schedule, it
                # indicates that the time shifted due to falling in a time that doesn't
                # exist due to a DST transition (for example, 2:30AM CST on 3/10/2019).
                # Instead, execute at the first time that does exist (the start of the hour),
                # but return to the original hour for all subsequent executions so that the
                # hour doesn't stay different permanently.

                check.invariant(new_hour == curr_hour + 1)
                yield next_date_cand.replace(minute=0)

                next_date_cand = delta_fn(next_date, 2)
                check.invariant(next_date_cand.hour == curr_hour)

            next_date = next_date_cand
        else:
            next_date = to_timezone(
                pendulum.instance(date_iter.get_next(datetime.datetime)),
                timezone_str)

        yield next_date
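A usage sketch for the iterator above: given a start timestamp and a cron string, it yields successive timezone-aware execution times (this exercises the special-cased daily fast path):

import itertools

import pendulum

it = schedule_execution_time_iterator(
    pendulum.datetime(2019, 1, 1, tz="US/Central").timestamp(),
    "0 2 * * *",  # daily at 2:00 AM
    "US/Central",
)
first_three = list(itertools.islice(it, 3))  # Jan 1, 2, 3 at 2:00 AM Central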
Example #21
def test_launch_once(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
            tz="UTC",
        ),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):

            external_sensor = external_repo.get_external_sensor(
                "run_key_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.SUCCESS,
                expected_run_ids=[run.run_id],
            )

        # run again (after 30 seconds), to ensure that the run key maintains idempotence
        freeze_datetime = freeze_datetime.add(seconds=30)
        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, grpc_server_registry)
            assert instance.get_runs_count() == 1
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.SKIPPED,
            )
            captured = capfd.readouterr()
            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out)

            launched_run = instance.get_runs()[0]

            # Manually create a new run with the same tags
            execute_pipeline(
                the_pipeline,
                run_config=launched_run.run_config,
                tags=launched_run.tags,
                instance=instance,
            )

            # Sensor loop still executes
        freeze_datetime = freeze_datetime.add(seconds=30)
        with pendulum.test(freeze_datetime):
            evaluate_sensors(instance, grpc_server_registry)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())

            assert len(ticks) == 3
            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.SKIPPED,
            )
Example #22
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal,
                                             capfd):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = to_timezone(
            create_pendulum_time(year=2019,
                                 month=2,
                                 day=27,
                                 hour=0,
                                 minute=0,
                                 second=0,
                                 tz="UTC"),
            "US/Central",
        )

        frozen_datetime = initial_datetime.add()  # .add() with no arguments returns a copy

        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:00:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000
""")

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 0

        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=initial_datetime,
                partition_time=create_pendulum_time(2019, 2, 26),
            )

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                [instance.get_runs()[0].run_id],
            )
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:05:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))
Example #23
def test_daily_dst_fall_back(external_repo_context):
    # Verify that a daily schedule still runs once per day during the fall DST transition
    with instance_with_schedules(external_repo_context) as (
            instance,
            external_repo,
    ):
        # Night before DST
        freeze_datetime = to_timezone(
            create_pendulum_time(2019, 11, 3, 0, 0, 0, tz="US/Central"),
            "US/Pacific")

        with pendulum.test(freeze_datetime):
            external_schedule = external_repo.get_external_schedule(
                "daily_central_time_schedule")
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 0

        freeze_datetime = freeze_datetime.add(days=2)

        with pendulum.test(freeze_datetime):
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            # UTC time changed by one hour after the transition, still running daily at the same
            # time in CT
            expected_datetimes_utc = [
                create_pendulum_time(2019, 11, 5, 6, 0, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 4, 6, 0, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 3, 5, 0, 0, tz="UTC"),
            ]

            expected_partition_times = [
                create_pendulum_time(2019, 11, 4, tz="US/Central"),
                create_pendulum_time(2019, 11, 3, tz="US/Central"),
                create_pendulum_time(2019, 11, 2, tz="US/Central"),
            ]

            for i in range(3):
                validate_tick(
                    ticks[i],
                    external_schedule,
                    expected_datetimes_utc[i],
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[i].run_id],
                )

                validate_run_started(
                    instance.get_runs()[i],
                    expected_datetimes_utc[i],
                    partition_time=expected_partition_times[i],
                )

            # Verify idempotence
            list(
                launch_scheduled_runs(
                    instance,
                    logger(),
                    pendulum.now("UTC"),
                ))
            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
Example #24
def test_partitions_for_monthly_schedule_decorators_without_timezone():
    with instance_for_test() as instance:
        with pendulum.test(
                to_timezone(
                    create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"),
                    "US/Eastern")):
            context_without_time = ScheduleExecutionContext(
                instance.get_ref(), None)

            start_date = datetime(year=2019, month=1, day=1)

            @monthly_schedule(
                pipeline_name="foo_pipeline",
                execution_day_of_month=3,
                start_date=start_date,
                execution_time=time(9, 30),
            )
            def monthly_foo_schedule(monthly_time):
                return {"monthly_time": monthly_time.isoformat()}

            valid_monthly_time = create_pendulum_time(year=2019,
                                                      month=2,
                                                      day=3,
                                                      hour=9,
                                                      minute=30,
                                                      tz="UTC")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_monthly_time)

            execution_data = monthly_foo_schedule.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "monthly_time":
                create_pendulum_time(year=2019, month=1, day=1,
                                     tz="UTC").isoformat()
            }

            execution_data = monthly_foo_schedule.get_execution_data(
                context_without_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "monthly_time":
                create_pendulum_time(year=2019, month=1, day=1,
                                     tz="UTC").isoformat()
            }

            _check_partitions(
                monthly_foo_schedule,
                1,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_MONTHLY_FORMAT,
                relativedelta(months=1),
            )

            # test partition_months_offset=0

            @monthly_schedule(
                pipeline_name="foo_pipeline",
                execution_day_of_month=3,
                start_date=start_date,
                execution_time=time(9, 30),
                partition_months_offset=0,
            )
            def monthly_foo_schedule_same_month(monthly_time):
                return {"monthly_time": monthly_time.isoformat()}

            valid_monthly_time = create_pendulum_time(year=2019,
                                                      month=2,
                                                      day=3,
                                                      hour=9,
                                                      minute=30,
                                                      tz="UTC")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_monthly_time)

            execution_data = monthly_foo_schedule_same_month.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "monthly_time":
                create_pendulum_time(year=2019, month=2, day=1,
                                     tz="UTC").isoformat()
            }
Example #25
def test_partitions_for_hourly_schedule_decorators_without_timezone():
    with instance_for_test() as instance:
        with pendulum.test(
                to_timezone(
                    create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"),
                    "US/Eastern")):

            context_without_time = ScheduleExecutionContext(
                instance.get_ref(), None)

            start_date = datetime(year=2019, month=1, day=1)

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
            )
            def hourly_foo_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_foo_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
                relativedelta(hours=1),
            )

            execution_data = hourly_foo_schedule.get_execution_data(
                context_without_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time":
                create_pendulum_time(year=2019,
                                     month=2,
                                     day=26,
                                     hour=23,
                                     tz="UTC").isoformat()
            }

            # time that's invalid since it corresponds to a partition before the start date
            # should not execute and should yield a SkipReason if it tries to generate run config
            execution_time_with_invalid_partition = create_pendulum_time(
                year=2018, month=12, day=30, hour=3, minute=25, tz="UTC")
            context_with_invalid_time = ScheduleExecutionContext(
                instance.get_ref(), execution_time_with_invalid_partition)

            execution_data = hourly_foo_schedule.get_execution_data(
                context_with_invalid_time)

            assert len(execution_data) == 1
            skip_data = execution_data[0]
            assert isinstance(skip_data, SkipReason)
            assert (
                "Partition selector did not return a partition. "
                "Make sure that the timezone on your partition set matches your execution timezone."
                in skip_data.skip_message)

            valid_time = create_pendulum_time(year=2019,
                                              month=1,
                                              day=27,
                                              hour=1,
                                              minute=25,
                                              tz="UTC")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time)

            execution_data = hourly_foo_schedule.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time":
                create_pendulum_time(year=2019,
                                     month=1,
                                     day=27,
                                     hour=0,
                                     tz="UTC").isoformat()
            }
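
Example #25 leans on `pendulum.test` to pin the clock, which is what makes the partition count (`HOURS_UNTIL_FEBRUARY_27`) deterministic. A minimal sketch of the freezing behavior, assuming pendulum 2.x (pendulum 3 later replaced this API with `pendulum.travel_to`):

import pendulum

frozen = pendulum.datetime(2019, 2, 27, 0, 1, 1, tz="UTC")
with pendulum.test(frozen):
    # Inside the block, pendulum.now() is pinned to the frozen instant,
    # so anything derived from "now" is reproducible.
    assert pendulum.now("UTC") == frozen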
Example #26
def test_wrong_config_sensor(external_repo_context, capfd):
    freeze_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
        ),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor(
                "wrong_config_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            evaluate_sensors(instance, grpc_server_registry)
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1

            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.FAILURE,
                [],
                "Error in config for pipeline the_pipeline",
            )

            captured = capfd.readouterr()
            assert (
                "Error in config for pipeline the_pipeline") in captured.out

            # Error repeats on subsequent ticks

            evaluate_sensors(instance, grpc_server_registry)
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2

            validate_tick(
                ticks[0],
                external_sensor,
                freeze_datetime,
                JobTickStatus.FAILURE,
                [],
                "Error in config for pipeline the_pipeline",
            )

            captured = capfd.readouterr()
            assert (
                "Error in config for pipeline the_pipeline") in captured.out
Example #27
def test_failure_after_run_launched(external_repo_context, crash_location,
                                    crash_signal, capfd):
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=28,
            hour=0,
            minute=0,
            second=0,
            tz="UTC",
        ),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "run_key_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create and launch a run, then crash the daemon before the tick completes
            debug_crash_flags = {
                external_sensor.name: {
                    crash_location: crash_signal
                }
            }
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 1

            run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"
            capfd.readouterr()

            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=1), None
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()

            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out)

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SKIPPED
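
The crash choreography above needs nothing Dagster-specific: run one daemon iteration in a child process, make it die, and assert on the exit code from the parent. A stripped-down sketch of the pattern using only the standard library (names are illustrative, not from the test suite):

import multiprocessing
import sys


def _iteration_that_crashes():
    # Stand-in for a sensor-daemon iteration that hits a debug crash flag.
    sys.exit(1)


if __name__ == "__main__":
    proc = multiprocessing.Process(target=_iteration_that_crashes)
    proc.start()
    proc.join(timeout=60)
    # A non-zero exitcode shows the child died mid-iteration, mirroring the
    # `assert launch_process.exitcode != 0` checks in the test.
    assert proc.exitcode != 0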
Example #28
def test_failure_before_run_created(external_repo_context, crash_location,
                                    crash_signal, capfd):
    frozen_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=28,
                             hour=0,
                             minute=0,
                             second=1,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "simple_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create a tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.SKIPPED
            captured = capfd.readouterr()

            # start a new tick, but crash before any run is created
            debug_crash_flags = {
                external_sensor.name: {
                    crash_location: crash_signal
                }
            }
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=31), debug_crash_flags
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            captured = capfd.readouterr()

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.STARTED
            assert not int(
                ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=62), None
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                f"""2019-02-27 18:01:03 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Launching run for simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor
""")

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == JobTickStatus.SUCCESS
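
The parity assertion above (`not int(ticks[0].timestamp) % 2`) implies that `simple_sensor` gates its runs on whether the evaluation time lands on an even or odd epoch second. A hypothetical sensor with that shape, not copied from the suite, assuming the dagster 0.10-era sensor API:

import pendulum
from dagster import RunRequest, SkipReason, sensor


@sensor(pipeline_name="the_pipeline")
def parity_gated_sensor(_context):
    # Skip when the (frozen) evaluation time falls on an even epoch second;
    # the test's timestamp-parity assertion checks exactly this condition.
    if int(pendulum.now("UTC").timestamp()) % 2 == 0:
        yield SkipReason()
    else:
        yield RunRequest(run_key=None, run_config={})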
Example #29
def test_partitions_for_hourly_schedule_decorators_with_timezone():
    with instance_for_test() as instance:
        with pendulum.test(
                create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="US/Central")):
            start_date = datetime(year=2019, month=1, day=1)

            # You can specify a start date with no timezone and it will be assumed to be
            # in the execution timezone

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
            )
            def hourly_central_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            assert hourly_central_schedule.execution_timezone == "US/Central"

            _check_partitions(
                hourly_central_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="US/Central"),
                DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
                relativedelta(hours=1),
            )

            valid_time = create_pendulum_time(year=2019,
                                              month=1,
                                              day=27,
                                              hour=1,
                                              minute=25,
                                              tz="US/Central")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time)

            execution_data = hourly_central_schedule.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time":
                create_pendulum_time(year=2019,
                                     month=1,
                                     day=27,
                                     hour=0,
                                     tz="US/Central").isoformat()
            }

            # You can specify a start date in a different timezone and it will be transformed into the
            # execution timezone
            start_date_with_different_timezone = create_pendulum_time(
                2019, 1, 1, 0, tz="US/Pacific")

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
            )
            def hourly_central_schedule_with_timezone_start_time(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_central_schedule_with_timezone_start_time,
                HOURS_UNTIL_FEBRUARY_27 -
                2,  # start date is two hours later since it's in PT
                to_timezone(start_date_with_different_timezone, "US/Central"),
                DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
                relativedelta(hours=1),
            )

            # test partition_hours_offset=0

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
                partition_hours_offset=0,
            )
            def hourly_schedule_for_current_hour(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            valid_time = create_pendulum_time(year=2019,
                                              month=1,
                                              day=27,
                                              hour=1,
                                              minute=25,
                                              tz="US/Central")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time)

            execution_data = hourly_schedule_for_current_hour.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time":
                create_pendulum_time(year=2019,
                                     month=1,
                                     day=27,
                                     hour=1,
                                     tz="US/Central").isoformat()
            }

            # test partition_hours_offset=2

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
                partition_hours_offset=2,
            )
            def hourly_schedule_for_two_hours_ago(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            valid_time = create_pendulum_time(year=2019,
                                              month=1,
                                              day=27,
                                              hour=1,
                                              minute=25,
                                              tz="US/Central")
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time)

            execution_data = hourly_schedule_for_two_hours_ago.get_execution_data(
                context_with_valid_time)
            assert len(execution_data) == 1
            assert isinstance(execution_data[0], RunRequest)
            assert execution_data[0].run_config == {
                "hourly_time":
                create_pendulum_time(year=2019,
                                     month=1,
                                     day=26,
                                     hour=23,
                                     tz="US/Central").isoformat()
            }
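
The `HOURS_UNTIL_FEBRUARY_27 - 2` adjustment in Example #29 is plain timezone arithmetic: midnight US/Pacific on January 1 is 2:00 AM US/Central, so a Pacific start date yields two fewer hourly partitions. A quick check with pendulum, assuming pendulum 2.x:

import pendulum

start_pacific = pendulum.datetime(2019, 1, 1, 0, tz="US/Pacific")
# The same instant rendered on a Central clock is two hours later, trimming
# two hourly partitions off the front of the partition set.
assert start_pacific.in_timezone("US/Central").hour == 2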
Example #30
def _create_scheduler_run(
    instance,
    logger,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_errors = []
    execution_plan_snapshot = None

    try:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
            known_state=None,
        )
        execution_plan_snapshot = external_execution_plan.execution_plan_snapshot
    except DagsterSubprocessError as e:
        execution_plan_errors.extend(e.subprocess_error_infos)
    except Exception:  # pylint: disable=broad-except
        execution_plan_errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(schedule_time, "UTC").isoformat()
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    # If the run was scheduled correctly but there was an error building its
    # execution plan, enter it into the run DB with a FAILURE status
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=(
            PipelineRunStatus.FAILURE
            if len(execution_plan_errors) > 0
            else PipelineRunStatus.NOT_STARTED
        ),
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if len(execution_plan_errors) > 0:
        for error in execution_plan_errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        error_string = "\n".join([error.to_string() for error in execution_plan_errors])
        logger.error(f"Failed to fetch execution plan for {external_schedule.name}: {error_string}")
    return possibly_invalid_pipeline_run
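
One detail worth noting in `_create_scheduler_run`: `merge_dicts(pipeline_tags, schedule_tags)` lets the schedule's tags win on key collisions, the same later-wins behavior as dict unpacking. A minimal sketch with illustrative values:

# merge_dicts(a, b) keeps b's value on collisions, equivalent to {**a, **b}.
pipeline_tags = {"team": "data", "priority": "low"}
schedule_tags = {"priority": "high"}

tags = {**pipeline_tags, **schedule_tags}
assert tags == {"team": "data", "priority": "high"}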