def get_date_range_partitions(current_time=None):
    """Return the partitions between the configured start and the effective end time.

    This function reads ``start``, ``end``, ``timezone``, ``fmt``, ``delta_range``,
    ``delta_amount`` and ``inclusive`` from an enclosing scope that is not visible here.

    Args:
        current_time: Optional ``datetime.datetime`` used as the end of the range when
            no ``end`` is configured. When both are missing, ``pendulum.now`` is used.

    Returns:
        A list of ``Partition`` objects, one per step of the period. Unless ``inclusive``
        is truthy the last element is dropped.
    """
    check.opt_inst_param(current_time, "current_time", datetime.datetime)

    tz = timezone or "UTC"

    def _in_definition_timezone(value):
        # Pendulum datetimes are converted; plain datetimes are wrapped in the timezone.
        if isinstance(value, PendulumDateTime):
            return to_timezone(value, tz)
        return pendulum.instance(value, tz=tz)

    range_start = _in_definition_timezone(start)

    # Precedence: explicit end, then the supplied current time, then "now".
    range_end = _in_definition_timezone(end or current_time or pendulum.now(tz))

    steps = pendulum.period(range_start, range_end).range(delta_range, delta_amount)
    partitions = [Partition(value=step, name=step.strftime(fmt)) for step in steps]

    # The end time falls in the middle of the interval represented by the last
    # element, so that element is only kept when the caller asked for it.
    if not inclusive:
        return partitions[:-1]
    return partitions
def test_cursor_sensor(external_repo_context):
    """Both cursor sensors record cursor "1" after one evaluation and "2" after the next."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (instance, workspace, external_repo):

        def _check_latest_tick(sensor, expected_time, expected_status, tick_count, cursor):
            # The most recent tick is asserted at index 0.
            recorded = instance.get_job_ticks(sensor.get_external_origin_id())
            assert len(recorded) == tick_count
            validate_tick(recorded[0], sensor, expected_time, expected_status)
            assert recorded[0].cursor == cursor

        with pendulum.test(frozen_now):
            skip_sensor = external_repo.get_external_sensor("skip_cursor_sensor")
            run_sensor = external_repo.get_external_sensor("run_cursor_sensor")
            instance.start_sensor(skip_sensor)
            instance.start_sensor(run_sensor)

            evaluate_sensors(instance, workspace)

            _check_latest_tick(skip_sensor, frozen_now, JobTickStatus.SKIPPED, 1, "1")
            _check_latest_tick(run_sensor, frozen_now, JobTickStatus.SUCCESS, 1, "1")

        frozen_now = frozen_now.add(seconds=60)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, workspace)

            _check_latest_tick(skip_sensor, frozen_now, JobTickStatus.SKIPPED, 2, "2")
            _check_latest_tick(run_sensor, frozen_now, JobTickStatus.SUCCESS, 2, "2")
def test_custom_interval_sensor_with_offset(external_repo_context, monkeypatch):
    """Drive the sensor iteration loop with a patched ``time.sleep`` that advances the frozen clock."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"), "US/Central"
    )

    sleeps = []

    def fake_sleep(s):
        # Record the requested sleep and move pendulum's test clock forward instead of waiting.
        sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_now):
            # 60 second custom interval
            external_sensor = external_repo.get_external_sensor("custom_interval_sensor")

            def _tick_count():
                return len(instance.get_job_ticks(external_sensor.get_external_origin_id()))

            running_state = JobState(
                external_sensor.get_external_origin(), JobType.SENSOR, JobStatus.RUNNING
            )
            instance.add_job_state(running_state)

            # create a tick
            evaluate_sensors(instance, grpc_server_registry)
            assert _tick_count() == 1

            # a second evaluation must not add a tick because time has not advanced
            evaluate_sensors(instance, grpc_server_registry)
            assert _tick_count() == 1

            # run the iteration loop; it sleeps through the monkeypatched sleep,
            # which advances the frozen clock
            loop = execute_sensor_iteration_loop(
                instance,
                grpc_server_registry,
                get_default_daemon_logger("SensorDaemon"),
                daemon_shutdown_event=None,
                until=frozen_now.add(seconds=65).timestamp(),
            )
            list(loop)

            assert pendulum.now() == frozen_now.add(seconds=65)
            assert _tick_count() == 2
            assert sum(sleeps) == 65
def test_sensor_start_stop(external_repo_context):
    """Stopping and restarting a sensor inside its minimum interval must not create a new tick or run."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):

        def _assert_still_one_run_and_tick(origin_id):
            assert instance.get_runs_count() == 1
            assert len(instance.get_job_ticks(origin_id)) == 1

        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("always_on_sensor")
            external_origin_id = external_sensor.get_external_origin_id()
            instance.start_sensor(external_sensor)

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(external_origin_id)) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 1
            first_run = instance.get_runs()[0]
            recorded = instance.get_job_ticks(external_origin_id)
            assert len(recorded) == 1
            validate_tick(
                recorded[0],
                external_sensor,
                frozen_now,
                JobTickStatus.SUCCESS,
                [first_run.run_id],
            )

        frozen_now = frozen_now.add(seconds=15)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, grpc_server_registry)
            # no new ticks, no new runs, we are below the 30 second min interval
            _assert_still_one_run_and_tick(external_origin_id)

            # stop / start
            instance.stop_sensor(external_origin_id)
            instance.start_sensor(external_sensor)

            evaluate_sensors(instance, grpc_server_registry)
            # no new ticks, no new runs, we are below the 30 second min interval
            _assert_still_one_run_and_tick(external_origin_id)

        frozen_now = frozen_now.add(seconds=16)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, grpc_server_registry)
            # should have new tick, new run, we are after the 30 second min interval
            assert instance.get_runs_count() == 2
            assert len(instance.get_job_ticks(external_origin_id)) == 2
def test_custom_interval_sensor(external_repo_context):
    """The custom-interval sensor ticks at start, not 30 seconds later, and again after 60 seconds."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"), "US/Central"
    )

    with instance_with_sensors(external_repo_context) as (instance, external_repo):

        def _run_iteration():
            # Drain the generator so the whole iteration executes.
            list(execute_sensor_iteration(instance, get_default_daemon_logger("SensorDaemon")))

        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("custom_interval_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 0

            _run_iteration()

            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(recorded[0], external_sensor, frozen_now, JobTickStatus.SKIPPED)

        frozen_now = frozen_now.add(seconds=30)

        with pendulum.test(frozen_now):
            _run_iteration()
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            # no additional tick created after 30 seconds
            assert len(recorded) == 1

        frozen_now = frozen_now.add(seconds=30)

        with pendulum.test(frozen_now):
            _run_iteration()
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 2

            expected_datetime = create_pendulum_time(
                year=2019, month=2, day=28, hour=0, minute=1
            )
            validate_tick(recorded[0], external_sensor, expected_datetime, JobTickStatus.SKIPPED)
def test_bad_load_sensor_repository(external_repo_context, capfd):
    """A sensor whose origin names an unknown repository creates no ticks or runs and logs an error."""
    frozen_now = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=27, hour=23, minute=59, second=59, tz="UTC"
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            valid_origin = external_sensor.get_external_origin()

            # Swap out a new repository name
            location_origin = (
                valid_origin.external_repository_origin.repository_location_origin
            )
            bad_repository_origin = ExternalRepositoryOrigin(
                location_origin,
                "invalid_repo_name",
            )
            invalid_repo_origin = ExternalJobOrigin(bad_repository_origin, valid_origin.job_name)

            instance.add_job_state(
                JobState(invalid_repo_origin, JobType.SENSOR, JobStatus.RUNNING)
            )

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(invalid_repo_origin.get_id())) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(invalid_repo_origin.get_id())) == 0

            output = capfd.readouterr().out
            assert "Sensor daemon caught an error for sensor simple_sensor" in output
            assert (
                "Could not find repository invalid_repo_name in location test_location to run sensor simple_sensor"
                in output
            )
def get_schedule_range_partitions(current_time=None):
    """Return the partitions produced by the cron schedule between start and the effective end.

    This function reads ``start``, ``end``, ``timezone``, ``fmt``, ``cron_schedule``,
    ``execution_time_to_partition_fn`` and ``inclusive`` from an enclosing scope that is
    not visible here.

    Args:
        current_time: Optional ``datetime.datetime`` used as the end of the range when
            no ``end`` is configured. When both are missing, ``pendulum.now`` is used.

    Returns:
        A list of ``Partition`` objects whose partition time lies within
        ``[start, end]``. Unless ``inclusive`` is truthy the last element is dropped.
    """
    check.opt_inst_param(current_time, "current_time", datetime.datetime)

    tz = timezone or "UTC"

    def _in_definition_timezone(value):
        # Pendulum datetimes are converted; plain datetimes are wrapped in the timezone.
        if isinstance(value, PendulumDateTime):
            return to_timezone(value, tz)
        return pendulum.instance(value, tz=tz)

    range_start = _in_definition_timezone(start)

    # Precedence: explicit end, then the supplied current time, then "now".
    range_end = _in_definition_timezone(end or current_time or pendulum.now(tz))

    start_timestamp = range_start.timestamp()
    end_timestamp = range_end.timestamp()

    collected = []
    for execution_time in schedule_execution_time_iterator(start_timestamp, cron_schedule, tz):
        partition_time = execution_time_to_partition_fn(execution_time)
        partition_timestamp = partition_time.timestamp()

        if partition_timestamp > end_timestamp:
            break
        if partition_timestamp < start_timestamp:
            # Partition falls before the configured start; skip it.
            continue

        collected.append(Partition(value=partition_time, name=partition_time.strftime(fmt)))

    if inclusive:
        return collected
    return collected[:-1]
def _get_existing_run_for_request(instance, external_schedule, schedule_time, run_request):
    """Look up a run already created for this schedule tick, if any.

    Runs are matched by tag: the schedule's tags, the scheduled execution time
    (ISO formatted in UTC), and the run key when the request has one.

    Returns:
        The first matching run from ``instance.get_runs``, or ``None`` when there is none.
    """
    execution_time_tag = {
        SCHEDULED_EXECUTION_TIME_TAG: to_timezone(schedule_time, "UTC").isoformat(),
    }
    tags = merge_dicts(PipelineRun.tags_for_schedule(external_schedule), execution_time_tag)

    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    matching_runs = instance.get_runs(PipelineRunsFilter(tags=tags))
    if matching_runs:
        return matching_runs[0]
    return None
def test_error_sensor(external_repo_context, capfd):
    """A sensor whose evaluation function raises records a FAILURE tick and logs the error."""
    frozen_now = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=27, hour=23, minute=59, second=59, tz="UTC"
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("error_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(
                recorded[0],
                external_sensor,
                frozen_now,
                JobTickStatus.FAILURE,
                [],
                "Error occurred during the execution of evaluation_fn for sensor error_sensor",
            )

            output = capfd.readouterr().out
            assert "Failed to resolve sensor for error_sensor : " in output
            assert (
                "Error occurred during the execution of evaluation_fn for sensor error_sensor"
                in output
            )
def test_partitions_for_hourly_schedule_decorators_without_timezone():
    """An ``@hourly_schedule`` without a timezone partitions and evaluates in UTC."""
    frozen_now = to_timezone(
        create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern"
    )

    with instance_for_test() as instance:
        with pendulum.test(frozen_now):
            context_without_time = build_schedule_context(instance)

            start_date = datetime(year=2019, month=1, day=1)

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
            )
            def hourly_foo_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_foo_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
                relativedelta(hours=1),
            )

            def _assert_single_request_for(context, expected_hour):
                requests = hourly_foo_schedule.get_execution_data(context)
                assert len(requests) == 1
                assert isinstance(requests[0], RunRequest)
                assert requests[0].run_config == {"hourly_time": expected_hour.isoformat()}

            # Without an explicit time the context falls back to the frozen clock.
            _assert_single_request_for(
                context_without_time,
                create_pendulum_time(year=2019, month=2, day=26, hour=23, tz="UTC"),
            )

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="UTC"
            )
            context_with_valid_time = build_schedule_context(instance, valid_time)

            _assert_single_request_for(
                context_with_valid_time,
                create_pendulum_time(year=2019, month=1, day=27, hour=0, tz="UTC"),
            )
def test_error_sensor_daemon(external_repo_context, monkeypatch):
    """The sensor daemon's heartbeat holds ``DAEMON_HEARTBEAT_ERROR_LIMIT`` errors after the loop."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"), "US/Central"
    )

    sleeps = []

    def fake_sleep(s):
        # Record the requested sleep and move pendulum's test clock forward instead of waiting.
        sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    exploding_launcher_overrides = {
        "run_launcher": {
            "module": "dagster.core.test_utils",
            "class": "ExplodingRunLauncher",
        },
    }

    with instance_with_sensors(
        external_repo_context,
        overrides=exploding_launcher_overrides,
    ) as (instance, workspace, _external_repo):

        @contextmanager
        def _gen_workspace(_instance):
            # Hand the daemon the workspace that the fixture already created.
            yield workspace

        with pendulum.test(frozen_now):
            unloadable_state = JobState(
                _get_unloadable_sensor_origin(), JobType.SENSOR, JobStatus.RUNNING
            )
            instance.add_job_state(unloadable_state)

            sensor_daemon = SensorDaemon.create_from_instance(instance)
            daemon_shutdown_event = threading.Event()
            sensor_daemon.run_loop(
                "my_uuid",
                daemon_shutdown_event,
                _gen_workspace,
                heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
                error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
                until=frozen_now.add(seconds=65),
            )

            sensor_heartbeat = instance.get_daemon_heartbeats()["SENSOR"]
            assert sensor_heartbeat
            assert sensor_heartbeat.errors
            assert len(sensor_heartbeat.errors) == DAEMON_HEARTBEAT_ERROR_LIMIT
def test_launch_failure(external_repo_context, capfd):
    """With a run launcher that throws, the run is still created and the tick is a SUCCESS."""
    frozen_now = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=27, hour=23, minute=59, second=59, tz="UTC"
        ),
        "US/Central",
    )

    exploding_launcher_overrides = {
        "run_launcher": {
            "module": "dagster.core.test_utils",
            "class": "ExplodingRunLauncher",
        },
    }

    with instance_with_sensors(
        external_repo_context,
        overrides=exploding_launcher_overrides,
    ) as (instance, grpc_server_registry, external_repo):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("always_on_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(
                recorded[0],
                external_sensor,
                frozen_now,
                JobTickStatus.SUCCESS,
                [run.run_id],
            )

            output = capfd.readouterr().out
            launch_failure_message = (
                "Run {run_id} created successfully but failed to launch:".format(
                    run_id=run.run_id
                )
            )
            assert launch_failure_message in output
            assert "The entire purpose of this is to throw on launch" in output
def resolve_evaluationResult(self, graphene_info):
    """Evaluate the schedule for its next tick and wrap the outcome for GraphQL.

    Returns ``None`` when the job is not a running schedule, or when its repository
    location or repository is not present in the request context. Otherwise returns
    a ``GrapheneTickEvaluation`` built from the schedule execution data, or from the
    serialized error if fetching that data raised.
    """
    job_state = self._job_state
    if job_state.status != JobStatus.RUNNING or job_state.job_type != JobType.SCHEDULE:
        return None

    repository_origin = job_state.origin.external_repository_origin
    context = graphene_info.context

    if not context.has_repository_location(
        repository_origin.repository_location_origin.location_name
    ):
        return None
    repository_location = context.get_repository_location(
        repository_origin.repository_location_origin.location_name
    )

    if not repository_location.has_repository(repository_origin.repository_name):
        return None
    repository = repository_location.get_repository(repository_origin.repository_name)

    external_schedule = repository.get_external_schedule(job_state.name)

    # Schedules without an execution timezone are evaluated in UTC.
    timezone_str = external_schedule.execution_timezone or "UTC"

    upcoming_tick = next(external_schedule.execution_time_iterator(self._timestamp))
    schedule_time = to_timezone(pendulum.instance(upcoming_tick), timezone_str)

    try:
        schedule_data = repository_location.get_external_schedule_execution_data(
            instance=context.instance,
            repository_handle=repository.handle,
            schedule_name=external_schedule.name,
            scheduled_execution_time=schedule_time,
        )
    except Exception:  # pylint: disable=broad-except
        # Surface the failure to the client as data rather than failing the query.
        schedule_data = serializable_error_info_from_exc_info(sys.exc_info())

    return GrapheneTickEvaluation(schedule_data)
def _test_backfill_in_subprocess(instance_ref, debug_crash_flags):
    """Run one backfill daemon iteration against ``instance_ref`` with the clock frozen.

    The test instance is always cleaned up, even if the iteration raises.
    """
    frozen_now = to_timezone(
        create_pendulum_time(year=2021, month=2, day=17),
        "US/Central",
    )

    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(frozen_now):
                with ProcessGrpcServerRegistry(
                    wait_for_processes_on_exit=True
                ) as grpc_server_registry:
                    iteration = execute_backfill_iteration(
                        instance,
                        grpc_server_registry,
                        get_default_daemon_logger("BackfillDaemon"),
                        debug_crash_flags=debug_crash_flags,
                    )
                    # Exhaust the generator so the whole iteration runs.
                    for _ in iteration:
                        pass
        finally:
            cleanup_test_instance(instance)
def test_large_sensor(external_repo_context):
    """Evaluating ``large_sensor`` once records a single SUCCESS tick."""
    frozen_now = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, tz="UTC"),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (instance, workspace, external_repo):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("large_sensor")
            instance.start_sensor(external_sensor)

            evaluate_sensors(instance, workspace)

            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(recorded[0], external_sensor, frozen_now, JobTickStatus.SUCCESS)
def resolve_evaluationResult(self, graphene_info):
    """Evaluate the schedule for its next tick and wrap the outcome for GraphQL.

    Returns ``None`` when the job is not a running schedule, or when its repository
    location or repository is not present in the request context. Otherwise returns
    a ``GrapheneTickEvaluation`` built from the schedule execution data.

    Errors raised while fetching the execution data propagate to the caller.
    """
    if self._job_state.status != JobStatus.RUNNING:
        return None

    if self._job_state.job_type != JobType.SCHEDULE:
        return None

    repository_origin = self._job_state.origin.external_repository_origin
    location_name = repository_origin.repository_location_origin.location_name

    if not graphene_info.context.has_repository_location(location_name):
        return None

    repository_location = graphene_info.context.get_repository_location(location_name)
    if not repository_location.has_repository(repository_origin.repository_name):
        return None

    repository = repository_location.get_repository(repository_origin.repository_name)
    external_schedule = repository.get_external_schedule(self._job_state.name)

    # Schedules without an execution timezone are iterated in UTC by
    # schedule_execution_time_iterator, so evaluate them in UTC as well rather
    # than in whatever timezone the serving machine happens to be configured with.
    timezone_str = external_schedule.execution_timezone
    if not timezone_str:
        timezone_str = "UTC"

    next_tick_datetime = next(external_schedule.execution_time_iterator(self._timestamp))
    schedule_time = to_timezone(pendulum.instance(next_tick_datetime), timezone_str)

    schedule_data = repository_location.get_external_schedule_execution_data(
        instance=graphene_info.context.instance,
        repository_handle=repository.handle,
        schedule_name=external_schedule.name,
        scheduled_execution_time=schedule_time,
    )
    return GrapheneTickEvaluation(schedule_data)
def test_execute_during_dst_transition_spring_forward(external_repo_context):
    """Daily schedule at 2:30 US/Central across the March 2019 DST transition.

    Three days after starting the schedule, three ticks and runs are expected:
    2:30 on 3/9 and 3/11, and 3:00 on 3/10. Launching again must not add
    ticks or runs.
    """
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        # Day before DST
        frozen_now = to_timezone(
            create_pendulum_time(2019, 3, 9, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(frozen_now):
            external_schedule = external_repo.get_external_schedule(
                "daily_dst_transition_schedule_skipped_time"
            )
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(schedule_origin.get_id())) == 0

        frozen_now = frozen_now.add(days=3)

        with pendulum.test(frozen_now):
            list(launch_scheduled_runs(instance, logger(), pendulum.now("UTC")))

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            # Expected values are listed newest first.
            central_execution_times = [
                create_pendulum_time(2019, 3, 11, 2, 30, 0, tz="US/Central"),
                create_pendulum_time(2019, 3, 10, 3, 0, 0, tz="US/Central"),
                create_pendulum_time(2019, 3, 9, 2, 30, 0, tz="US/Central"),
            ]
            expected_datetimes_utc = [
                to_timezone(central_time, "UTC") for central_time in central_execution_times
            ]

            expected_partition_times = [
                create_pendulum_time(2019, 3, 10, tz="US/Central"),
                create_pendulum_time(2019, 3, 9, tz="US/Central"),
                create_pendulum_time(2019, 3, 8, tz="US/Central"),
            ]

            partition_set_def = the_repo.get_partition_set_def(
                "daily_dst_transition_schedule_skipped_time_partitions"
            )
            partition_names = partition_set_def.get_partition_names()

            assert "2019-03-08" in partition_names
            assert "2019-03-09" in partition_names
            assert "2019-03-10" in partition_names

            expectations = zip(expected_datetimes_utc, expected_partition_times)
            for index, (expected_time, expected_partition) in enumerate(expectations):
                validate_tick(
                    ticks[index],
                    external_schedule,
                    expected_time,
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[index].run_id],
                )
                validate_run_started(
                    instance.get_runs()[index],
                    expected_time,
                    partition_time=expected_partition,
                )

            # Verify idempotence
            list(launch_scheduled_runs(instance, logger(), pendulum.now("UTC")))
            assert instance.get_runs_count() == 3
            assert len(instance.get_job_ticks(schedule_origin.get_id())) == 3
def test_execute_during_dst_transition_fall_back(external_repo_context):
    """A daily schedule at a time that occurs twice on the fall DST day runs once that day.

    Three days after starting the schedule, exactly three ticks and runs are
    expected, and launching again must not add any.
    """
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        frozen_now = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(frozen_now):
            external_schedule = external_repo.get_external_schedule(
                "daily_dst_transition_schedule_doubled_time"
            )
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(schedule_origin.get_id())) == 0

        frozen_now = frozen_now.add(days=3)

        with pendulum.test(frozen_now):
            list(launch_scheduled_runs(instance, logger(), pendulum.now("UTC")))

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            # Expected values are listed newest first.
            expected_datetimes_utc = [
                create_pendulum_time(2019, 11, 4, 7, 30, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 3, 7, 30, 0, tz="UTC"),
                create_pendulum_time(2019, 11, 2, 6, 30, 0, tz="UTC"),
            ]

            expected_partition_times = [
                create_pendulum_time(2019, 11, 3, tz="US/Central"),
                create_pendulum_time(2019, 11, 2, tz="US/Central"),
                create_pendulum_time(2019, 11, 1, tz="US/Central"),
            ]

            expectations = zip(expected_datetimes_utc, expected_partition_times)
            for index, (expected_time, expected_partition) in enumerate(expectations):
                validate_tick(
                    ticks[index],
                    external_schedule,
                    expected_time,
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[index].run_id],
                )
                validate_run_started(
                    instance.get_runs()[index],
                    expected_time,
                    partition_time=expected_partition,
                )

            # Verify idempotence
            list(launch_scheduled_runs(instance, logger(), pendulum.now("UTC")))
            assert instance.get_runs_count() == 3
            assert len(instance.get_job_ticks(schedule_origin.get_id())) == 3
def test_simple_sensor(external_repo_context, capfd):
    """``simple_sensor`` skips on its first evaluation and launches a run 30 seconds later.

    The daemon's captured stdout is compared exactly for both evaluations.
    """
    frozen_now = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=27, hour=23, minute=59, second=59, tz="UTC"
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 0

            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(recorded[0], external_sensor, frozen_now, JobTickStatus.SKIPPED)

            captured = capfd.readouterr()
            expected_skip_output = (
                "2019-02-27 17:59:59 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor\n"
                "2019-02-27 17:59:59 - SensorDaemon - INFO - Sensor returned false for simple_sensor, skipping\n"
            )
            assert captured.out == expected_skip_output

        frozen_now = frozen_now.add(seconds=30)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, grpc_server_registry)
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            validate_run_started(run)

            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 2

            expected_datetime = create_pendulum_time(
                year=2019, month=2, day=28, hour=0, minute=0, second=29
            )
            validate_tick(
                recorded[0],
                external_sensor,
                expected_datetime,
                JobTickStatus.SUCCESS,
                [run.run_id],
            )

            captured = capfd.readouterr()
            expected_launch_output = (
                "2019-02-27 18:00:29 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor\n"
                "2019-02-27 18:00:29 - SensorDaemon - INFO - Launching run for simple_sensor\n"
                "2019-02-27 18:00:29 - SensorDaemon - INFO - Completed launch of run {run_id} for simple_sensor\n"
            ).format(run_id=run.run_id)
            assert captured.out == expected_launch_output
def schedule_execution_time_iterator(start_timestamp, cron_schedule, execution_timezone):
    """Yield the execution times of a cron schedule, starting at ``start_timestamp``.

    Args:
        start_timestamp (float): Unix timestamp to start iterating from.
        cron_schedule (str): Five-field cron string, fields separated by single spaces.
        execution_timezone (Optional[str]): Timezone the schedule runs in; "UTC" when
            not provided.

    Yields:
        Pendulum datetimes in the execution timezone. The generator never terminates
        on its own; callers stop consuming it when they have enough.

    Common fixed intervals (hourly/daily/weekly/monthly) are advanced by adding a
    delta instead of asking croniter. Every other schedule, including a monthly
    schedule on day 29-31, is iterated through croniter.
    """
    check.float_param(start_timestamp, "start_timestamp")
    check.str_param(cron_schedule, "cron_schedule")
    check.opt_str_param(execution_timezone, "execution_timezone")
    timezone_str = execution_timezone if execution_timezone else "UTC"

    start_datetime = pendulum.from_timestamp(start_timestamp, tz=timezone_str)

    date_iter = croniter(cron_schedule, start_datetime)

    # Go back one iteration so that the next iteration is the first time that is >= start_datetime
    # and matches the cron schedule
    next_date = to_timezone(
        pendulum.instance(date_iter.get_prev(datetime.datetime)), timezone_str
    )

    cron_parts = cron_schedule.split(" ")

    check.invariant(len(cron_parts) == 5)

    is_numeric = [part.isnumeric() for part in cron_parts]

    delta_fn = None
    should_hour_change = False

    # Special-case common intervals (hourly/daily/weekly/monthly) since croniter iteration can be
    # much slower than adding a fixed interval
    if (
        cron_schedule.endswith(" * *")
        and all(is_numeric[0:3])
        # Adding a month to day 29/30/31 clamps to the last day of shorter months
        # (and then stays there), which would not match the cron string. Only days
        # that exist in every month are safe for the fixed-delta shortcut.
        and int(cron_parts[2]) <= 28
    ):  # monthly
        delta_fn = lambda d, num: d.add(months=num)
        should_hour_change = False
    elif (
        all(is_numeric[0:2])
        and is_numeric[4]
        and cron_parts[2] == "*"
        and cron_parts[3] == "*"
    ):  # weekly
        delta_fn = lambda d, num: d.add(weeks=num)
        should_hour_change = False
    elif all(is_numeric[0:2]) and cron_schedule.endswith(" * * *"):  # daily
        delta_fn = lambda d, num: d.add(days=num)
        should_hour_change = False
    elif is_numeric[0] and cron_schedule.endswith(" * * * *"):  # hourly
        delta_fn = lambda d, num: d.add(hours=num)
        should_hour_change = True

    while True:
        if delta_fn:
            curr_hour = next_date.hour

            next_date_cand = delta_fn(next_date, 1)
            new_hour = next_date_cand.hour

            if not should_hour_change and new_hour != curr_hour:
                # If the hour changes during a daily/weekly/monthly schedule, it
                # indicates that the time shifted due to falling in a time that doesn't
                # exist due to a DST transition (for example, 2:30AM CST on 3/10/2019).
                # Instead, execute at the first time that does exist (the start of the hour),
                # but return to the original hour for all subsequent executions so that the
                # hour doesn't stay different permanently.
                check.invariant(new_hour == curr_hour + 1)
                yield next_date_cand.replace(minute=0)

                next_date_cand = delta_fn(next_date, 2)
                check.invariant(next_date_cand.hour == curr_hour)

            next_date = next_date_cand
        else:
            next_date = to_timezone(
                pendulum.instance(date_iter.get_next(datetime.datetime)), timezone_str
            )

        yield next_date
def test_launch_once(external_repo_context, capfd):
    """A sensor that always requests the same run key launches only one run.

    Later evaluations record SKIPPED ticks, including after a run with the same
    tags has been created manually.
    """
    frozen_now = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
            tz="UTC",
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_now):
            external_sensor = external_repo.get_external_sensor("run_key_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            assert instance.get_runs_count() == 0
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 0

            evaluate_sensors(instance, grpc_server_registry)
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 1
            validate_tick(
                recorded[0],
                external_sensor,
                frozen_now,
                JobTickStatus.SUCCESS,
                expected_run_ids=[run.run_id],
            )

        # run again (after 30 seconds), to ensure that the run key maintains idempotence
        frozen_now = frozen_now.add(seconds=30)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, grpc_server_registry)

            assert instance.get_runs_count() == 1
            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 2
            validate_tick(recorded[0], external_sensor, frozen_now, JobTickStatus.SKIPPED)

            output = capfd.readouterr().out
            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in output
            )

            launched_run = instance.get_runs()[0]

            # Manually create a new run with the same tags
            execute_pipeline(
                the_pipeline,
                run_config=launched_run.run_config,
                tags=launched_run.tags,
                instance=instance,
            )

        # Sensor loop still executes
        frozen_now = frozen_now.add(seconds=30)

        with pendulum.test(frozen_now):
            evaluate_sensors(instance, grpc_server_registry)

            recorded = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(recorded) == 3
            validate_tick(recorded[0], external_sensor, frozen_now, JobTickStatus.SKIPPED)
def test_failure_recovery_before_run_created(
    external_repo_context, crash_location, crash_signal, capfd
):
    """Verify scheduler recovery after a crash that precedes run creation.

    The scheduler is first run in a subprocess with crash flags set for
    ``simple_schedule``; it must exit non-zero, leaving one STARTED tick and
    no runs.  Five minutes later it is run again without crash flags and
    must finish that same tick as SUCCESS with exactly one run.

    Args:
        external_repo_context: context factory handed to
            ``instance_with_schedules``.
        crash_location: key placed in the debug crash flags (presumably the
            point in the scheduler at which to crash -- supplied by the
            test's parametrization, which is not visible here).
        crash_signal: value paired with ``crash_location`` in the flags.
        capfd: pytest fixture used to capture the daemon's stdout.
    """

    def run_scheduler(instance, at_time, crash_flags):
        # Run one scheduler iteration in a child process and report its exit code.
        scheduler_process = multiprocessing.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), at_time, crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        return scheduler_process.exitcode

    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = to_timezone(
            create_pendulum_time(
                year=2019, month=2, day=27, hour=0, minute=0, second=0, tz="UTC"
            ),
            "US/Central",
        )
        frozen_datetime = initial_datetime
        external_schedule = external_repo.get_external_schedule("simple_schedule")

        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}
            assert run_scheduler(instance, frozen_datetime, debug_crash_flags) != 0

            # One log record per line; the comparison below is against stdout
            # with "\r\n" normalised to "\n", so each record ends in "\n".
            expected_crash_log = (
                "2019-02-26 18:00:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule\n"
                "2019-02-26 18:00:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000\n"
            )
            captured = capfd.readouterr()
            assert captured.out.replace("\r\n", "\n") == expected_crash_log

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 0

        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):
            assert run_scheduler(instance, frozen_datetime, None) == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=initial_datetime,
                partition_time=create_pendulum_time(2019, 2, 26),
            )

            # The interrupted tick is resumed rather than a second one created.
            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                [instance.get_runs()[0].run_id],
            )

            expected_recovery_log = (
                "2019-02-26 18:05:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule\n"
                "2019-02-26 18:05:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000\n"
                "2019-02-26 18:05:00 - SchedulerDaemon - INFO - Resuming previously interrupted schedule execution\n"
                "2019-02-26 18:05:00 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for simple_schedule\n"
            ).format(run_id=instance.get_runs()[0].run_id)
            captured = capfd.readouterr()
            assert captured.out.replace("\r\n", "\n") == expected_recovery_log
def test_daily_dst_fall_back(external_repo_context):
    """Verify ``daily_central_time_schedule`` runs once per day across fall-back.

    The schedule is started at 2019-11-03 00:00 US/Central, the clock is moved
    forward two days, and a single scheduler pass must then produce three
    ticks and three runs at the expected UTC instants.  A second pass must
    not add anything.
    """
    # Verify that a daily schedule still runs once per day during the fall DST transition
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        # Night before DST
        start_time = to_timezone(
            create_pendulum_time(2019, 11, 3, 0, 0, 0, tz="US/Central"),
            "US/Pacific",
        )

        with pendulum.test(start_time):
            external_schedule = external_repo.get_external_schedule(
                "daily_central_time_schedule"
            )
            schedule_origin = external_schedule.get_external_origin()
            instance.start_schedule_and_update_storage_state(external_schedule)

            assert instance.get_runs_count() == 0
            assert len(instance.get_job_ticks(schedule_origin.get_id())) == 0

        two_days_later = start_time.add(days=2)
        with pendulum.test(two_days_later):
            # launch_scheduled_runs is consumed as an iterable; drain it fully.
            for _ in launch_scheduled_runs(instance, logger(), pendulum.now("UTC")):
                pass

            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3

            # UTC time changed by one hour after the transition, still running daily at the same
            # time in CT.  Each entry is (tick time in UTC, partition time).
            expectations = [
                (
                    create_pendulum_time(2019, 11, 5, 6, 0, 0, tz="UTC"),
                    create_pendulum_time(2019, 11, 4, tz="US/Central"),
                ),
                (
                    create_pendulum_time(2019, 11, 4, 6, 0, 0, tz="UTC"),
                    create_pendulum_time(2019, 11, 3, tz="US/Central"),
                ),
                (
                    create_pendulum_time(2019, 11, 3, 5, 0, 0, tz="UTC"),
                    create_pendulum_time(2019, 11, 2, tz="US/Central"),
                ),
            ]

            for position, (tick_time_utc, partition_time) in enumerate(expectations):
                validate_tick(
                    ticks[position],
                    external_schedule,
                    tick_time_utc,
                    JobTickStatus.SUCCESS,
                    [instance.get_runs()[position].run_id],
                )
                validate_run_started(
                    instance.get_runs()[position],
                    tick_time_utc,
                    partition_time=partition_time,
                )

            # Verify idempotence
            for _ in launch_scheduled_runs(instance, logger(), pendulum.now("UTC")):
                pass

            assert instance.get_runs_count() == 3
            ticks = instance.get_job_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
def test_partitions_for_monthly_schedule_decorators_without_timezone():
    """Exercise ``@monthly_schedule`` when no execution timezone is supplied.

    With the clock frozen at 2019-02-27 00:01:01 UTC, a schedule starting on
    2019-01-01 must hand the January partition to its config function, both
    for an explicit execution time of 2019-02-03 09:30 UTC and when the
    context carries no time.  With ``partition_months_offset=0`` the same
    execution time must select the February partition.
    """

    def sole_run_config(schedule_def, context):
        # Every evaluation in this test must yield exactly one RunRequest.
        results = schedule_def.get_execution_data(context)
        assert len(results) == 1
        assert isinstance(results[0], RunRequest)
        return results[0].run_config

    with instance_for_test() as instance:
        frozen_now = to_timezone(
            create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern"
        )
        with pendulum.test(frozen_now):
            context_without_time = ScheduleExecutionContext(instance.get_ref(), None)

            start_date = datetime(year=2019, month=1, day=1)

            @monthly_schedule(
                pipeline_name="foo_pipeline",
                execution_day_of_month=3,
                start_date=start_date,
                execution_time=time(9, 30),
            )
            def monthly_foo_schedule(monthly_time):
                return {"monthly_time": monthly_time.isoformat()}

            valid_monthly_time = create_pendulum_time(
                year=2019, month=2, day=3, hour=9, minute=30, tz="UTC"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_monthly_time
            )

            assert sole_run_config(monthly_foo_schedule, context_with_valid_time) == {
                "monthly_time": create_pendulum_time(
                    year=2019, month=1, day=1, tz="UTC"
                ).isoformat()
            }

            assert sole_run_config(monthly_foo_schedule, context_without_time) == {
                "monthly_time": create_pendulum_time(
                    year=2019, month=1, day=1, tz="UTC"
                ).isoformat()
            }

            _check_partitions(
                monthly_foo_schedule,
                1,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_MONTHLY_FORMAT,
                relativedelta(months=1),
            )

            # test partition_months_offset=0
            @monthly_schedule(
                pipeline_name="foo_pipeline",
                execution_day_of_month=3,
                start_date=start_date,
                execution_time=time(9, 30),
                partition_months_offset=0,
            )
            def monthly_foo_schedule_same_month(monthly_time):
                return {"monthly_time": monthly_time.isoformat()}

            valid_monthly_time = create_pendulum_time(
                year=2019, month=2, day=3, hour=9, minute=30, tz="UTC"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_monthly_time
            )

            assert sole_run_config(
                monthly_foo_schedule_same_month, context_with_valid_time
            ) == {
                "monthly_time": create_pendulum_time(
                    year=2019, month=2, day=1, tz="UTC"
                ).isoformat()
            }
def test_partitions_for_hourly_schedule_decorators_without_timezone():
    """Exercise ``@hourly_schedule`` when no execution timezone is supplied.

    Checks the generated partitions, the run config produced when the
    context has no time and when it has a valid time, and that an execution
    time before the start date yields a single ``SkipReason``.
    """

    def sole_run_config(schedule_def, context):
        # Evaluate the schedule and insist on exactly one RunRequest.
        results = schedule_def.get_execution_data(context)
        assert len(results) == 1
        assert isinstance(results[0], RunRequest)
        return results[0].run_config

    with instance_for_test() as instance:
        frozen_now = to_timezone(
            create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern"
        )
        with pendulum.test(frozen_now):
            context_without_time = ScheduleExecutionContext(instance.get_ref(), None)

            start_date = datetime(year=2019, month=1, day=1)

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
            )
            def hourly_foo_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_foo_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="UTC"),
                DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
                relativedelta(hours=1),
            )

            assert sole_run_config(hourly_foo_schedule, context_without_time) == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=2, day=26, hour=23, tz="UTC"
                ).isoformat()
            }

            # time that's invalid since it corresponds to a partition before the start date
            # should not execute and should yield a SkipReason if it tries to generate run config
            execution_time_with_invalid_partition = create_pendulum_time(
                year=2018, month=12, day=30, hour=3, minute=25, tz="UTC"
            )
            context_with_invalid_time = ScheduleExecutionContext(
                instance.get_ref(), execution_time_with_invalid_partition
            )

            skipped = hourly_foo_schedule.get_execution_data(context_with_invalid_time)
            assert len(skipped) == 1
            skip_data = skipped[0]
            assert isinstance(skip_data, SkipReason)
            expected_skip_text = (
                "Partition selector did not return a partition. "
                "Make sure that the timezone on your partition set matches your execution timezone."
            )
            assert expected_skip_text in skip_data.skip_message

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="UTC"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time
            )

            assert sole_run_config(hourly_foo_schedule, context_with_valid_time) == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=1, day=27, hour=0, tz="UTC"
                ).isoformat()
            }
def test_wrong_config_sensor(external_repo_context, capfd):
    """Check that ``wrong_config_sensor`` records a FAILURE tick and no run.

    The sensor is evaluated twice at the same frozen time.  Each evaluation
    must add one FAILURE tick whose error mentions the pipeline config, must
    print that error to stdout, and must leave the run count at zero.

    Args:
        external_repo_context: context factory handed to
            ``instance_with_sensors``.
        capfd: pytest fixture used to capture the daemon's stdout.
    """
    # The timezone is spelled out so this test matches its siblings, which
    # all pass tz="UTC" explicitly.
    freeze_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
            tz="UTC",
        ),
        "US/Central",
    )
    expected_error = "Error in config for pipeline the_pipeline"

    with instance_with_sensors(external_repo_context) as (
        instance,
        grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(freeze_datetime):
            external_sensor = external_repo.get_external_sensor("wrong_config_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 0

            # Error repeats on subsequent ticks: the second pass must look
            # exactly like the first, with one more tick on record.
            for expected_tick_count in (1, 2):
                evaluate_sensors(instance, grpc_server_registry)

                assert instance.get_runs_count() == 0
                ticks = instance.get_job_ticks(
                    external_sensor.get_external_origin_id()
                )
                assert len(ticks) == expected_tick_count

                validate_tick(
                    ticks[0],
                    external_sensor,
                    freeze_datetime,
                    JobTickStatus.FAILURE,
                    [],
                    expected_error,
                )

                captured = capfd.readouterr()
                assert expected_error in captured.out
def test_failure_after_run_launched(
    external_repo_context, crash_location, crash_signal, capfd
):
    """Check that a crash after launching a run does not cause a duplicate run.

    The sensor daemon is first run in a subprocess with crash flags set for
    ``run_key_sensor``; it must exit non-zero, leaving one STARTED tick and
    one run tagged with the sensor name and run key.  A second subprocess,
    one second later and without crash flags, must exit cleanly, report the
    run key as already completed, and record a SKIPPED tick.
    """
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=28, hour=0, minute=0, second=0, tz="UTC"
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):

        def run_sensor_daemon(at_time, crash_flags):
            # Run one sensor iteration in a child process; return its exit code.
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), at_time, crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            return launch_process.exitcode

        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("run_key_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            # create a run, launch but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            assert run_sensor_daemon(frozen_datetime, debug_crash_flags) != 0

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"

            # Discard output from the crashed attempt before the retry.
            capfd.readouterr()

            assert run_sensor_daemon(frozen_datetime.add(seconds=1), None) == 0

            wait_for_all_runs_to_start(instance)
            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]

            captured = capfd.readouterr()
            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out
            )

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SKIPPED
def test_failure_before_run_created(
    external_repo_context, crash_location, crash_signal, capfd
):
    """Check sensor recovery after a crash that precedes run creation.

    Three sensor-daemon subprocesses are run for ``simple_sensor``:

    1. at the frozen time, without crash flags -- must leave one SKIPPED tick;
    2. 31 seconds later, with crash flags -- must exit non-zero, leaving a
       STARTED tick and no runs;
    3. 62 seconds after the frozen time, without crash flags -- must exit
       cleanly, launch exactly one run, and print the expected log lines.

    Args:
        external_repo_context: context factory handed to
            ``instance_with_sensors``.
        crash_location: key placed in the debug crash flags (presumably the
            point in the daemon at which to crash -- supplied by the test's
            parametrization, which is not visible here).
        crash_signal: value paired with ``crash_location`` in the flags.
        capfd: pytest fixture used to capture the daemon's stdout.
    """
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019, month=2, day=28, hour=0, minute=0, second=1, tz="UTC"
        ),
        "US/Central",
    )

    with instance_with_sensors(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            instance.add_job_state(
                JobState(
                    external_sensor.get_external_origin(),
                    JobType.SENSOR,
                    JobStatus.RUNNING,
                )
            )

            # create a tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.SKIPPED
            # Discard captured output; only the final launch's log is compared.
            capfd.readouterr()

            # create a starting tick, but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=31),
                    debug_crash_flags,
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            capfd.readouterr()

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.STARTED
            assert not int(ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=62), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]

            # One log record per line; the comparison is against stdout with
            # "\r\n" normalised to "\n", so each record ends in "\n".
            expected_log = (
                "2019-02-27 18:01:03 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor\n"
                "2019-02-27 18:01:03 - SensorDaemon - INFO - Launching run for simple_sensor\n"
                f"2019-02-27 18:01:03 - SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor\n"
            )
            captured = capfd.readouterr()
            assert captured.out.replace("\r\n", "\n") == expected_log

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == JobTickStatus.SUCCESS
def test_partitions_for_hourly_schedule_decorators_with_timezone():
    """Exercise ``@hourly_schedule`` with ``execution_timezone="US/Central"``.

    Covers a naive start date, a start date given in US/Pacific, and the
    effect of ``partition_hours_offset`` values 0 and 2 on which hour is
    passed to the config function for an execution time of
    2019-01-27 01:25 US/Central.
    """

    def sole_run_config(schedule_def, context):
        # Evaluate the schedule and insist on exactly one RunRequest.
        results = schedule_def.get_execution_data(context)
        assert len(results) == 1
        assert isinstance(results[0], RunRequest)
        return results[0].run_config

    with instance_for_test() as instance:
        frozen_now = create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="US/Central")
        with pendulum.test(frozen_now):
            start_date = datetime(year=2019, month=1, day=1)

            # You can specify a start date with no timezone and it will be assumed to be
            # in the execution timezone

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
            )
            def hourly_central_schedule(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            assert hourly_central_schedule.execution_timezone == "US/Central"

            _check_partitions(
                hourly_central_schedule,
                HOURS_UNTIL_FEBRUARY_27,
                pendulum.instance(start_date, tz="US/Central"),
                DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
                relativedelta(hours=1),
            )

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="US/Central"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time
            )

            assert sole_run_config(hourly_central_schedule, context_with_valid_time) == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=1, day=27, hour=0, tz="US/Central"
                ).isoformat()
            }

            # You can specify a start date in a different timezone and it will be transformed into the
            # execution timezone
            start_date_with_different_timezone = create_pendulum_time(
                2019, 1, 1, 0, tz="US/Pacific"
            )

            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
            )
            def hourly_central_schedule_with_timezone_start_time(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            _check_partitions(
                hourly_central_schedule_with_timezone_start_time,
                HOURS_UNTIL_FEBRUARY_27 - 2,  # start date is two hours later since it's in PT
                to_timezone(start_date_with_different_timezone, "US/Central"),
                DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
                relativedelta(hours=1),
            )

            # test partition_hours_offset=0
            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
                partition_hours_offset=0,
            )
            def hourly_schedule_for_current_hour(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="US/Central"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time
            )

            assert sole_run_config(
                hourly_schedule_for_current_hour, context_with_valid_time
            ) == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=1, day=27, hour=1, tz="US/Central"
                ).isoformat()
            }

            # test partition_hours_offset=2
            @hourly_schedule(
                pipeline_name="foo_pipeline",
                start_date=start_date_with_different_timezone,
                execution_time=time(hour=0, minute=25),
                execution_timezone="US/Central",
                partition_hours_offset=2,
            )
            def hourly_schedule_for_two_hours_ago(hourly_time):
                return {"hourly_time": hourly_time.isoformat()}

            valid_time = create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="US/Central"
            )
            context_with_valid_time = ScheduleExecutionContext(
                instance.get_ref(), valid_time
            )

            assert sole_run_config(
                hourly_schedule_for_two_hours_ago, context_with_valid_time
            ) == {
                "hourly_time": create_pendulum_time(
                    year=2019, month=1, day=26, hour=23, tz="US/Central"
                ).isoformat()
            }
def _create_scheduler_run(
    instance,
    logger,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    """Create the run record for one scheduled ``run_request``.

    An execution plan is requested from ``repo_location`` first.  If that
    raises, the errors are collected instead of propagated and the run is
    still created, but with FAILURE status; each error is then reported as
    an engine event, the run is reported as failed, and the errors are
    logged.

    Args:
        instance: used to create the run and to report events and failure.
        logger: receives an error message when the execution plan could not
            be fetched.
        schedule_time: the scheduled execution time; stored on the run, in
            UTC ISO format, under ``SCHEDULED_EXECUTION_TIME_TAG``.
        repo_location: provides ``get_external_execution_plan``.
        external_schedule: supplies the pipeline name, mode and schedule name.
        external_pipeline: supplies tags, solid selection, snapshots and the
            external origin for the run.
        run_request: supplies ``run_config``, ``tags`` and ``run_key``.

    Returns:
        The created run, whether or not the execution plan was fetched.
    """
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    plan_errors = []
    plan_snapshot = None
    try:
        plan_snapshot = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
            known_state=None,
        ).execution_plan_snapshot
    except DagsterSubprocessError as subprocess_failure:
        plan_errors.extend(subprocess_failure.subprocess_error_infos)
    except Exception:  # pylint: disable=broad-except
        plan_errors.append(serializable_error_info_from_exc_info(sys.exc_info()))

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")

    tags = merge_dicts(pipeline_tags, schedule_tags)
    tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(schedule_time, "UTC").isoformat()
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    if plan_errors:
        initial_status = PipelineRunStatus.FAILURE
    else:
        initial_status = PipelineRunStatus.NOT_STARTED

    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=initial_status,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

    if not plan_errors:
        return possibly_invalid_pipeline_run

    # Surface every collected error on the run itself, then mark it failed.
    for plan_error in plan_errors:
        instance.report_engine_event(
            plan_error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(plan_error),
        )
    instance.report_run_failed(possibly_invalid_pipeline_run)

    error_string = "\n".join(plan_error.to_string() for plan_error in plan_errors)
    logger.error(f"Failed to fetch execution plan for {external_schedule.name}: {error_string}")
    return possibly_invalid_pipeline_run