Exemple #1
0
    def get_date_range_partitions(current_time=None):
        """Build the partitions between the configured start and end times.

        The start, end, timezone, format, step and inclusivity settings are
        read from the enclosing scope.

        Args:
            current_time: Optional ``datetime.datetime`` used as the end of
                the range when the enclosing ``end`` is falsy. When both are
                falsy, ``pendulum.now`` in the definition timezone is used.

        Returns:
            A list of ``Partition`` objects, one per step of the range. The
            last element is dropped unless the enclosing ``inclusive`` flag
            is truthy.
        """
        check.opt_inst_param(current_time, "current_time", datetime.datetime)
        tz = timezone or "UTC"

        if isinstance(start, PendulumDateTime):
            _start = to_timezone(start, tz)
        else:
            _start = pendulum.instance(start, tz=tz)

        # Precedence for the end of the range: the configured end, then the
        # caller-supplied current_time, then "now" in the definition timezone.
        _end = end or current_time or pendulum.now(tz)

        # Coerce the end of the range to the definition timezone as well.
        _end = (to_timezone(_end, tz) if isinstance(_end, PendulumDateTime)
                else pendulum.instance(_end, tz=tz))

        partitions = [
            Partition(value=step, name=step.strftime(fmt))
            for step in pendulum.period(_start, _end).range(
                delta_range, delta_amount)
        ]

        if inclusive:
            return partitions

        # By default only fully completed intervals are returned: _end falls
        # in the middle of the interval represented by the last element, so
        # that element is left out.
        return partitions[:-1]
Exemple #2
0
def _create_scheduler_run(
    instance,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    """Create a NOT_STARTED run for one run request of a schedule tick.

    Fetches the external execution plan for the request's run config, merges
    the pipeline tags with the request tags, stamps the scheduled execution
    time (as a UTC ISO string) and, if present, the run key onto the tags,
    logs a SCHEDULED_RUN_CREATED telemetry action, and finally asks the
    instance to create the run.

    Returns:
        Whatever ``instance.create_run`` returns.
    """
    # Imported locally, as in the original code.
    from dagster.daemon.daemon import get_telemetry_daemon_session_id

    run_config = run_request.run_config
    request_tags = run_request.tags

    plan_snapshot = repo_location.get_external_execution_plan(
        external_pipeline,
        run_config,
        external_schedule.mode,
        step_keys_to_execute=None,
        known_state=None,
    ).execution_plan_snapshot

    base_tags = external_pipeline.tags or {}
    check_tags(base_tags, "pipeline_tags")

    run_tags = merge_dicts(base_tags, request_tags)
    run_tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(
        schedule_time, "UTC").isoformat()
    if run_request.run_key:
        run_tags[RUN_KEY_TAG] = run_request.run_key

    telemetry_metadata = {
        "DAEMON_SESSION_ID": get_telemetry_daemon_session_id(),
        "SCHEDULE_NAME_HASH": hash_name(external_schedule.name),
        "repo_hash": hash_name(repo_location.name),
        "pipeline_name_hash": hash_name(external_pipeline.name),
    }
    log_action(instance, SCHEDULED_RUN_CREATED, metadata=telemetry_metadata)

    return instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=PipelineRunStatus.NOT_STARTED,
        root_run_id=None,
        parent_run_id=None,
        tags=run_tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
    )
Exemple #3
0
def test_partitions_for_monthly_schedule_decorators_without_timezone(
        partition_months_offset: int):
    """Check a monthly schedule that is declared without a timezone.

    With the clock frozen at 2019-03-27T00:01:01 UTC (expressed in
    US/Eastern), evaluating the schedule with an explicit scheduled execution
    time and with no time at all must both yield exactly one run request for
    the 2019-03-01 UTC partition shifted back by ``partition_months_offset``
    months.
    """
    frozen_now = to_timezone(
        create_pendulum_time(2019, 3, 27, 0, 1, 1, tz="UTC"), "US/Eastern")

    with pendulum.test(frozen_now):
        context_without_time = build_schedule_context()

        start_date = datetime(year=2019, month=1, day=1)

        @monthly_schedule(
            pipeline_name="foo_pipeline",
            execution_day_of_month=3,
            start_date=start_date,
            execution_time=time(9, 30),
            partition_months_offset=partition_months_offset,
        )
        def monthly_foo_schedule(monthly_time):
            return {"monthly_time": monthly_time.isoformat()}

        context_with_valid_time = build_schedule_context(
            scheduled_execution_time=create_pendulum_time(
                year=2019, month=3, day=3, hour=9, minute=30, tz="UTC"))

        # Both evaluations are expected to select the same partition.
        march_first = create_pendulum_time(year=2019, month=3, day=1, tz="UTC")
        expected_run_config = {
            "monthly_time":
            march_first.subtract(months=partition_months_offset).isoformat()
        }

        for context in (context_with_valid_time, context_without_time):
            execution_data = monthly_foo_schedule.evaluate_tick(context)
            assert execution_data.run_requests
            assert len(execution_data.run_requests) == 1
            assert (execution_data.run_requests[0].run_config ==
                    expected_run_config)

        _check_partitions(
            monthly_foo_schedule,
            3 - partition_months_offset,
            pendulum.instance(start_date, tz="UTC"),
            DEFAULT_MONTHLY_FORMAT,
            relativedelta(months=1),
        )
Exemple #4
0
def test_partitions_for_hourly_schedule_decorators_without_timezone(
        partition_hours_offset: int):
    """Check an hourly schedule that is declared without a timezone.

    With the clock frozen at 2019-02-27T00:01:01 UTC (expressed in
    US/Eastern), the partition set is checked first; then the schedule is
    evaluated once without a scheduled time and once with an explicit one,
    and each evaluation must yield a single run request for the expected
    hour shifted back by ``partition_hours_offset`` hours.
    """
    frozen_now = to_timezone(
        create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern")

    with pendulum.test(frozen_now):
        context_without_time = build_schedule_context()

        start_date = datetime(year=2019, month=1, day=1)

        @hourly_schedule(
            pipeline_name="foo_pipeline",
            start_date=start_date,
            execution_time=time(hour=0, minute=25),
            partition_hours_offset=partition_hours_offset,
        )
        def hourly_foo_schedule(hourly_time):
            return {"hourly_time": hourly_time.isoformat()}

        _check_partitions(
            hourly_foo_schedule,
            HOURS_UNTIL_FEBRUARY_27 + 1 - partition_hours_offset,
            pendulum.instance(start_date, tz="UTC"),
            DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
            relativedelta(hours=1),
        )

        # Evaluation without a scheduled time.
        execution_data = hourly_foo_schedule.evaluate_tick(
            context_without_time)
        feb_27_midnight = create_pendulum_time(
            year=2019, month=2, day=27, tz="UTC")
        expected_without_time = {
            "hourly_time":
            feb_27_midnight.subtract(hours=partition_hours_offset).isoformat()
        }
        assert execution_data.run_requests
        assert len(execution_data.run_requests) == 1
        assert (execution_data.run_requests[0].run_config ==
                expected_without_time)

        # Evaluation with an explicit scheduled time.
        context_with_valid_time = build_schedule_context(
            scheduled_execution_time=create_pendulum_time(
                year=2019, month=1, day=27, hour=1, minute=25, tz="UTC"))

        execution_data = hourly_foo_schedule.evaluate_tick(
            context_with_valid_time)
        jan_27_one_am = create_pendulum_time(
            year=2019, month=1, day=27, hour=1, tz="UTC")
        expected_with_time = {
            "hourly_time":
            jan_27_one_am.subtract(hours=partition_hours_offset).isoformat()
        }
        assert execution_data.run_requests
        assert len(execution_data.run_requests) == 1
        assert (execution_data.run_requests[0].run_config ==
                expected_with_time)
Exemple #5
0
def test_run_record_timestamps():
    """Run records report the frozen clock as both start and end time."""
    # 2019-11-02T00:00:00 US/Central (UTC-5 on that date) in epoch seconds.
    expected_timestamp = 1572670800.0

    with get_instance() as instance:
        frozen_now = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(frozen_now):
            result = my_job.execute_in_process(instance=instance)
            run_filter = PipelineRunsFilter(run_ids=[result.run_id])
            records = instance.get_run_records(filters=run_filter)
            assert len(records) == 1
            run_record = records[0]
            assert run_record.start_time == expected_timestamp
            assert run_record.end_time == expected_timestamp
Exemple #6
0
def _get_existing_run_for_request(instance, external_schedule, schedule_time, run_request):
    """Look up a run already created for this schedule tick and request.

    Runs are matched on the schedule's tags plus the scheduled execution time
    (as a UTC ISO string) and, when the request has one, its run key.

    Returns:
        The first matching run returned by ``instance.get_runs``, or ``None``
        when there is no match.
    """
    schedule_tags = PipelineRun.tags_for_schedule(external_schedule)
    execution_time_tag = {
        SCHEDULED_EXECUTION_TIME_TAG: to_timezone(schedule_time, "UTC").isoformat(),
    }
    lookup_tags = merge_dicts(schedule_tags, execution_time_tag)

    if run_request.run_key:
        lookup_tags[RUN_KEY_TAG] = run_request.run_key

    matching_runs = instance.get_runs(RunsFilter(tags=lookup_tags))
    return matching_runs[0] if len(matching_runs) else None
Exemple #7
0
def _create_scheduler_run(
    instance,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    """Create a NOT_STARTED run for one run request of a schedule tick.

    Fetches the external execution plan for the request's run config, merges
    the pipeline tags with the request tags, stamps the scheduled execution
    time (as a UTC ISO string) and, if present, the run key onto the tags,
    then asks the instance to create the run.

    Returns:
        Whatever ``instance.create_run`` returns.
    """
    run_config = run_request.run_config
    request_tags = run_request.tags

    plan_snapshot = repo_location.get_external_execution_plan(
        external_pipeline,
        run_config,
        external_schedule.mode,
        step_keys_to_execute=None,
        known_state=None,
    ).execution_plan_snapshot

    base_tags = external_pipeline.tags or {}
    check_tags(base_tags, "pipeline_tags")

    run_tags = merge_dicts(base_tags, request_tags)
    run_tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(
        schedule_time, "UTC").isoformat()
    if run_request.run_key:
        run_tags[RUN_KEY_TAG] = run_request.run_key

    return instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=PipelineRunStatus.NOT_STARTED,
        root_run_id=None,
        parent_run_id=None,
        tags=run_tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
    )
Exemple #8
0
    def resolve_evaluationResult(self, graphene_info):
        """Evaluate the schedule's next tick and wrap the outcome.

        Returns ``None`` unless the job state is a running schedule whose
        repository location and repository are both available in the request
        context. Otherwise the next execution time after ``self._timestamp``
        is converted to the schedule's execution timezone ("UTC" when unset)
        and the schedule is evaluated for that time. Any exception raised by
        the evaluation is captured as serializable error info instead of
        propagating.
        """
        job_state = self._job_state
        if (job_state.status != InstigatorStatus.RUNNING
                or job_state.job_type != InstigatorType.SCHEDULE):
            return None

        repository_origin = job_state.origin.external_repository_origin
        location_name = repository_origin.repository_location_origin.location_name
        workspace_context = graphene_info.context

        if not workspace_context.has_repository_location(location_name):
            return None
        repository_location = workspace_context.get_repository_location(
            location_name)

        repository_name = repository_origin.repository_name
        if not repository_location.has_repository(repository_name):
            return None
        repository = repository_location.get_repository(repository_name)

        external_schedule = repository.get_external_schedule(job_state.name)
        timezone_str = external_schedule.execution_timezone or "UTC"

        tick_iterator = external_schedule.execution_time_iterator(
            self._timestamp)
        schedule_time = to_timezone(pendulum.instance(next(tick_iterator)),
                                    timezone_str)

        try:
            schedule_data = repository_location.get_external_schedule_execution_data(
                instance=workspace_context.instance,
                repository_handle=repository.handle,
                schedule_name=external_schedule.name,
                scheduled_execution_time=schedule_time,
            )
        except Exception:
            # Evaluation failures are reported as data, not raised.
            schedule_data = serializable_error_info_from_exc_info(
                sys.exc_info())

        return GrapheneTickEvaluation(schedule_data)
Exemple #9
0
    def test_run_record_timestamps(self, storage):
        """Run records report the frozen clock as both start and end time.

        Uses the instance already attached to ``storage`` when there is one;
        otherwise builds an ephemeral instance around ``storage`` inside a
        temporary directory.
        """
        assert storage

        self._skip_in_memory(storage)

        @op
        def a():
            pass

        @job
        def my_job():
            a()

        with tempfile.TemporaryDirectory() as scratch_dir:
            attached_instance = storage._instance  # pylint: disable=protected-access
            instance = attached_instance if attached_instance else DagsterInstance(
                instance_type=InstanceType.EPHEMERAL,
                local_artifact_storage=LocalArtifactStorage(scratch_dir),
                run_storage=storage,
                event_storage=InMemoryEventLogStorage(),
                compute_log_manager=NoOpComputeLogManager(),
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=SyncInMemoryRunLauncher(),
            )

            frozen_now = to_timezone(
                create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
            )

            with pendulum.test(frozen_now):
                result = my_job.execute_in_process(instance=instance)
                run_filter = PipelineRunsFilter(run_ids=[result.run_id])
                records = instance.get_run_records(filters=run_filter)
                assert len(records) == 1
                run_record = records[0]
                assert run_record.start_time == frozen_now.timestamp()
                assert run_record.end_time == frozen_now.timestamp()
Exemple #10
0
def _test_backfill_in_subprocess(instance_ref, debug_crash_flags):
    """Run one backfill daemon iteration against a frozen clock.

    The instance is rebuilt from ``instance_ref``; the clock is frozen at
    2021-02-17 converted to US/Central. The backfill iterator is fully
    drained, and the test instance is cleaned up even when the iteration
    raises.
    """
    frozen_now = to_timezone(
        create_pendulum_time(year=2021, month=2, day=17),
        "US/Central",
    )
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(frozen_now):
                with create_test_daemon_workspace() as workspace:
                    backfill_iteration = execute_backfill_iteration(
                        instance,
                        workspace,
                        get_default_daemon_logger("BackfillDaemon"),
                        debug_crash_flags=debug_crash_flags,
                    )
                    # Drain the iterator so every step of the iteration runs.
                    list(backfill_iteration)
        finally:
            cleanup_test_instance(instance)
Exemple #11
0
def test_differing_timezones(instance, workspace, external_repo):
    """Schedules in different timezones tick at their own local midnight.

    The clock starts at 2019-02-27 23:59:59 US/Eastern (expressed in
    US/Pacific) and is advanced in two steps. After each step the scheduler
    is run and the run and tick counts of both schedules are checked.
    """
    # Two schedules, one using US/Central, the other on US/Eastern
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 2, 27, 23, 59, 59, tz="US/Eastern"), "US/Pacific"
    )
    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("daily_central_time_schedule")
        external_eastern_schedule = external_repo.get_external_schedule(
            "daily_eastern_time_schedule"
        )

        schedule_origin = external_schedule.get_external_origin()
        eastern_origin = external_eastern_schedule.get_external_origin()

        instance.start_schedule(external_schedule)
        instance.start_schedule(external_eastern_schedule)

        # Nothing has run yet for either schedule.
        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

        ticks = instance.get_ticks(eastern_origin.get_id(), external_eastern_schedule.selector_id)
        assert len(ticks) == 0

        # One second before midnight Eastern: running the scheduler is
        # expected to produce no runs and no ticks.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

        ticks = instance.get_ticks(eastern_origin.get_id(), external_eastern_schedule.selector_id)
        assert len(ticks) == 0

    # Past midnight eastern time, the eastern timezone schedule will run, but not the central timezone
    freeze_datetime = freeze_datetime.add(minutes=1)
    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        assert instance.get_runs_count() == 1
        ticks = instance.get_ticks(eastern_origin.get_id(), external_eastern_schedule.selector_id)
        assert len(ticks) == 1

        # The Eastern tick is expected at midnight Eastern on 2019-02-28.
        expected_datetime = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Eastern"), "UTC"
        )

        validate_tick(
            ticks[0],
            external_eastern_schedule,
            expected_datetime,
            TickStatus.SUCCESS,
            [run.run_id for run in instance.get_runs()],
        )

        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

        wait_for_all_runs_to_start(instance)
        # NOTE(review): the fourth positional argument looks like the
        # partition time (it is passed as partition_time= in sibling tests) —
        # confirm against validate_run_started.
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_datetime,
            create_pendulum_time(2019, 2, 27, tz="US/Eastern"),
        )

    # Past midnight central time, the central timezone schedule will now run
    freeze_datetime = freeze_datetime.add(hours=1)
    with pendulum.test(freeze_datetime):

        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        # One run per schedule now; the Eastern schedule gained no new tick.
        assert instance.get_runs_count() == 2
        ticks = instance.get_ticks(eastern_origin.get_id(), external_eastern_schedule.selector_id)
        assert len(ticks) == 1

        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 1

        # The Central tick is expected at midnight Central on 2019-02-28.
        expected_datetime = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Central"), "UTC"
        )

        # NOTE(review): get_runs()[0] is treated as the run just created for
        # the Central schedule, i.e. runs are presumably returned newest
        # first — confirm.
        validate_tick(
            ticks[0],
            external_schedule,
            expected_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_datetime,
            create_pendulum_time(2019, 2, 27, tz="US/Central"),
        )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 2
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.SUCCESS

        ticks = instance.get_ticks(eastern_origin.get_id(), external_eastern_schedule.selector_id)
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.SUCCESS
Exemple #12
0
def test_execute_during_dst_transition_fall_back(instance, workspace, external_repo):
    """A daily schedule at a doubled local time runs once on fall-back day.

    The schedule is started at 2019-11-02 00:00 US/Central, the clock is
    advanced three days, and a single scheduler pass is expected to produce
    exactly three ticks and three runs.
    """
    # A schedule that runs daily during a time that occurs twice during a fall DST transition
    # only executes once for that day
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule(
            "daily_dst_transition_schedule_doubled_time"
        )
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(days=3)

    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3

        # Listed newest first. The UTC hour moves from 06:30 to 07:30 between
        # Nov 2 and Nov 3.
        # NOTE(review): presumably get_ticks/get_runs also return newest
        # first, since they are indexed in parallel below — confirm.
        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 4, 7, 30, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 7, 30, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 2, 6, 30, 0, tz="UTC"),
        ]

        # Each run is expected to target the previous day's partition.
        expected_partition_times = [
            create_pendulum_time(2019, 11, 3, tz="US/Central"),
            create_pendulum_time(2019, 11, 2, tz="US/Central"),
            create_pendulum_time(2019, 11, 1, tz="US/Central"),
        ]

        for i in range(3):
            validate_tick(
                ticks[i],
                external_schedule,
                expected_datetimes_utc[i],
                TickStatus.SUCCESS,
                [instance.get_runs()[i].run_id],
            )

            validate_run_started(
                instance,
                instance.get_runs()[i],
                expected_datetimes_utc[i],
                partition_time=expected_partition_times[i],
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3
Exemple #13
0
def test_execute_during_dst_transition_spring_forward(instance, workspace, external_repo):
    """A daily schedule at a skipped local time still ticks on spring-forward day.

    The schedule is started at 2019-03-09 00:00 US/Central, the clock is
    advanced three days, and a single scheduler pass is expected to produce
    exactly three ticks and three runs.
    """
    # Verify how a daily schedule behaves when its execution time is skipped
    # by the DST transition: the assertions below expect a tick for that day
    # too, at 03:00 US/Central instead of 02:30
    # Day before DST
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 3, 9, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule(
            "daily_dst_transition_schedule_skipped_time"
        )
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(days=3)

    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3

        # Listed newest first. The March 10 entry is at 03:00 rather than
        # 02:30 US/Central.
        # NOTE(review): presumably get_ticks/get_runs also return newest
        # first, since they are indexed in parallel below — confirm.
        expected_datetimes_utc = [
            to_timezone(create_pendulum_time(2019, 3, 11, 2, 30, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 10, 3, 00, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 9, 2, 30, 0, tz="US/Central"), "UTC"),
        ]

        # Each run is expected to target the previous day's partition.
        expected_partition_times = [
            create_pendulum_time(2019, 3, 10, tz="US/Central"),
            create_pendulum_time(2019, 3, 9, tz="US/Central"),
            create_pendulum_time(2019, 3, 8, tz="US/Central"),
        ]

        partition_set_def = the_repo.get_partition_set_def(
            "daily_dst_transition_schedule_skipped_time_partitions"
        )
        partition_names = partition_set_def.get_partition_names()

        # All three days around the transition must exist as partitions.
        assert "2019-03-08" in partition_names
        assert "2019-03-09" in partition_names
        assert "2019-03-10" in partition_names

        for i in range(3):
            validate_tick(
                ticks[i],
                external_schedule,
                expected_datetimes_utc[i],
                TickStatus.SUCCESS,
                [instance.get_runs()[i].run_id],
            )

            validate_run_started(
                instance,
                instance.get_runs()[i],
                expected_datetimes_utc[i],
                partition_time=expected_partition_times[i],
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3
Exemple #14
0
def test_daily_dst_fall_back(instance, workspace, external_repo):
    """A daily US/Central schedule runs once per day across fall-back.

    The schedule is started at 2019-11-03 00:00 US/Central, the clock is
    advanced two days, and a single scheduler pass is expected to produce
    exactly three ticks and three runs.
    """
    # Verify that a daily schedule still runs once per day during the fall DST transition
    # Night before DST
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 11, 3, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("daily_central_time_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(days=2)

    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3

        # UTC time changed by one hour after the transition, still running daily at the same
        # time in CT
        # Listed newest first.
        # NOTE(review): presumably get_ticks/get_runs also return newest
        # first, since they are indexed in parallel below — confirm.
        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 5, 6, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 4, 6, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 5, 0, 0, tz="UTC"),
        ]

        # Each run is expected to target the previous day's partition.
        expected_partition_times = [
            create_pendulum_time(2019, 11, 4, tz="US/Central"),
            create_pendulum_time(2019, 11, 3, tz="US/Central"),
            create_pendulum_time(2019, 11, 2, tz="US/Central"),
        ]

        for i in range(3):
            validate_tick(
                ticks[i],
                external_schedule,
                expected_datetimes_utc[i],
                TickStatus.SUCCESS,
                [instance.get_runs()[i].run_id],
            )

            validate_run_started(
                instance,
                instance.get_runs()[i],
                expected_datetimes_utc[i],
                partition_time=expected_partition_times[i],
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3
Exemple #15
0
def test_hourly_dst_fall_back(instance, workspace, external_repo):
    """An hourly US/Central schedule keeps ticking hourly across fall-back.

    The schedule is started at 2019-11-03 00:30 US/Central, the clock is
    advanced four hours, and a single scheduler pass is expected to produce
    exactly four ticks and four runs, including both occurrences of 1 AM.
    """
    # Verify that an hourly schedule still runs hourly during the fall DST transition
    # 12:30 AM CDT (the clocks have not fallen back yet)
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 11, 3, 0, 30, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("hourly_central_time_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(hours=4)

    # DST has now happened, 4 hours later it is 3:30AM CST
    # Should be 4 runs: 1AM CDT, 1AM CST, 2AM CST, 3AM CST
    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 4
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 4

        # Listed newest first: four consecutive UTC hours.
        # NOTE(review): presumably get_ticks/get_runs also return newest
        # first, since they are indexed in parallel below — confirm.
        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 3, 9, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 8, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 7, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 6, 0, 0, tz="UTC"),
        ]

        # The same instants in US/Central: 1 AM appears twice, with different
        # UTC offsets.
        expected_ct_times = [
            "2019-11-03T03:00:00-06:00",  # 3 AM CST
            "2019-11-03T02:00:00-06:00",  # 2 AM CST
            "2019-11-03T01:00:00-06:00",  # 1 AM CST
            "2019-11-03T01:00:00-05:00",  # 1 AM CDT
        ]

        for i in range(4):
            # Sanity-check the UTC/Central correspondence of the expectations.
            assert (
                to_timezone(expected_datetimes_utc[i], "US/Central").isoformat()
                == expected_ct_times[i]
            )

            validate_tick(
                ticks[i],
                external_schedule,
                expected_datetimes_utc[i],
                TickStatus.SUCCESS,
                [instance.get_runs()[i].run_id],
            )

            # Each run is expected to target the partition one hour earlier.
            validate_run_started(
                instance,
                instance.get_runs()[i],
                expected_datetimes_utc[i],
                partition_time=to_timezone(expected_datetimes_utc[i], "US/Central").subtract(
                    hours=1
                ),
                partition_fmt=DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 4
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 4
Exemple #16
0
def test_hourly_dst_spring_forward(instance, workspace, external_repo):
    """Hourly US/Central schedule must keep firing across the spring DST change.

    The clock is frozen at 1AM Central on 2019-03-10 (expressed in US/Pacific)
    while the schedule is started, then advanced by two hours. Three ticks and
    three runs are expected, for 1AM, 3AM and 4AM Central.
    """

    def _launch_at_frozen_now():
        # Exhaust the generator so every due tick is processed.
        list(launch_scheduled_runs(instance, workspace, logger(), pendulum.now("UTC")))

    # 1AM CST, just before the clocks move forward
    one_am_central = create_pendulum_time(2019, 3, 10, 1, 0, 0, tz="US/Central")
    freeze_datetime = to_timezone(one_am_central, "US/Pacific")

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("hourly_central_time_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        # Starting the schedule alone must not produce runs or ticks.
        assert instance.get_runs_count() == 0
        initial_ticks = instance.get_ticks(
            schedule_origin.get_id(), external_schedule.selector_id
        )
        assert len(initial_ticks) == 0

    # Two hours later the transition has happened, so the wall clock reads 4AM CDT.
    # Expected runs: 1AM CST, 3AM CDT, 4AM CDT.
    freeze_datetime = freeze_datetime.add(hours=2)

    with pendulum.test(freeze_datetime):
        _launch_at_frozen_now()

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3

        # Listed latest first; compared index-by-index against ticks and runs.
        expected_datetimes_utc = [
            to_timezone(create_pendulum_time(2019, 3, 10, hour, 0, 0, tz="US/Central"), "UTC")
            for hour in (4, 3, 1)
        ]

        for idx, expected_utc in enumerate(expected_datetimes_utc):
            validate_tick(
                ticks[idx],
                external_schedule,
                expected_utc,
                TickStatus.SUCCESS,
                [instance.get_runs()[idx].run_id],
            )

            validate_run_started(
                instance,
                instance.get_runs()[idx],
                expected_utc,
                partition_time=to_timezone(expected_utc, "US/Central").subtract(hours=1),
                partition_fmt=DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            )

        # A second pass at the same frozen time must not add runs or ticks.
        _launch_at_frozen_now()
        assert instance.get_runs_count() == 3
        ticks_after_rerun = instance.get_ticks(
            schedule_origin.get_id(), external_schedule.selector_id
        )
        assert len(ticks_after_rerun) == 3
Exemple #17
0
def schedule_execution_time_iterator(
        start_timestamp: float, cron_schedule: str,
        execution_timezone: Optional[str]) -> Iterator[datetime.datetime]:
    """Endlessly yield execution times for ``cron_schedule``.

    Args:
        start_timestamp: POSIX timestamp used to position the iteration.
        cron_schedule: Cron expression; it is handed to croniter and checked
            with ``is_valid_cron_string``.
        execution_timezone: Timezone name for the schedule; "UTC" is used
            when this is falsy.

    Yields:
        Execution times converted with ``to_timezone`` to the schedule timezone.

    Expressions with a plain monthly/weekly/daily/hourly shape are advanced
    with pendulum ``add`` calls; every other expression is driven by croniter.
    """
    timezone_str = execution_timezone or "UTC"

    start_datetime = pytz.utc.localize(
        datetime.datetime.utcfromtimestamp(start_timestamp)
    ).astimezone(pytz.timezone(timezone_str))

    date_iter = croniter(cron_schedule, start_datetime)

    # Step back once so that the first value produced below is the first
    # cron match that is >= start_datetime.
    next_date = date_iter.get_prev(datetime.datetime)

    check.invariant(is_valid_cron_string(cron_schedule))

    cron_parts, _ = croniter.expand(cron_schedule)

    # Classify each expanded cron field as a single fixed value or a wildcard.
    is_numeric = []
    is_wildcard = []
    for part in cron_parts:
        single_entry = len(part) == 1
        is_numeric.append(single_entry and part[0] != "*")
        is_wildcard.append(single_entry and part[0] == "*")

    # Common fixed intervals get a fast path, since stepping croniter can be
    # much slower than adding a fixed interval.
    if all(is_numeric[0:3]) and all(is_wildcard[3:]):
        interval_unit = "months"
    elif all(is_numeric[0:2]) and is_numeric[4] and all(is_wildcard[2:4]):
        interval_unit = "weeks"
    elif all(is_numeric[0:2]) and all(is_wildcard[2:]):
        interval_unit = "days"
    elif is_numeric[0] and all(is_wildcard[1:]):
        interval_unit = "hours"
    else:
        interval_unit = None

    if interval_unit is None:
        # No fixed interval applies: let croniter produce every value.
        while True:
            yield to_timezone(
                pendulum.instance(date_iter.get_next(datetime.datetime)),
                timezone_str)

    # Only an hourly schedule is expected to land on a new hour each step.
    hour_may_change = interval_unit == "hours"

    def _advance(moment, count):
        return moment.add(**{interval_unit: count})

    current = to_timezone(pendulum.instance(next_date), timezone_str)
    while True:
        candidate = _advance(current, 1)

        if candidate.hour != current.hour and not hour_may_change:
            # A daily/weekly/monthly step that lands on a different hour means
            # the target wall-clock time does not exist because of a DST
            # transition (for example, 2:30AM CST on 3/10/2019). Run once at
            # the start of the first hour that does exist, then jump two
            # intervals from the last regular time so later executions return
            # to the original hour.
            check.invariant(candidate.hour == current.hour + 1)
            yield candidate.replace(minute=0)

            candidate = _advance(current, 2)
            check.invariant(candidate.hour == current.hour)

        current = candidate
        yield current
Exemple #18
0
def test_non_utc_timezone_run(instance, workspace, external_repo):
    """A daily US/Central schedule must fire at Central midnight, not UTC midnight.

    The clock starts one second before midnight Central (expressed in
    US/Pacific), where nothing may launch, and is then moved two seconds
    forward, where exactly one run is expected.
    """

    def _launch_at_frozen_now():
        # Exhaust the generator so every due tick is processed.
        list(launch_scheduled_runs(instance, workspace, logger(), pendulum.now("UTC")))

    def _assert_counts(expected):
        # Runs and ticks are expected to move in lockstep in this test.
        assert instance.get_runs_count() == expected
        current_ticks = instance.get_ticks(
            schedule_origin.get_id(), external_schedule.selector_id
        )
        assert len(current_ticks) == expected
        return current_ticks

    just_before_midnight = create_pendulum_time(2019, 2, 27, 23, 59, 59, tz="US/Central")
    freeze_datetime = to_timezone(just_before_midnight, "US/Pacific")

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("daily_central_time_schedule")

        schedule_origin = external_schedule.get_external_origin()

        instance.start_schedule(external_schedule)

        _assert_counts(0)

        # Still before the scheduled time: the scheduler must do nothing.
        _launch_at_frozen_now()
        _assert_counts(0)

    freeze_datetime = freeze_datetime.add(seconds=2)
    with pendulum.test(freeze_datetime):
        _launch_at_frozen_now()

        ticks = _assert_counts(1)

        # Midnight Central on 2019-02-28, expressed in UTC.
        expected_datetime = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Central"), "UTC"
        )

        validate_tick(
            ticks[0],
            external_schedule,
            expected_datetime,
            TickStatus.SUCCESS,
            [run.run_id for run in instance.get_runs()],
        )

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_datetime,
            create_pendulum_time(2019, 2, 27, tz="US/Central"),
        )

        # A second pass at the same frozen time must not add runs or ticks.
        _launch_at_frozen_now()
        ticks = _assert_counts(1)
        assert ticks[0].status == TickStatus.SUCCESS
Exemple #19
0
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal,
                                             capfd):
    """Check scheduler recovery after a crash that happens before run creation.

    A first scheduler subprocess is started with debug crash flags and must
    exit non-zero, leaving one STARTED tick and no runs. A second subprocess,
    five minutes later and without crash flags, must exit cleanly, produce
    exactly one run and move that tick to SUCCESS. Captured stdout is compared
    verbatim after each subprocess.

    Args:
        external_repo_context: passed through to ``instance_with_schedules``.
        crash_location: key of the debug crash flag for the schedule
            (presumably supplied by test parametrization -- confirm).
        crash_signal: value stored under ``crash_location`` in the crash flags.
        capfd: capture fixture; ``readouterr()`` is used to read stdout.
    """
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        # Midnight UTC on 2019-02-27, expressed in US/Central.
        initial_datetime = to_timezone(
            create_pendulum_time(year=2019,
                                 month=2,
                                 day=27,
                                 hour=0,
                                 minute=0,
                                 second=0,
                                 tz="UTC"),
            "US/Central",
        )

        # NOTE(review): add() is called without any offset; presumably this only
        # produces an equal datetime to advance later -- confirm.
        frozen_datetime = initial_datetime.add()

        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Crash flags are keyed by schedule name, then by crash location.
            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # The subprocess is expected to have died at the crash location.
            assert scheduler_process.exitcode != 0

            # Normalize Windows line endings before comparing the log output.
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:00:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000
""")

            # The interrupted evaluation leaves a tick behind, but no run.
            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 0

        # Relaunch five minutes later, this time without crash flags.
        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=initial_datetime,
                partition_time=create_pendulum_time(2019, 2, 26),
            )

            # Still a single tick: the interrupted one is validated as SUCCESS.
            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                [instance.get_runs()[0].run_id],
            )
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:05:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Evaluating schedule `simple_schedule` at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))
Exemple #20
0
def test_failure_before_run_created(crash_location, crash_signal, capfd):
    """Check sensor recovery after a crash that happens before run creation.

    Three sensor subprocesses are launched in sequence under one frozen clock:
    at the frozen time (expects one SKIPPED tick), 31 seconds later with crash
    flags (expects a non-zero exit, a STARTED tick and no runs), and 62 seconds
    later without crash flags (expects a clean exit, one run, and a SUCCESS
    tick). The logger output of the last launch is compared verbatim.

    Args:
        crash_location: key of the debug crash flag for the sensor
            (presumably supplied by test parametrization -- confirm).
        crash_signal: value stored under ``crash_location`` in the crash flags.
        capfd: capture fixture; read to discard or inspect subprocess output.
    """
    # One second past midnight UTC on 2019-02-28, expressed in US/Central.
    frozen_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, hour=0, minute=0, second=1, tz="UTC"),
        "US/Central",
    )

    with instance_with_sensors() as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            # Register the sensor as running so it gets evaluated.
            instance.add_instigator_state(
                InstigatorState(
                    external_sensor.get_external_origin(),
                    InstigatorType.SENSOR,
                    InstigatorStatus.RUNNING,
                )
            )

            # create a tick
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == TickStatus.SKIPPED
            # Discard the output captured so far.
            capfd.readouterr()

            # create a starting tick, but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=31), debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            # Discard the crashed subprocess's output as well.
            capfd.readouterr()

            # NOTE(review): ticks[0] is treated as the tick from the crashed launch,
            # so get_ticks presumably returns newest first -- confirm.
            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == TickStatus.STARTED
            assert not int(ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=62), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            # Only the SensorDaemon logger's lines are compared, verbatim.
            assert (
                get_logger_output_from_capfd(capfd, "dagster.daemon.SensorDaemon")
                == f"""2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Launching run for simple_sensor
2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor"""
            )

            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == TickStatus.SUCCESS
Exemple #21
0
def test_partitions_for_hourly_schedule_decorators_with_timezone(
        partition_hours_offset: int):
    """Exercise ``@hourly_schedule`` partitions with an execution timezone.

    Two schedules are built under a clock frozen in US/Central: one with a
    timezone-naive start date and one whose start date is in US/Pacific. Both
    are passed to ``_check_partitions``; the first is also evaluated for a
    single tick and its run config compared.
    """
    frozen_now = create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="US/Central")

    with pendulum.test(frozen_now):
        # Everything except the start date is shared by the two schedules.
        shared_schedule_args = dict(
            pipeline_name="foo_pipeline",
            execution_time=time(hour=0, minute=25),
            execution_timezone="US/Central",
            partition_hours_offset=partition_hours_offset,
        )

        # A start date with no timezone is assumed to be in the execution
        # timezone.
        naive_start = datetime(year=2019, month=1, day=1)

        @hourly_schedule(start_date=naive_start, **shared_schedule_args)
        def hourly_central_schedule(hourly_time):
            return {"hourly_time": hourly_time.isoformat()}

        assert hourly_central_schedule.execution_timezone == "US/Central"

        _check_partitions(
            hourly_central_schedule,
            HOURS_UNTIL_FEBRUARY_27 + 1 - partition_hours_offset,
            pendulum.instance(naive_start, tz="US/Central"),
            DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            relativedelta(hours=1),
        )

        # Evaluate a single tick at 1:25AM Central on 2019-01-27.
        tick_time = create_pendulum_time(
            year=2019, month=1, day=27, hour=1, minute=25, tz="US/Central")
        tick_context = build_schedule_context(scheduled_execution_time=tick_time)

        tick_result = hourly_central_schedule.evaluate_tick(tick_context)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1

        # The partition is the top of the tick's hour, shifted back by the offset.
        expected_partition = create_pendulum_time(
            year=2019, month=1, day=27, hour=1,
            tz="US/Central").subtract(hours=partition_hours_offset)
        assert tick_result.run_requests[0].run_config == {
            "hourly_time": expected_partition.isoformat()
        }

        # A start date in another timezone is transformed into the execution
        # timezone.
        start_date_with_different_timezone = create_pendulum_time(
            2019, 1, 1, 0, tz="US/Pacific")

        @hourly_schedule(start_date=start_date_with_different_timezone,
                         **shared_schedule_args)
        def hourly_central_schedule_with_timezone_start_time(hourly_time):
            return {"hourly_time": hourly_time.isoformat()}

        # The start date is two hours later since it's in PT, hence the "- 2".
        expected_partition_count = (
            HOURS_UNTIL_FEBRUARY_27 - 2 + 1 - partition_hours_offset)

        _check_partitions(
            hourly_central_schedule_with_timezone_start_time,
            expected_partition_count,
            to_timezone(start_date_with_different_timezone, "US/Central"),
            DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            relativedelta(hours=1),
        )
Exemple #22
0
def schedule_execution_time_iterator(
        start_timestamp: float, cron_schedule: str,
        execution_timezone: Optional[str]) -> Iterator[datetime.datetime]:
    """Endlessly yield execution times for ``cron_schedule``.

    Args:
        start_timestamp: POSIX timestamp used to position the iteration.
        cron_schedule: Five-field, space-separated cron expression.
        execution_timezone: Timezone name for the schedule; "UTC" is used
            when this is falsy.

    Yields:
        Execution times converted with ``to_timezone`` to the schedule timezone.

    Expressions with a plain monthly/weekly/daily/hourly shape are advanced
    with pendulum ``add`` calls; every other expression is driven by croniter.
    """
    timezone_str = execution_timezone or "UTC"

    start_datetime = pendulum.from_timestamp(start_timestamp, tz=timezone_str)

    date_iter = croniter(cron_schedule, start_datetime)

    # Step back once so that the first value produced below is the first
    # cron match that is >= start_datetime.
    previous_match = date_iter.get_prev(datetime.datetime)
    current = to_timezone(pendulum.instance(previous_match), timezone_str)

    cron_parts = cron_schedule.split(" ")

    check.invariant(len(cron_parts) == 5)

    is_numeric = [part.isnumeric() for part in cron_parts]

    # Common fixed intervals get a fast path, since stepping croniter can be
    # much slower than adding a fixed interval.
    if cron_schedule.endswith(" * *") and all(is_numeric[0:3]):
        interval_unit = "months"
    elif (all(is_numeric[0:2]) and is_numeric[4] and cron_parts[2] == "*"
          and cron_parts[3] == "*"):
        interval_unit = "weeks"
    elif all(is_numeric[0:2]) and cron_schedule.endswith(" * * *"):
        interval_unit = "days"
    elif is_numeric[0] and cron_schedule.endswith(" * * * *"):
        interval_unit = "hours"
    else:
        interval_unit = None

    if interval_unit is None:
        # No fixed interval applies: let croniter produce every value.
        while True:
            yield to_timezone(
                pendulum.instance(date_iter.get_next(datetime.datetime)),
                timezone_str)

    # Only an hourly schedule is expected to land on a new hour each step.
    hour_may_change = interval_unit == "hours"

    def _advance(moment, count):
        return moment.add(**{interval_unit: count})

    while True:
        candidate = _advance(current, 1)

        if candidate.hour != current.hour and not hour_may_change:
            # A daily/weekly/monthly step that lands on a different hour means
            # the target wall-clock time does not exist because of a DST
            # transition (for example, 2:30AM CST on 3/10/2019). Run once at
            # the start of the first hour that does exist, then jump two
            # intervals from the last regular time so later executions return
            # to the original hour.
            check.invariant(candidate.hour == current.hour + 1)
            yield candidate.replace(minute=0)

            candidate = _advance(current, 2)
            check.invariant(candidate.hour == current.hour)

        current = candidate
        yield current
Exemple #23
0
def test_failure_recovery_before_run_created(instance, external_repo,
                                             crash_location, crash_signal):
    """Check scheduler recovery after a crash that happens before run creation.

    A first scheduler subprocess runs with debug crash flags and must exit
    non-zero, leaving one STARTED tick and no runs. A second one, five minutes
    later and without crash flags, must exit cleanly, produce exactly one run
    and leave that single tick validated as SUCCESS.
    """

    def _run_scheduler_subprocess(instance_ref, at_time, crash_flags):
        # Run one scheduler iteration in a child process and wait for it.
        proc = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance_ref, at_time, crash_flags],
        )
        proc.start()
        proc.join(timeout=60)
        return proc

    def _schedule_ticks():
        return instance.get_ticks(external_schedule.get_external_origin_id(),
                                  external_schedule.selector_id)

    # Midnight UTC on 2019-02-27, expressed in US/Central.
    midnight_utc = create_pendulum_time(
        year=2019, month=2, day=27, hour=0, minute=0, second=0, tz="UTC")
    initial_datetime = to_timezone(midnight_utc, "US/Central")

    frozen_datetime = initial_datetime.add()

    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        # Crash flags are keyed by schedule name, then by crash location.
        debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

        crashed = _run_scheduler_subprocess(
            instance.get_ref(), frozen_datetime, debug_crash_flags)
        assert crashed.exitcode != 0

        # The interrupted evaluation leaves a tick behind, but no run.
        ticks = _schedule_ticks()
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.STARTED

        assert instance.get_runs_count() == 0

    # Relaunch five minutes later, this time without crash flags.
    frozen_datetime = frozen_datetime.add(minutes=5)
    with pendulum.test(frozen_datetime):
        recovered = _run_scheduler_subprocess(
            instance.get_ref(), frozen_datetime, None)
        assert recovered.exitcode == 0

        assert instance.get_runs_count() == 1
        wait_for_all_runs_to_start(instance)
        validate_run_exists(
            instance.get_runs()[0],
            execution_time=initial_datetime,
            partition_time=create_pendulum_time(2019, 2, 26),
        )

        # Still a single tick, now expected to validate as SUCCESS.
        ticks = _schedule_ticks()
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )
Exemple #24
0
def test_different_days_in_different_timezones(instance, workspace, external_repo):
    """A schedule that runs daily at 11PM Central must fire at that Central time.

    The clock starts one second before 11PM Central (expressed in US/Pacific),
    where nothing may launch, and is then moved two seconds forward, where
    exactly one run is expected.
    """

    def _launch_at_frozen_now():
        # Exhaust the generator so every due tick is processed.
        list(launch_scheduled_runs(instance, workspace, logger(), pendulum.now("UTC")))

    def _assert_counts(expected):
        # Runs and ticks are expected to move in lockstep in this test.
        assert instance.get_runs_count() == expected
        current_ticks = instance.get_ticks(
            schedule_origin.get_id(), external_schedule.selector_id
        )
        assert len(current_ticks) == expected
        return current_ticks

    just_before_eleven = create_pendulum_time(2019, 2, 27, 22, 59, 59, tz="US/Central")
    freeze_datetime = to_timezone(just_before_eleven, "US/Pacific")

    with pendulum.test(freeze_datetime):
        # Runs every day at 11PM (CST)
        external_schedule = external_repo.get_external_schedule("daily_late_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        _assert_counts(0)

        # Still before the scheduled time: the scheduler must do nothing.
        _launch_at_frozen_now()
        _assert_counts(0)

    freeze_datetime = freeze_datetime.add(seconds=2)
    with pendulum.test(freeze_datetime):
        _launch_at_frozen_now()

        ticks = _assert_counts(1)

        # 11PM Central on 2019-02-27, expressed in UTC.
        expected_datetime = to_timezone(
            create_pendulum_time(year=2019, month=2, day=27, hour=23, tz="US/Central"), "UTC"
        )

        validate_tick(
            ticks[0],
            external_schedule,
            expected_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_datetime,
            create_pendulum_time(2019, 2, 26, tz="US/Central"),
        )

        # A second pass at the same frozen time must not add runs or ticks.
        _launch_at_frozen_now()
        ticks = _assert_counts(1)
        assert ticks[0].status == TickStatus.SUCCESS
Exemple #25
0
def test_failure_after_run_launched(crash_location, crash_signal, capfd):
    """Check that a sensor run is not duplicated after a post-launch crash.

    A first sensor subprocess runs with debug crash flags and must exit
    non-zero, leaving one STARTED tick and one run tagged with the
    "only_once" run key. A second subprocess, one second later, must exit
    cleanly, keep the run count at one, log that the run key was skipped and
    add a SKIPPED tick.
    """

    def _run_sensor_subprocess(instance_ref, at_time, crash_flags):
        # Run one sensor iteration in a child process and wait for it.
        proc = spawn_ctx.Process(
            target=_test_launch_sensor_runs_in_subprocess,
            args=[instance_ref, at_time, crash_flags],
        )
        proc.start()
        proc.join(timeout=60)
        return proc

    # Midnight UTC on 2019-02-28, expressed in US/Central.
    midnight_utc = create_pendulum_time(
        year=2019, month=2, day=28, hour=0, minute=0, second=0, tz="UTC"
    )
    frozen_datetime = to_timezone(midnight_utc, "US/Central")

    with instance_with_sensors() as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("run_key_sensor")
            # Register the sensor as running so it gets evaluated.
            sensor_state = InstigatorState(
                external_sensor.get_external_origin(),
                InstigatorType.SENSOR,
                InstigatorStatus.RUNNING,
            )
            instance.add_instigator_state(sensor_state)

            # First pass: the run is created and launched, then the process crashes.
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            crashed = _run_sensor_subprocess(
                instance.get_ref(), frozen_datetime, debug_crash_flags
            )
            assert crashed.exitcode != 0

            ticks = instance.get_ticks(external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == TickStatus.STARTED
            assert instance.get_runs_count() == 1

            launched_run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert launched_run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert launched_run.tags.get(RUN_KEY_TAG) == "only_once"
            # Discard the output captured so far.
            capfd.readouterr()

            # Second pass, one second later and without crash flags.
            recovered = _run_sensor_subprocess(
                instance.get_ref(), frozen_datetime.add(seconds=1), None
            )
            assert recovered.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            remaining_run = instance.get_runs()[0]
            captured = capfd.readouterr()

            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out
            )

            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == TickStatus.SKIPPED
Exemple #26
0
def test_failure_after_run_created_before_run_launched(external_repo_context,
                                                       crash_location,
                                                       crash_signal, capfd):
    """Check sensor recovery when a run was created but not yet launched.

    A first sensor subprocess runs with debug crash flags and must exit
    non-zero, leaving one STARTED tick and one NOT_STARTED run tagged with the
    "only_once" run key. A second subprocess, one second later, must exit
    cleanly, keep the run count at one, log that the run already exists and
    add a SUCCESS tick.
    """

    def _run_sensor_subprocess(instance_ref, at_time, crash_flags):
        # Run one sensor iteration in a child process and wait for it.
        proc = multiprocessing.Process(
            target=_test_launch_sensor_runs_in_subprocess,
            args=[instance_ref, at_time, crash_flags],
        )
        proc.start()
        proc.join(timeout=60)
        return proc

    # Midnight UTC on 2019-02-28, expressed in US/Central.
    midnight_utc = create_pendulum_time(
        year=2019, month=2, day=28, hour=0, minute=0, second=0, tz="UTC")
    frozen_datetime = to_timezone(midnight_utc, "US/Central")

    with instance_with_sensors(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "run_key_sensor")
            # Register the sensor as running so it gets evaluated.
            sensor_state = JobState(external_sensor.get_external_origin(),
                                    JobType.SENSOR, JobStatus.RUNNING)
            instance.add_job_state(sensor_state)

            # First pass: a tick is started, then the process crashes.
            debug_crash_flags = {
                external_sensor.name: {crash_location: crash_signal}
            }
            crashed = _run_sensor_subprocess(
                instance.get_ref(), frozen_datetime, debug_crash_flags)
            assert crashed.exitcode != 0

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 1

            created_run = instance.get_runs()[0]
            # The run exists in storage but was never handed to the launcher.
            assert created_run.status == PipelineRunStatus.NOT_STARTED
            assert created_run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert created_run.tags.get(RUN_KEY_TAG) == "only_once"

            # Drop captured output so only the second pass is inspected below.
            capfd.readouterr()

            # Second pass, one second later and without crash flags.
            recovered = _run_sensor_subprocess(
                instance.get_ref(), frozen_datetime.add(seconds=1), None)
            assert recovered.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()

            expected_message = (
                f"Run {run.run_id} already created with the run key `only_once` for run_key_sensor"
            )
            assert expected_message in captured.out

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SUCCESS
def test_non_utc_timezone_run(external_repo_context, capfd):
    """Verify that a schedule in a non-UTC timezone runs at the expected time.

    Time is first frozen at 23:59:59 US/Central (expressed in US/Pacific),
    where the scheduler is expected to launch nothing, and then two seconds
    later, where exactly one run is expected for the midnight US/Central tick.
    A final scheduler iteration at the same frozen time checks that no
    additional run or tick gets created.
    """
    with instance_with_schedules(external_repo_context) as (instance, workspace, external_repo):

        def run_scheduler_iteration():
            # Exhaust the iterable returned by launch_scheduled_runs so that the
            # whole scheduler iteration executes at the currently frozen time.
            for _ in launch_scheduled_runs(instance, workspace, logger(), pendulum.now("UTC")):
                pass

        frozen_time = to_timezone(
            create_pendulum_time(2019, 2, 27, 23, 59, 59, tz="US/Central"), "US/Pacific"
        )
        with pendulum.test(frozen_time):
            external_schedule = external_repo.get_external_schedule("daily_central_time_schedule")
            schedule_origin = external_schedule.get_external_origin()

            def fetch_ticks():
                # Ticks recorded so far for the schedule under test.
                return instance.get_job_ticks(schedule_origin.get_id())

            instance.start_schedule_and_update_storage_state(external_schedule)

            # Starting the schedule alone creates neither runs nor ticks.
            assert instance.get_runs_count() == 0
            assert len(fetch_ticks()) == 0

            run_scheduler_iteration()

            # One second before midnight US/Central: still nothing to launch.
            assert instance.get_runs_count() == 0
            assert len(fetch_ticks()) == 0

            output_before_midnight = capfd.readouterr().out
            expected_before_midnight = """2019-02-27 21:59:59 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: daily_central_time_schedule
2019-02-27 21:59:59 - SchedulerDaemon - INFO - No new runs for daily_central_time_schedule
"""
            assert output_before_midnight == expected_before_midnight

        # Move the frozen clock just past midnight US/Central.
        frozen_time = frozen_time.add(seconds=2)
        with pendulum.test(frozen_time):
            run_scheduler_iteration()

            assert instance.get_runs_count() == 1
            recorded_ticks = fetch_ticks()
            assert len(recorded_ticks) == 1

            # The tick is expected at midnight of Feb 28 US/Central, compared in UTC.
            expected_datetime = to_timezone(
                create_pendulum_time(year=2019, month=2, day=28, tz="US/Central"), "UTC"
            )
            launched_run_ids = [run.run_id for run in instance.get_runs()]

            validate_tick(
                recorded_ticks[0],
                external_schedule,
                expected_datetime,
                JobTickStatus.SUCCESS,
                launched_run_ids,
            )

            wait_for_all_runs_to_start(instance)
            first_run = instance.get_runs()[0]
            validate_run_started(
                first_run,
                expected_datetime,
                create_pendulum_time(2019, 2, 27, tz="US/Central"),
            )

            output_after_midnight = capfd.readouterr().out
            expected_after_midnight = """2019-02-27 22:00:01 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: daily_central_time_schedule
2019-02-27 22:00:01 - SchedulerDaemon - INFO - Evaluating schedule `daily_central_time_schedule` at 2019-02-28 00:00:00-0600
2019-02-27 22:00:01 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for daily_central_time_schedule
""".format(run_id=instance.get_runs()[0].run_id)
            assert output_after_midnight == expected_after_midnight

            # Verify idempotence: another iteration at the same time adds nothing.
            run_scheduler_iteration()

            assert instance.get_runs_count() == 1
            recorded_ticks = fetch_ticks()
            assert len(recorded_ticks) == 1
            assert recorded_ticks[0].status == JobTickStatus.SUCCESS
Exemple #28
0
def _create_scheduler_run(
    instance,
    logger,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    """Create the run record for one scheduled execution.

    The execution plan is requested from ``repo_location``. Any exception
    raised while doing so is captured as serializable error info rather than
    propagated, and the run is then created with a FAILURE status, reported as
    failed on the instance, and logged through ``logger``. Otherwise the run
    is created with a NOT_STARTED status.

    The run's tags are the pipeline tags merged with the run request's tags,
    plus the scheduled execution time (converted to UTC, ISO formatted) and,
    when the request carries one, the run key.

    Returns:
        A ``(run, errors)`` tuple: the run returned by ``instance.create_run``
        and the list of captured execution plan errors (empty on success).
    """
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    execution_plan_errors = []
    plan_snapshot = None
    try:
        plan_snapshot = repo_location.get_external_execution_plan(
            external_pipeline,
            run_config,
            external_schedule.mode,
            step_keys_to_execute=None,
            known_state=None,
        ).execution_plan_snapshot
    except Exception:  # pylint: disable=broad-except
        failure_info = serializable_error_info_from_exc_info(sys.exc_info())
        execution_plan_errors.append(failure_info)

    base_tags = external_pipeline.tags or {}
    check_tags(base_tags, "pipeline_tags")

    run_tags = merge_dicts(base_tags, schedule_tags)
    run_tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(schedule_time, "UTC").isoformat()
    run_key = run_request.run_key
    if run_key:
        run_tags[RUN_KEY_TAG] = run_key

    # If the run was scheduled correctly but there was an error creating its
    # run config, enter it into the run DB with a FAILURE status
    if execution_plan_errors:
        initial_status = PipelineRunStatus.FAILURE
    else:
        initial_status = PipelineRunStatus.NOT_STARTED

    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=initial_status,
        root_run_id=None,
        parent_run_id=None,
        tags=run_tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
    )

    if not execution_plan_errors:
        return (possibly_invalid_pipeline_run, execution_plan_errors)

    # Surface each captured error as an engine event on the run before
    # marking the run itself as failed.
    for plan_error in execution_plan_errors:
        instance.report_engine_event(
            plan_error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(plan_error),
        )
    instance.report_run_failed(possibly_invalid_pipeline_run)

    error_string = "\n".join(plan_error.to_string() for plan_error in execution_plan_errors)
    logger.error(f"Failed to fetch execution plan for {external_schedule.name}: {error_string}")
    return (possibly_invalid_pipeline_run, execution_plan_errors)