Exemple #1
0
def test_one_task_dag():
    """A DAG with a single DummyOperator converts to a pipeline that executes successfully."""
    airflow_dag = DAG(dag_id="dag", default_args=default_args, schedule_interval=None)
    # The operator is only needed for its dag= association; the local is otherwise unused.
    dummy_task = DummyOperator(task_id="dummy_operator", dag=airflow_dag)

    execution_tags = {
        AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),
    }
    pipeline = make_dagster_pipeline_from_airflow_dag(dag=airflow_dag, tags=execution_tags)

    outcome = execute_pipeline(pipeline)
    assert outcome.success
def test_skip(external_repo_context, capfd):
    """A schedule reported as skipped records one SKIPPED tick and launches no run.

    Time is frozen at 2019-02-27 00:00 UTC inside the central_timezone() context, and
    the scheduler's stdout is compared against the exact expected log text.
    """
    with central_timezone(), instance_with_schedules(
        external_repo_context
    ) as (instance, external_repo):
        skip_schedule = external_repo.get_external_schedule("skip_schedule")
        origin = skip_schedule.get_origin()
        frozen_now = datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0, tzinfo=get_utc_timezone(),
        )
        with freeze_time(frozen_now):
            instance.start_schedule_and_update_storage_state(skip_schedule)

            launch_scheduled_runs(
                instance, get_default_scheduler_logger(), get_current_datetime_in_utc()
            )

            # No run is created, but the evaluation is still recorded as a tick.
            assert instance.get_runs_count() == 0
            recorded_ticks = instance.get_schedule_ticks(origin.get_id())
            assert len(recorded_ticks) == 1
            validate_tick(
                recorded_ticks[0],
                skip_schedule,
                frozen_now,
                ScheduleTickStatus.SKIPPED,
                None,
            )

            expected_log = """2019-02-26 18:00:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: skip_schedule
2019-02-26 18:00:00 - dagster-scheduler - INFO - Launching run for skip_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:00:00 - dagster-scheduler - INFO - should_execute returned False for skip_schedule, skipping
"""
            captured = capfd.readouterr()
            assert captured.out == expected_log
def test_bad_env_fn(external_repo_context, capfd):
    """A schedule whose run_config_fn errors records one FAILURE tick and launches no run.

    The same error message is expected both on the recorded tick and in the
    scheduler's captured stdout.
    """
    expected_error = (
        "Error occurred during the execution of run_config_fn for "
        "schedule bad_env_fn_schedule"
    )

    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        bad_schedule = external_repo.get_external_schedule("bad_env_fn_schedule")
        origin = bad_schedule.get_origin()
        frozen_now = datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0, tzinfo=get_utc_timezone(),
        )
        with freeze_time(frozen_now):
            instance.start_schedule_and_update_storage_state(bad_schedule)

            launch_scheduled_runs(
                instance, get_default_scheduler_logger(), get_current_datetime_in_utc()
            )

            assert instance.get_runs_count() == 0
            recorded_ticks = instance.get_schedule_ticks(origin.get_id())
            assert len(recorded_ticks) == 1

            validate_tick(
                recorded_ticks[0],
                bad_schedule,
                frozen_now,
                ScheduleTickStatus.FAILURE,
                None,
                expected_error,
            )

            logged = capfd.readouterr().out
            assert "Failed to fetch schedule data for bad_env_fn_schedule: " in logged
            assert expected_error in logged
Exemple #4
0
def test_include_execution_time_grpc():
    """Fetch schedule execution data over ephemeral gRPC and check the echoed execution time.

    The "foo_schedule_echo_time" schedule is expected to return the supplied
    execution time, ISO-formatted, under the "passed_in_time" run config key.
    """
    repository_handle = get_bar_repo_handle()

    execution_time = get_current_datetime_in_utc()
    with instance_for_test() as instance:
        execution_data = sync_get_external_schedule_execution_data_ephemeral_grpc(
            instance,
            repository_handle,
            "foo_schedule_echo_time",
            ScheduleExecutionDataMode.LAUNCH_SCHEDULED_EXECUTION,
            execution_time,
        )

        assert isinstance(execution_data, ExternalScheduleExecutionData)
        assert execution_data.run_config == {
            "passed_in_time": execution_time.isoformat(),
        }
        assert execution_data.tags == {
            "dagster/schedule_name": "foo_schedule_echo_time",
        }
        # Identity check: `== True` would also accept truthy stand-ins such as 1.
        assert execution_data.should_execute is True
Exemple #5
0
    def start_schedule_and_update_storage_state(self, instance,
                                                external_schedule):
        """
        Starts the given schedule and records it as `JobStatus.RUNNING` in schedule storage.

        A new schedule state is created when the instance has none stored for the
        schedule's external origin id. `start_schedule` is called first; the updated
        state is written to the instance afterwards.

        This should not be overridden by subclasses.

        Args:
            instance (DagsterInstance): The current instance.
            external_schedule (ExternalSchedule): The schedule to start

        Returns:
            The schedule state written to the instance, with status
            `JobStatus.RUNNING` and freshly built `ScheduleJobData`.

        Raises:
            DagsterSchedulerError: If the schedule's state is already `JobStatus.RUNNING`.
        """
        check.inst_param(instance, "instance", DagsterInstance)
        check.inst_param(external_schedule, "external_schedule", ExternalSchedule)

        origin_id = external_schedule.get_external_origin_id()
        current_state = instance.get_job_state(origin_id)
        if not current_state:
            current_state = self._create_new_schedule_state(instance, external_schedule)

        if current_state.status == JobStatus.RUNNING:
            message = "You have attempted to start schedule {name}, but it is already running"
            raise DagsterSchedulerError(message.format(name=external_schedule.name))

        self.start_schedule(instance, external_schedule)

        running_state = current_state.with_status(JobStatus.RUNNING)
        job_data = ScheduleJobData(
            external_schedule.cron_schedule,
            get_current_datetime_in_utc().timestamp(),
            scheduler=self.__class__.__name__,
        )
        started_schedule = running_state.with_data(job_data)

        instance.update_job_state(started_schedule)
        return started_schedule
    def test_update_schedule(self, storage):
        """Update a stored schedule to RUNNING and back to STOPPED, checking what is persisted.

        After each update the storage is re-queried and must hold exactly one
        schedule instigator state reflecting the latest status and data.
        """
        assert storage

        schedule = self.build_schedule("my_schedule", "* * * * *")
        storage.add_instigator_state(schedule)

        now_time = get_current_datetime_in_utc().timestamp()

        new_schedule = schedule.with_status(InstigatorStatus.RUNNING).with_data(
            ScheduleInstigatorData(
                cron_schedule=schedule.instigator_data.cron_schedule,
                start_timestamp=now_time,
            )
        )
        storage.update_instigator_state(new_schedule)

        schedules = storage.all_instigator_state(
            self.fake_repo_target().get_id(), InstigatorType.SCHEDULE
        )
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.instigator_name == "my_schedule"
        assert schedule.status == InstigatorStatus.RUNNING
        assert schedule.instigator_data.start_timestamp == now_time

        # The stopped state is rebuilt from the cron schedule alone, without a start timestamp.
        stopped_schedule = schedule.with_status(InstigatorStatus.STOPPED).with_data(
            ScheduleInstigatorData(schedule.instigator_data.cron_schedule)
        )
        storage.update_instigator_state(stopped_schedule)

        schedules = storage.all_instigator_state(
            self.fake_repo_target().get_id(), InstigatorType.SCHEDULE
        )
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.instigator_name == "my_schedule"
        assert schedule.status == InstigatorStatus.STOPPED
        # Compare to the None singleton by identity rather than with ==.
        assert schedule.instigator_data.start_timestamp is None
Exemple #7
0
    def test_update_schedule(self, storage):
        """Update a stored schedule to RUNNING and back to STOPPED, checking what is persisted.

        After each update the storage is re-queried and must hold exactly one
        schedule job state reflecting the latest status, data and scheduler name.
        """
        assert storage

        schedule = self.build_schedule("my_schedule", "* * * * *")
        storage.add_job_state(schedule)

        now_time = get_current_datetime_in_utc().timestamp()

        new_schedule = schedule.with_status(JobStatus.RUNNING).with_data(
            ScheduleJobData(
                cron_schedule=schedule.job_specific_data.cron_schedule,
                start_timestamp=now_time,
                scheduler=FAKE_SCHEDULER_NAME,
            )
        )
        storage.update_job_state(new_schedule)

        schedules = storage.all_stored_job_state(
            self.fake_repo_target().get_id(), JobType.SCHEDULE
        )
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.job_name == "my_schedule"
        assert schedule.status == JobStatus.RUNNING
        assert schedule.job_specific_data.start_timestamp == now_time
        assert schedule.job_specific_data.scheduler == FAKE_SCHEDULER_NAME

        # The stopped state is rebuilt without a start timestamp.
        stopped_schedule = schedule.with_status(JobStatus.STOPPED).with_data(
            ScheduleJobData(
                schedule.job_specific_data.cron_schedule,
                scheduler=FAKE_SCHEDULER_NAME,
            )
        )
        storage.update_job_state(stopped_schedule)

        schedules = storage.all_stored_job_state(
            self.fake_repo_target().get_id(), JobType.SCHEDULE
        )
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.job_name == "my_schedule"
        assert schedule.status == JobStatus.STOPPED
        # Compare to the None singleton by identity rather than with ==.
        assert schedule.job_specific_data.start_timestamp is None
        assert schedule.job_specific_data.scheduler == FAKE_SCHEDULER_NAME
Exemple #8
0
def test_re_init(restore_cron_tab):  # pylint:disable=unused-argument,redefined-outer-name
    """Start a schedule and check that its state is persisted under the instance directory.

    The returned state's start timestamp must match the time captured just before
    starting, a "schedules" entry must exist in the temp directory, and the stored
    state with the same name must equal the returned state.
    """
    schedule_name = "no_config_pipeline_every_min_schedule"

    with TemporaryDirectory() as tempdir:
        instance = define_scheduler_instance(tempdir)
        with get_test_external_repo() as external_repo:
            started_at = get_current_datetime_in_utc()

            # Start schedule
            external_schedule = external_repo.get_external_schedule(schedule_name)
            started_state = instance.start_schedule_and_update_storage_state(external_schedule)

            expected_timestamp = get_timestamp_from_utc_datetime(started_at)
            assert started_state.start_timestamp == expected_timestamp

            # Check schedules are saved to disk
            assert "schedules" in os.listdir(tempdir)

            for stored_state in instance.all_stored_schedule_state():
                if stored_state.name == schedule_name:
                    assert stored_state == started_state
Exemple #9
0
def get_sensor_next_tick(graphene_info, sensor_state):
    """Return the projected next tick for a running sensor, or None if there is none.

    None is returned when the sensor's repository location, repository or sensor is
    not available in the request context, when the sensor is not running, when it
    has no recorded tick, or when the projected time (latest tick timestamp plus the
    sensor's min_interval_seconds) is earlier than the current time.
    """
    from ..schema.instigation import GrapheneFutureInstigationTick

    check.inst_param(graphene_info, "graphene_info", ResolveInfo)
    check.inst_param(sensor_state, "sensor_state", InstigatorState)

    context = graphene_info.context
    repository_origin = sensor_state.origin.external_repository_origin

    location_name = repository_origin.repository_location_origin.location_name
    if not context.has_repository_location(location_name):
        return None
    location = context.get_repository_location(location_name)

    repository_name = repository_origin.repository_name
    if not location.has_repository(repository_name):
        return None
    repository = location.get_repository(repository_name)

    if not repository.has_external_sensor(sensor_state.name):
        return None
    external_sensor = repository.get_external_sensor(sensor_state.name)

    if not sensor_state.is_running:
        return None

    # Only the single most recent tick is requested (limit=1).
    recent_ticks = context.instance.get_ticks(sensor_state.instigator_origin_id, limit=1)
    if not recent_ticks:
        return None

    next_timestamp = recent_ticks[0].timestamp + external_sensor.min_interval_seconds
    now_timestamp = get_timestamp_from_utc_datetime(get_current_datetime_in_utc())
    if next_timestamp < now_timestamp:
        return None

    return GrapheneFutureInstigationTick(sensor_state, next_timestamp)
def test_long_name():
    """Very long DAG and task ids are normalized into the expected pipeline and solid names."""
    dag_name = "dag-with.dot-dash-lo00ong" * 10
    long_name = "task-with.dot-dash2-loong" * 10  # 250 characters, Airflow's max allowed length

    expected_pipeline_name = "airflow_dag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ong"
    expected_solid_name = "airflow_task_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loong"

    airflow_dag = DAG(dag_id=dag_name, default_args=default_args, schedule_interval=None)
    dummy_task = DummyOperator(task_id=long_name, dag=airflow_dag)

    execution_tags = {
        AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),
    }
    pipeline = make_dagster_pipeline_from_airflow_dag(dag=airflow_dag, tags=execution_tags)
    result = execute_pipeline(pipeline)

    assert result.success
    assert result.pipeline_def.name == expected_pipeline_name

    solids = result.pipeline_def.solids
    assert len(solids) == 1
    assert solids[0].name == expected_solid_name
Exemple #11
0
def get_schedule_next_tick(graphene_info, schedule_state):
    """Return a "FutureJobTick" for the next execution time of a running schedule.

    None is returned when the schedule is not running, or when its repository
    location or repository is not available in the request context. The next
    execution time is the first value produced by the schedule's execution time
    iterator, started from the current timestamp.
    """
    if schedule_state.status != JobStatus.RUNNING:
        return None

    context = graphene_info.context
    repository_origin = schedule_state.origin.external_repository_origin

    location_name = repository_origin.repository_location_origin.location_name
    if not context.has_repository_location(location_name):
        return None
    location = context.get_repository_location(location_name)

    repository_name = repository_origin.repository_name
    if not location.has_repository(repository_name):
        return None
    repository = location.get_repository(repository_name)

    external_schedule = repository.get_external_job(schedule_state.name)
    now_timestamp = get_timestamp_from_utc_datetime(get_current_datetime_in_utc())
    upcoming_times = external_schedule.execution_time_iterator(now_timestamp)
    next_timestamp = next(upcoming_times).timestamp()

    future_tick_type = graphene_info.schema.type_named("FutureJobTick")
    return future_tick_type(schedule_state, next_timestamp)
Exemple #12
0
def test_normalize_name():
    """DAG and task ids containing dots and dashes map to underscore-separated, airflow_-prefixed names."""
    airflow_dag = DAG(
        dag_id="dag-with.dot-dash", default_args=default_args, schedule_interval=None
    )
    dummy_task = DummyOperator(task_id="task-with.dot-dash", dag=airflow_dag)

    execution_tags = {
        AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat(),
    }
    pipeline = make_dagster_pipeline_from_airflow_dag(dag=airflow_dag, tags=execution_tags)
    result = execute_pipeline(pipeline)

    assert result.success
    assert result.pipeline_def.name == "airflow_dag_with_dot_dash"

    solids = result.pipeline_def.solids
    assert len(solids) == 1
    assert solids[0].name == "airflow_task_with_dot_dash"
Exemple #13
0
def test_reconcile_schedule_without_start_time():
    """Reconciling a RUNNING schedule state stored with a None start time gives it a start timestamp.

    After reconciliation the state must still be RUNNING and its start timestamp
    must equal the current time's timestamp.
    """
    with TemporaryDirectory() as tempdir:
        instance = define_scheduler_instance(tempdir)
        external_repo = get_test_external_repo()
        daily_schedule = external_repo.get_external_schedule("no_config_pipeline_daily_schedule")

        # The trailing None is presumably the start timestamp slot (per the test name).
        legacy_state = ScheduleState(
            daily_schedule.get_origin(),
            ScheduleStatus.RUNNING,
            daily_schedule.cron_schedule,
            None,
        )
        instance.add_schedule_state(legacy_state)

        instance.reconcile_scheduler_state(external_repository=external_repo)

        reconciled = instance.get_schedule_state(daily_schedule.get_origin_id())
        assert reconciled.status == ScheduleStatus.RUNNING

        actual_start = reconciled.start_timestamp
        expected_start = get_timestamp_from_utc_datetime(get_current_datetime_in_utc())
        assert actual_start == expected_start
Exemple #14
0
def test_template_task_dag():
    """Run a three-task Airflow DAG with a Jinja-templated command and check each step's stdout.

    The DAG is converted to a Dagster pipeline and executed against a temporary
    local instance. For every started step, the captured stdout log is read back
    and checked for the expected command echo, output lines and exit message. The
    templated task is expected to render the execution date and that date plus
    seven days, five times each.
    """
    dag = DAG(
        dag_id="dag",
        default_args=default_args,
        schedule_interval=None,
    )

    t1 = BashOperator(
        task_id="print_hello",
        bash_command="echo hello dagsir",
        dag=dag,
    )

    t2 = BashOperator(
        task_id="sleep",
        bash_command="sleep 2",
        dag=dag,
    )

    templated_command = """
    {% for i in range(5) %}
        echo '{{ ds }}'
        echo '{{ macros.ds_add(ds, 7)}}'
        echo '{{ params.my_param }}'
    {% endfor %}
    """

    t3 = BashOperator(
        task_id="templated",
        depends_on_past=False,
        bash_command=templated_command,
        params={"my_param": "Parameter I passed in"},
        dag=dag,
    )

    # pylint: disable=pointless-statement
    t1 >> [t2, t3]

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    execution_date = get_current_datetime_in_utc()
    execution_date_add_one_week = execution_date + datetime.timedelta(days=7)
    execution_date_iso = execution_date.strftime("%Y-%m-%d")
    execution_date_add_one_week_iso = execution_date_add_one_week.strftime("%Y-%m-%d")

    result = execute_pipeline(
        make_dagster_pipeline_from_airflow_dag(
            dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: execution_date_iso}
        ),
        instance=instance,
    )

    compute_steps = [
        event.step_key
        for event in result.step_event_list
        if event.event_type == DagsterEventType.STEP_START
    ]

    assert compute_steps == [
        "airflow_print_hello.compute",
        "airflow_sleep.compute",
        "airflow_templated.compute",
    ]

    for step_key in compute_steps:
        compute_io_path = manager.get_local_path(result.run_id, step_key, ComputeIOType.STDOUT)
        assert os.path.exists(compute_io_path)
        # The context manager closes the log file even if reading or normalizing raises.
        with open(compute_io_path, "r") as stdout_file:
            file_contents = normalize_file_content(stdout_file.read())

        if step_key == "airflow_print_hello.compute":
            assert file_contents.count("INFO - Running command: echo hello dagsir\n") == 1
            assert file_contents.count("INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_sleep.compute":
            assert file_contents.count("INFO - Running command: sleep 2\n") == 1
            assert file_contents.count("INFO - Output:\n") == 1
            assert file_contents.count("INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_templated.compute":
            # The rendered command as logged: the loop body repeated five times,
            # with the template's original indentation and blank lines.
            expected_command = (
                "INFO - Running command: \n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n        "
                "echo '{execution_date_iso}'\n        "
                "echo '{execution_date_add_one_week_iso}'\n        "
                "echo 'Parameter I passed in'\n    \n    \n"
            ).format(
                execution_date_iso=execution_date_iso,
                execution_date_add_one_week_iso=execution_date_add_one_week_iso,
            )
            assert file_contents.count(expected_command) == 1
            assert (
                file_contents.count(
                    "INFO - {execution_date_iso}\n".format(execution_date_iso=execution_date_iso)
                )
                == 5
            )
            assert (
                file_contents.count(
                    "INFO - {execution_date_add_one_week_iso}\n".format(
                        execution_date_add_one_week_iso=execution_date_add_one_week_iso
                    )
                )
                == 5
            )
            assert file_contents.count("INFO - Parameter I passed in\n") == 5
            assert file_contents.count("INFO - Command exited with return code 0") == 1
Exemple #15
0
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal,
                                             capfd):
    """Crash the scheduler before a run exists, relaunch it, and expect exactly one run.

    A first scheduler subprocess is launched with debug crash flags built from
    `crash_location` and `crash_signal`, keyed by the schedule name; it must exit
    non-zero, leaving one STARTED tick and no runs. After advancing frozen time by
    five minutes, a second subprocess without crash flags must exit cleanly,
    producing one run and turning the same tick into SUCCESS. Captured stdout is
    compared against exact expected log text after each launch.
    """
    with central_timezone():
        # Verify that if the scheduler crashes or is interrupted before a run is created,
        # it will create exactly one tick/run when it is re-launched
        with instance_with_schedules(external_repo_context) as (instance,
                                                                external_repo):
            initial_datetime = datetime(
                year=2019,
                month=2,
                day=27,
                hour=0,
                minute=0,
                second=0,
                tzinfo=get_utc_timezone(),
            )
            external_schedule = external_repo.get_external_schedule(
                "simple_schedule")
            with freeze_time(initial_datetime) as frozen_datetime:
                instance.start_schedule_and_update_storage_state(
                    external_schedule)

                # Crash instructions: schedule name -> {crash location: signal}.
                debug_crash_flags = {
                    external_schedule.name: {
                        crash_location: crash_signal
                    }
                }

                # First launch: expected to die at the configured crash point.
                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[
                        instance.get_ref(),
                        get_current_datetime_in_utc(), debug_crash_flags
                    ],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)

                assert scheduler_process.exitcode != 0

                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-02-26 18:00:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
""")

                # The interrupted launch leaves a STARTED tick behind but no run.
                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                assert ticks[0].status == ScheduleTickStatus.STARTED

                assert instance.get_runs_count() == 0

                frozen_datetime.tick(delta=timedelta(minutes=5))

                # Second launch: no crash flags, so it should complete normally.
                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[
                        instance.get_ref(),
                        get_current_datetime_in_utc(), None
                    ],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)
                assert scheduler_process.exitcode == 0

                assert instance.get_runs_count() == 1
                wait_for_all_runs_to_start(instance)
                validate_run_started(instance.get_runs()[0], initial_datetime,
                                     "2019-02-26")

                # Still a single tick: the interrupted one is resumed, not duplicated.
                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )
                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-02-26 18:05:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - dagster-scheduler - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - dagster-scheduler - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))
Exemple #16
0
    def _construct_run_with_snapshots(
        self,
        pipeline_name=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solid_subset=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=None,
        execution_plan_snapshot=None,
    ):
        """Build a PipelineRun and attach the ids of any supplied snapshots.

        Snapshots that run storage does not already hold are added to it, and the
        id returned by storage is checked against the locally computed id.

        Args:
            pipeline_name, run_id, environment_dict, mode, solid_subset,
            step_keys_to_execute, status, root_run_id, parent_run_id:
                Passed through unchanged to ``PipelineRun``.
            tags: Run tags. If the dict contains ``IS_AIRFLOW_INGEST_PIPELINE_STR``
                but not ``AIRFLOW_EXECUTION_DATE_STR``, the latter is set to the
                current UTC time in ISO format. NOTE: this writes into the dict
                object passed by the caller.
            pipeline_snapshot: Optional pipeline snapshot to persist and reference.
            execution_plan_snapshot: Optional execution plan snapshot to persist and
                reference. Requires ``pipeline_snapshot``, because its
                ``pipeline_snapshot_id`` is validated against that snapshot's id.

        Returns:
            The constructed ``PipelineRun``, with snapshot ids attached when
            snapshots were supplied.

        Raises:
            A ``check.invariant`` failure if ``execution_plan_snapshot`` is given
            without ``pipeline_snapshot``, if the execution plan refers to a
            different pipeline snapshot, or if run storage returns an id that
            differs from the computed one.
        """
        if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:
            if AIRFLOW_EXECUTION_DATE_STR not in tags:
                tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc().isoformat()

        pipeline_run = PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode,
            solid_subset=solid_subset,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
        )

        # Bound up front so the execution-plan branch never reads an unassigned name.
        pipeline_snapshot_id = None

        if pipeline_snapshot is not None:
            from dagster.core.snap import create_pipeline_snapshot_id

            pipeline_snapshot_id = create_pipeline_snapshot_id(pipeline_snapshot)

            if not self._run_storage.has_pipeline_snapshot(pipeline_snapshot_id):
                returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                    pipeline_snapshot
                )

                check.invariant(pipeline_snapshot_id == returned_pipeline_snapshot_id)

            pipeline_run = pipeline_run.with_pipeline_snapshot_id(pipeline_snapshot_id)

        if execution_plan_snapshot is not None:
            from dagster.core.snap import create_execution_plan_snapshot_id

            # Previously this case surfaced as an UnboundLocalError on pipeline_snapshot_id.
            check.invariant(
                pipeline_snapshot is not None,
                "execution_plan_snapshot requires pipeline_snapshot to also be provided",
            )
            check.invariant(execution_plan_snapshot.pipeline_snapshot_id == pipeline_snapshot_id)

            execution_plan_snapshot_id = create_execution_plan_snapshot_id(
                execution_plan_snapshot
            )

            if not self._run_storage.has_execution_plan_snapshot(execution_plan_snapshot_id):
                returned_execution_plan_snapshot_id = (
                    self._run_storage.add_execution_plan_snapshot(execution_plan_snapshot)
                )

                check.invariant(
                    execution_plan_snapshot_id == returned_execution_plan_snapshot_id
                )

            pipeline_run = pipeline_run.with_execution_plan_snapshot_id(
                execution_plan_snapshot_id
            )

        return pipeline_run
Exemple #17
0
def test_simple_schedule(external_repo_context):
    """Walk frozen time across ticks of "simple_schedule" and check runs and ticks after each launch.

    Time starts frozen at 2019-02-27 23:59:59 UTC. Launching before the first tick
    must do nothing; stepping two seconds forward must produce one run and one
    SUCCESS tick for 2019-02-28 00:00 UTC. Repeated launches without crossing a tick
    must not add runs, and jumping two days ahead must bring the totals to three.
    """
    initial_datetime = datetime(
        year=2019, month=2, day=27, hour=23, minute=59, second=59, tzinfo=get_utc_timezone(),
    )
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        with freeze_time(initial_datetime) as frozen_datetime:
            external_schedule = external_repo.get_external_schedule("simple_schedule")

            schedule_origin = external_schedule.get_origin()

            instance.start_schedule_and_update_storage_state(external_schedule)

            # Starting the schedule alone creates neither runs nor ticks.
            assert instance.get_runs_count() == 0
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 0

            # launch_scheduled_runs does nothing before the first tick
            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 0
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 0

            # Move forward in time so we're past a tick
            frozen_datetime.tick(delta=timedelta(seconds=2))
            launch_scheduled_runs(instance, get_current_datetime_in_utc())

            assert instance.get_runs_count() == 1
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 1

            # The tick that was crossed is midnight UTC on 2019-02-28.
            expected_datetime = datetime(year=2019, month=2, day=28, tzinfo=get_utc_timezone())

            validate_tick(
                ticks[0],
                external_schedule,
                expected_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], expected_datetime, "2019-02-27")

            # Verify idempotence
            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 1
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.SUCCESS

            # Verify advancing in time but not going past a tick doesn't add any new runs
            frozen_datetime.tick(delta=timedelta(seconds=2))
            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 1
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.SUCCESS

            # Traveling two more days in the future before running results in two new ticks
            frozen_datetime.tick(delta=timedelta(days=2))
            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 3
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
            assert len([tick for tick in ticks if tick.status == ScheduleTickStatus.SUCCESS]) == 3

            # Index runs by their partition tag to check which partitions were launched.
            runs_by_partition = {run.tags[PARTITION_NAME_TAG]: run for run in instance.get_runs()}

            assert "2019-02-28" in runs_by_partition
            assert "2019-03-01" in runs_by_partition

            # Check idempotence again
            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 3
            ticks = instance.get_schedule_ticks(schedule_origin.get_id())
            assert len(ticks) == 3
Exemple #18
0
from airflow.models.dag import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago
from dagster import DagsterEventType, execute_pipeline
from dagster.core.instance import AIRFLOW_EXECUTION_DATE_STR
from dagster.core.storage.compute_log_manager import ComputeIOType
from dagster.core.test_utils import instance_for_test
from dagster.seven import get_current_datetime_in_utc
from dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag

# Default arguments for Airflow DAGs; the start date is one day ago.
# NOTE(review): where these are consumed is not visible in this excerpt.
default_args = {
    "owner": "dagster",
    "start_date": days_ago(1),
}

# Current UTC time captured once, when the module is imported, and the time one week earlier.
# NOTE(review): `datetime` is used here but its import is not visible in this excerpt — confirm.
EXECUTION_DATE = get_current_datetime_in_utc()
EXECUTION_DATE_MINUS_WEEK = EXECUTION_DATE - datetime.timedelta(days=7)

# The same two moments rendered as YYYY-MM-DD strings.
EXECUTION_DATE_FMT = EXECUTION_DATE.strftime("%Y-%m-%d")
EXECUTION_DATE_MINUS_WEEK_FMT = EXECUTION_DATE_MINUS_WEEK.strftime("%Y-%m-%d")


def normalize_file_content(s):
    """Return `s` with platform line separators turned into "\\n" and empty lines removed."""
    unified = s.replace(os.linesep, "\n")
    non_empty_lines = filter(None, unified.split("\n"))
    return "\n".join(non_empty_lines)


def check_compute_logs(manager, result, execution_date_fmt):
    assert result.success

    compute_steps = [
Exemple #19
0
    def _construct_run_with_snapshots(
        self,
        pipeline_name,
        run_id,
        environment_dict,
        mode,
        solid_subset,
        step_keys_to_execute,
        status,
        tags,
        root_run_id,
        parent_run_id,
        pipeline_snapshot,
        execution_plan_snapshot,
        parent_pipeline_snapshot,
    ):
        """Build a PipelineRun and link it to its pipeline / execution plan snapshots.

        Snapshots that run storage does not already hold are added to it. The
        run itself is only constructed and returned here; this method never
        adds the run to run storage.

        Args:
            pipeline_name, run_id, environment_dict, mode, solid_subset,
            step_keys_to_execute, status, root_run_id, parent_run_id:
                Passed straight through to the ``PipelineRun`` constructor.
            tags: Run tags. NOTE: when this dict contains
                ``IS_AIRFLOW_INGEST_PIPELINE_STR`` but not
                ``AIRFLOW_EXECUTION_DATE_STR``, it is mutated in place to add
                ``get_current_datetime_in_utc().isoformat()`` under that key.
            pipeline_snapshot: Optional pipeline snapshot to persist and
                reference from the run.
            execution_plan_snapshot: Optional execution plan snapshot to
                persist and reference from the run. Requires
                ``pipeline_snapshot`` to be provided as well.
            parent_pipeline_snapshot: Only consulted when ``pipeline_snapshot``
                has a ``lineage_snapshot`` whose parent snapshot id is not yet
                in run storage.

        Returns:
            The ``PipelineRun``, rebound through ``with_pipeline_snapshot_id`` /
            ``with_execution_plan_snapshot_id`` for each snapshot supplied.

        Raises:
            Whatever ``check.invariant`` raises when a snapshot id is out of
            sync, when the step keys are inconsistent, or when an execution
            plan snapshot is supplied without a pipeline snapshot.
        """

        # https://github.com/dagster-io/dagster/issues/2403
        if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:
            if AIRFLOW_EXECUTION_DATE_STR not in tags:
                tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc(
                ).isoformat()

        pipeline_run = PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode,
            solid_subset=solid_subset,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
        )

        if pipeline_snapshot is not None:
            from dagster.core.snap import create_pipeline_snapshot_id

            if pipeline_snapshot.lineage_snapshot:
                # The snapshot derives from a parent snapshot: make sure the
                # parent is stored before storing the child.
                if not self._run_storage.has_pipeline_snapshot(
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id):
                    check.invariant(
                        create_pipeline_snapshot_id(
                            parent_pipeline_snapshot) ==
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id,
                        'Parent pipeline snapshot id out of sync with passed parent pipeline snapshot',
                    )

                    returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                        parent_pipeline_snapshot)
                    check.invariant(
                        pipeline_snapshot.lineage_snapshot.parent_snapshot_id
                        == returned_pipeline_snapshot_id)

            pipeline_snapshot_id = create_pipeline_snapshot_id(
                pipeline_snapshot)
            if not self._run_storage.has_pipeline_snapshot(
                    pipeline_snapshot_id):
                returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                    pipeline_snapshot)
                # Storage must report the same id that was computed locally.
                check.invariant(
                    pipeline_snapshot_id == returned_pipeline_snapshot_id)

            pipeline_run = pipeline_run.with_pipeline_snapshot_id(
                pipeline_snapshot_id)

        if execution_plan_snapshot is not None:
            from dagster.core.snap import create_execution_plan_snapshot_id

            # pipeline_snapshot_id is only bound when a pipeline snapshot was
            # supplied. Fail with an explicit invariant instead of letting the
            # comparison below raise UnboundLocalError.
            check.invariant(
                pipeline_snapshot is not None,
                'Cannot attach an execution plan snapshot without a pipeline snapshot',
            )

            check.invariant(execution_plan_snapshot.pipeline_snapshot_id ==
                            pipeline_snapshot_id)

            plan_step_keys = set(execution_plan_snapshot.step_keys_to_execute)
            if step_keys_to_execute:
                step_keys_consistent = set(step_keys_to_execute) == plan_step_keys
            else:
                # No explicit subset means a full plan: every step must be listed.
                step_keys_consistent = plan_step_keys == {
                    step.key for step in execution_plan_snapshot.steps
                }

            check.invariant(
                step_keys_consistent,
                'We encode step_keys_to_execute twice in our stack, unfortunately. This check '
                'ensures that they are consistent. We check that step_keys_to_execute in the plan '
                'matches the step_keys_to_execute params if it is set. If it is not, this indicates '
                'a full execution plan, and so we verify that.',
            )

            execution_plan_snapshot_id = create_execution_plan_snapshot_id(
                execution_plan_snapshot)

            if not self._run_storage.has_execution_plan_snapshot(
                    execution_plan_snapshot_id):
                returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(
                    execution_plan_snapshot)

                check.invariant(execution_plan_snapshot_id ==
                                returned_execution_plan_snapshot_id)

            pipeline_run = pipeline_run.with_execution_plan_snapshot_id(
                execution_plan_snapshot_id)

        return pipeline_run
def test_failure_recovery_after_run_created(external_repo_context,
                                            crash_location, crash_signal):
    """Crash the scheduler after a run exists and verify no duplicate is made.

    The scheduler runs in a subprocess with a debug crash flag built from
    ``crash_location`` / ``crash_signal`` for ``simple_schedule``. After the
    crash exactly one run and one STARTED tick must exist; a second, flag-free
    scheduler pass five minutes later must finish the tick as SUCCESS without
    creating another run.
    """
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
            tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    get_current_datetime_in_utc(), debug_crash_flags
                ],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # exitcode stays None when join() timed out; without this check a
            # hung scheduler would be mistaken for a crashed one below.
            assert scheduler_process.exitcode is not None
            assert scheduler_process.exitcode != 0

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 1

            if crash_location == "RUN_CREATED":
                run = instance.get_runs()[0]
                # Run was created, but hasn't launched yet
                assert run.tags[
                    SCHEDULED_EXECUTION_TIME_TAG] == initial_datetime.isoformat(
                    )
                assert run.tags[PARTITION_NAME_TAG] == "2019-02-26"
                assert run.status == PipelineRunStatus.NOT_STARTED
            else:
                # The run was created and launched - running again should do nothing other than
                # moving the tick to success state.

                # The fact that we need to add this line indicates that there is still a theoretical
                # possible race condition - if the scheduler fails after launching a run
                # and then runs again between when the run was launched and when its status is changed to STARTED by the executor, we could
                # end up launching the same run twice. Run queueing or some other way to immediately
                # identify that a run was launched would help eliminate this race condition. For now,
                # eliminate the possibility by waiting for the run to start before running the
                # scheduler again.
                wait_for_all_runs_to_start(instance)

                # Fetch once and reuse the same run for both checks.
                run = instance.get_runs()[0]
                validate_run_started(run, initial_datetime, "2019-02-26")

                assert run.status in [
                    PipelineRunStatus.STARTED, PipelineRunStatus.SUCCESS
                ]

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Running again just launches the existing run and marks the tick as success
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(),
                      get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 "2019-02-26")

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def test_max_catchup_runs(capfd):
    """Check that ``max_catchup_runs`` caps how many missed ticks are launched.

    The schedule is started at 2019-02-27 23:59:59 UTC, the frozen clock is
    moved five days ahead, and the scheduler is run with ``max_catchup_runs=2``.
    Exactly two runs and two ticks are expected (for midnight UTC on March 3
    and March 4), together with a "fallen behind" warning in the captured log.
    """
    initial_datetime = datetime(
        year=2019,
        month=2,
        day=27,
        hour=23,
        minute=59,
        second=59,
        tzinfo=get_utc_timezone(),
    )
    with central_timezone():
        with instance_with_schedules(grpc_repo) as (instance, external_repo):
            with freeze_time(initial_datetime) as frozen_datetime:
                external_schedule = external_repo.get_external_schedule(
                    "simple_schedule")
                schedule_origin = external_schedule.get_origin()
                instance.start_schedule_and_update_storage_state(
                    external_schedule)

                # Day is now March 4 at 11:59PM (UTC)
                frozen_datetime.tick(delta=timedelta(days=5))

                launch_scheduled_runs(
                    instance,
                    get_default_scheduler_logger(),
                    get_current_datetime_in_utc(),
                    max_catchup_runs=2,
                )

                # Only the two allowed catch-up runs exist, one tick each.
                assert instance.get_runs_count() == 2
                ticks = instance.get_schedule_ticks(schedule_origin.get_id())
                assert len(ticks) == 2

                # Index 0 of both ticks and get_runs() is checked against the
                # later (March 4) tick, index 1 against the earlier (March 3) one.
                first_datetime = datetime(year=2019,
                                          month=3,
                                          day=4,
                                          tzinfo=get_utc_timezone())

                wait_for_all_runs_to_start(instance)

                validate_tick(
                    ticks[0],
                    external_schedule,
                    first_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )
                validate_run_started(instance.get_runs()[0], first_datetime,
                                     "2019-03-03")

                second_datetime = datetime(year=2019,
                                           month=3,
                                           day=3,
                                           tzinfo=get_utc_timezone())

                validate_tick(
                    ticks[1],
                    external_schedule,
                    second_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[1].run_id,
                )

                validate_run_started(instance.get_runs()[1], second_datetime,
                                     "2019-03-02")

                # Expected log timestamps are six hours behind the frozen UTC
                # time (presumably because of central_timezone() - confirm).
                # The log lists launches oldest first, so the first logged run
                # id is get_runs()[1] and the second is get_runs()[0].
                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-03-04 17:59:59 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-03-04 17:59:59 - dagster-scheduler - WARNING - simple_schedule has fallen behind, only launching 2 runs
2019-03-04 17:59:59 - dagster-scheduler - INFO - Launching 2 runs for simple_schedule at the following times: 2019-03-03 00:00:00+0000, 2019-03-04 00:00:00+0000
2019-03-04 17:59:59 - dagster-scheduler - INFO - Completed scheduled launch of run {first_run_id} for simple_schedule
2019-03-04 17:59:59 - dagster-scheduler - INFO - Completed scheduled launch of run {second_run_id} for simple_schedule
""".format(
                        first_run_id=instance.get_runs()[1].run_id,
                        second_run_id=instance.get_runs()[0].run_id,
                    ))
def test_failure_recovery_after_tick_success(external_repo_context,
                                             crash_location, crash_signal):
    """Crash the scheduler late in a tick and verify the re-run is a no-op.

    The scheduler runs in a subprocess with a debug crash flag built from
    ``crash_location`` / ``crash_signal`` for ``simple_schedule``. After the
    crash one started run must exist; the tick is expected to be STARTED when
    the signal was SIGKILL and SUCCESS otherwise. A second, flag-free scheduler
    pass five minutes later must leave a single run and a SUCCESS tick.
    """
    # Verify that if the scheduler crashes or is interrupted after the run has
    # been launched, running it again only settles the tick as success and
    # does not create or launch another run
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
            tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    get_current_datetime_in_utc(), debug_crash_flags
                ],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # exitcode stays None when join() timed out; without this check a
            # hung scheduler would be mistaken for a crashed one below.
            assert scheduler_process.exitcode is not None
            assert scheduler_process.exitcode != 0

            # As above there's a possible race condition here if the scheduler crashes
            # and launches the same run twice if we crash right after the launch and re-run
            # before the run actually starts
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 "2019-02-26")

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1

            if crash_signal == signal.SIGKILL:
                # With SIGKILL the tick is expected to still be STARTED and to
                # carry no run id.
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.STARTED,
                    None,
                )
            else:
                # With any other signal the tick is expected to already be
                # SUCCESS and to reference the run.
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(),
                      get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 "2019-02-26")

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def test_multiple_schedules_on_different_time_ranges(external_repo_context,
                                                     capfd):
    """Run two schedules together and check each only fires on its own ticks.

    ``simple_schedule`` and ``simple_hourly_schedule`` are both started at
    2019-02-27 23:59:59 UTC. Two seconds later both are expected to launch a
    run for the 2019-02-28 00:00 UTC tick; one hour after that only the hourly
    schedule is expected to launch again. Captured scheduler output is
    compared verbatim in both rounds.
    """
    with central_timezone():
        with instance_with_schedules(external_repo_context) as (instance,
                                                                external_repo):
            external_schedule = external_repo.get_external_schedule(
                "simple_schedule")
            external_hourly_schedule = external_repo.get_external_schedule(
                "simple_hourly_schedule")
            initial_datetime = datetime(
                year=2019,
                month=2,
                day=27,
                hour=23,
                minute=59,
                second=59,
                tzinfo=get_utc_timezone(),
            )
            with freeze_time(initial_datetime) as frozen_datetime:
                instance.start_schedule_and_update_storage_state(
                    external_schedule)
                instance.start_schedule_and_update_storage_state(
                    external_hourly_schedule)
                # Step over midnight UTC so both schedules have a tick due.
                frozen_datetime.tick(delta=timedelta(seconds=2))

                launch_scheduled_runs(instance, get_default_scheduler_logger(),
                                      get_current_datetime_in_utc())

                # One run and one SUCCESS tick per schedule.
                assert instance.get_runs_count() == 2
                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                assert ticks[0].status == ScheduleTickStatus.SUCCESS

                hourly_ticks = instance.get_schedule_ticks(
                    external_hourly_schedule.get_origin_id())
                assert len(hourly_ticks) == 1
                assert hourly_ticks[0].status == ScheduleTickStatus.SUCCESS

                captured = capfd.readouterr()

                # The hourly schedule is logged first, so its run id is
                # get_runs()[1] and the daily schedule's is get_runs()[0].
                assert (
                    captured.out ==
                    """2019-02-27 18:00:01 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_hourly_schedule, simple_schedule
2019-02-27 18:00:01 - dagster-scheduler - INFO - Launching run for simple_hourly_schedule at 2019-02-28 00:00:00+0000
2019-02-27 18:00:01 - dagster-scheduler - INFO - Completed scheduled launch of run {first_run_id} for simple_hourly_schedule
2019-02-27 18:00:01 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-28 00:00:00+0000
2019-02-27 18:00:01 - dagster-scheduler - INFO - Completed scheduled launch of run {second_run_id} for simple_schedule
""".format(
                        first_run_id=instance.get_runs()[1].run_id,
                        second_run_id=instance.get_runs()[0].run_id,
                    ))

                # An hour later only the hourly schedule has a new tick due.
                frozen_datetime.tick(delta=timedelta(hours=1))

                launch_scheduled_runs(instance, get_default_scheduler_logger(),
                                      get_current_datetime_in_utc())

                assert instance.get_runs_count() == 3

                # The daily schedule still has its single tick...
                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                assert ticks[0].status == ScheduleTickStatus.SUCCESS

                # ...while the hourly schedule now has two, both SUCCESS.
                hourly_ticks = instance.get_schedule_ticks(
                    external_hourly_schedule.get_origin_id())
                assert len(hourly_ticks) == 2
                assert (len([
                    tick for tick in hourly_ticks
                    if tick.status == ScheduleTickStatus.SUCCESS
                ]) == 2)

                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-02-27 19:00:01 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_hourly_schedule, simple_schedule
2019-02-27 19:00:01 - dagster-scheduler - INFO - Launching run for simple_hourly_schedule at 2019-02-28 01:00:00+0000
2019-02-27 19:00:01 - dagster-scheduler - INFO - Completed scheduled launch of run {third_run_id} for simple_hourly_schedule
2019-02-27 19:00:01 - dagster-scheduler - INFO - No new runs for simple_schedule
""".format(third_run_id=instance.get_runs()[0].run_id))
def test_bad_load(capfd):
    """Check that a schedule whose repository cannot be loaded yields FAILURE ticks.

    A running ScheduleState is registered for an origin that points at a
    non-existent repository symbol ("doesnt_exist") in this file. Each scheduler
    pass is expected to create no runs, record one FAILURE tick carrying the
    load error, and print the error to stdout.
    """
    with schedule_instance() as instance:
        working_directory = os.path.dirname(__file__)
        # Origin for a repository/schedule pair that does not exist in this file.
        recon_repo = ReconstructableRepository.for_file(
            __file__, "doesnt_exist", working_directory)
        schedule = recon_repo.get_reconstructable_schedule("also_doesnt_exist")
        fake_origin = schedule.get_origin()

        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
            tzinfo=get_utc_timezone(),
        )
        with freeze_time(initial_datetime) as frozen_datetime:
            # Register the schedule as RUNNING directly in storage, with a
            # daily-at-midnight cron string and the current frozen timestamp.
            schedule_state = ScheduleState(
                fake_origin,
                ScheduleStatus.RUNNING,
                "0 0 * * *",
                get_timestamp_from_utc_datetime(get_current_datetime_in_utc()),
            )
            instance.add_schedule_state(schedule_state)

            # Cross midnight UTC so that a tick is due.
            frozen_datetime.tick(delta=timedelta(seconds=1))

            launch_scheduled_runs(instance, get_default_scheduler_logger(),
                                  get_current_datetime_in_utc())

            assert instance.get_runs_count() == 0

            ticks = instance.get_schedule_ticks(fake_origin.get_id())

            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.FAILURE
            # The failure tick is stamped with the current (frozen) time.
            assert ticks[0].timestamp == get_timestamp_from_utc_datetime(
                get_current_datetime_in_utc())
            assert "doesnt_exist not found at module scope in file" in ticks[
                0].error.message

            captured = capfd.readouterr()
            assert "Error launching scheduled run" in captured.out
            assert "doesnt_exist not found at module scope" in captured.out

            # A day later the next pass must fail the same way and add a second tick.
            frozen_datetime.tick(delta=timedelta(days=1))

            launch_scheduled_runs(instance, get_default_scheduler_logger(),
                                  get_current_datetime_in_utc())

            assert instance.get_runs_count() == 0

            ticks = instance.get_schedule_ticks(fake_origin.get_id())

            assert len(ticks) == 2
            # ticks[0] is the tick written by this second pass: its timestamp
            # matches the new frozen time.
            assert ticks[0].status == ScheduleTickStatus.FAILURE
            assert ticks[0].timestamp == get_timestamp_from_utc_datetime(
                get_current_datetime_in_utc())
            assert "doesnt_exist not found at module scope in file" in ticks[
                0].error.message

            captured = capfd.readouterr()
            assert "Error launching scheduled run" in captured.out
            assert "doesnt_exist not found at module scope" in captured.out
Exemple #25
0
    def get_or_create_run(
        self,
        pipeline_name=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        selector=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=None,
        execution_plan_snapshot=None,
    ):
        """Return the stored run with this run id, or store and return a new one.

        A ``PipelineRun`` is built from the arguments and linked to the given
        snapshots; snapshots that run storage does not already hold are added
        to it. If a run with the same id already exists it is compared with the
        newly built one and returned when they match.

        Args:
            pipeline_name, run_id, environment_dict, mode, selector,
            step_keys_to_execute, status, root_run_id, parent_run_id:
                Passed straight through to the ``PipelineRun`` constructor.
            tags: Run tags. NOTE: when this dict contains
                ``IS_AIRFLOW_INGEST_PIPELINE_STR`` but not
                ``AIRFLOW_EXECUTION_DATE_STR``, it is mutated in place to add
                ``get_current_datetime_in_utc().isoformat()`` under that key.
            pipeline_snapshot: Optional pipeline snapshot to persist and
                reference from the run.
            execution_plan_snapshot: Optional execution plan snapshot to
                persist and reference from the run. Requires
                ``pipeline_snapshot`` to be provided as well.

        Returns:
            The already-stored run when one with the same id exists and
            ``_check_run_equality`` reports no differences; otherwise the
            result of ``self._run_storage.add_run``.

        Raises:
            DagsterRunConflict: A run with the same id exists but differs from
                the one described by the arguments.
        """

        if tags and IS_AIRFLOW_INGEST_PIPELINE_STR in tags:
            if AIRFLOW_EXECUTION_DATE_STR not in tags:
                tags[AIRFLOW_EXECUTION_DATE_STR] = get_current_datetime_in_utc(
                ).isoformat()

        pipeline_run = PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode,
            selector=selector,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
        )

        if pipeline_snapshot is not None:
            from dagster.core.snap.pipeline_snapshot import create_pipeline_snapshot_id

            pipeline_snapshot_id = create_pipeline_snapshot_id(
                pipeline_snapshot)

            if not self._run_storage.has_pipeline_snapshot(
                    pipeline_snapshot_id):
                returned_pipeline_snapshot_id = self._run_storage.add_pipeline_snapshot(
                    pipeline_snapshot)

                # Storage must report the same id that was computed locally.
                check.invariant(
                    pipeline_snapshot_id == returned_pipeline_snapshot_id)

            pipeline_run = pipeline_run.with_pipeline_snapshot_id(
                pipeline_snapshot_id)

        if execution_plan_snapshot is not None:
            from dagster.core.snap.execution_plan_snapshot import create_execution_plan_snapshot_id

            # pipeline_snapshot_id is only bound when a pipeline snapshot was
            # supplied. Fail with an explicit invariant instead of letting the
            # comparison below raise UnboundLocalError.
            check.invariant(
                pipeline_snapshot is not None,
                'Cannot attach an execution plan snapshot without a pipeline snapshot',
            )

            check.invariant(execution_plan_snapshot.pipeline_snapshot_id ==
                            pipeline_snapshot_id)

            execution_plan_snapshot_id = create_execution_plan_snapshot_id(
                execution_plan_snapshot)

            if not self._run_storage.has_execution_plan_snapshot(
                    execution_plan_snapshot_id):
                returned_execution_plan_snapshot_id = self._run_storage.add_execution_plan_snapshot(
                    execution_plan_snapshot)

                check.invariant(execution_plan_snapshot_id ==
                                returned_execution_plan_snapshot_id)

            pipeline_run = pipeline_run.with_execution_plan_snapshot_id(
                execution_plan_snapshot_id)

        if self.has_run(pipeline_run.run_id):
            candidate_run = self.get_run_by_id(pipeline_run.run_id)

            # A non-empty diff means the stored run is not the run described
            # by the arguments, so handing it back would be wrong.
            field_diff = _check_run_equality(pipeline_run, candidate_run)

            if field_diff:
                raise DagsterRunConflict(
                    'Found conflicting existing run with same id {run_id}. Runs differ in:'
                    '\n{field_diff}'.format(
                        run_id=pipeline_run.run_id,
                        field_diff=_format_field_diff(field_diff),
                    ), )
            return candidate_run

        return self._run_storage.add_run(pipeline_run)
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal):
    """Crash the scheduler before any run exists and verify a clean recovery.

    The scheduler runs in a subprocess with a debug crash flag built from
    ``crash_location`` / ``crash_signal`` for ``simple_schedule``. After the
    crash one STARTED tick and no runs must exist; a second, flag-free
    scheduler pass five minutes later must create exactly one run and finish
    that same tick as SUCCESS.
    """
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
            tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    get_current_datetime_in_utc(), debug_crash_flags
                ],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            # exitcode stays None when join() timed out; without this check a
            # hung scheduler would be mistaken for a crashed one below.
            assert scheduler_process.exitcode is not None
            assert scheduler_process.exitcode != 0

            # The crash happened after the tick was recorded but before any
            # run was created.
            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 0

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Second pass without crash flags must exit cleanly.
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(),
                      get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 "2019-02-26")

            # Still a single tick: the interrupted one was completed rather
            # than a new one being added.
            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def test_simple_schedule(external_repo_context, capfd):
    """Drive ``simple_schedule`` through several ticks under a frozen clock.

    Steps, in order:

    1. Just before the first tick, invoking the scheduler launches nothing.
    2. Two seconds later (past the tick) exactly one run and one tick exist.
    3. Re-invoking at the same instant, and again two seconds later, adds nothing.
    4. Jumping two days ahead yields two more runs and two more ticks.
    5. Re-invoking once more adds nothing.

    Scheduler output captured via ``capfd`` is compared verbatim in steps 1, 2
    and 4. The expected log timestamps are six hours behind the frozen UTC time
    (the test body runs inside ``central_timezone()``).
    """
    with central_timezone():
        start_time = datetime(
            year=2019, month=2, day=27,
            hour=23, minute=59, second=59,
            tzinfo=get_utc_timezone(),
        )
        with instance_with_schedules(external_repo_context) as (instance, external_repo):
            with freeze_time(start_time) as clock:
                schedule = external_repo.get_external_schedule("simple_schedule")
                origin = schedule.get_origin()

                def run_scheduler():
                    # One scheduler iteration evaluated at the (frozen) current time.
                    launch_scheduled_runs(
                        instance,
                        get_default_scheduler_logger(),
                        get_current_datetime_in_utc(),
                    )

                def stored_ticks():
                    # Ticks recorded so far for the schedule under test.
                    return instance.get_schedule_ticks(origin.get_id())

                def assert_single_successful_tick():
                    # Exactly one run and one SUCCESS tick are stored.
                    assert instance.get_runs_count() == 1
                    found = stored_ticks()
                    assert len(found) == 1
                    assert found[0].status == ScheduleTickStatus.SUCCESS

                instance.start_schedule_and_update_storage_state(schedule)

                assert instance.get_runs_count() == 0
                assert len(stored_ticks()) == 0

                # launch_scheduled_runs does nothing before the first tick
                run_scheduler()
                assert instance.get_runs_count() == 0
                assert len(stored_ticks()) == 0

                output = capfd.readouterr().out
                assert (
                    output ==
                    """2019-02-27 17:59:59 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-27 17:59:59 - dagster-scheduler - INFO - No new runs for simple_schedule
""")

                # Move forward in time so we're past a tick
                clock.tick(delta=timedelta(seconds=2))
                run_scheduler()

                assert instance.get_runs_count() == 1
                found_ticks = stored_ticks()
                assert len(found_ticks) == 1

                tick_time = datetime(year=2019, month=2, day=28, tzinfo=get_utc_timezone())

                validate_tick(
                    found_ticks[0],
                    schedule,
                    tick_time,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

                # The run is re-fetched after waiting so validation sees its latest state.
                wait_for_all_runs_to_start(instance)
                validate_run_started(instance.get_runs()[0], tick_time, "2019-02-27")

                output = capfd.readouterr().out
                assert (
                    output ==
                    """2019-02-27 18:00:01 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-27 18:00:01 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-28 00:00:00+0000
2019-02-27 18:00:01 - dagster-scheduler - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))

                # Verify idempotence
                run_scheduler()
                assert_single_successful_tick()

                # Verify advancing in time but not going past a tick doesn't add any new runs
                clock.tick(delta=timedelta(seconds=2))
                run_scheduler()
                assert_single_successful_tick()

                # Discard whatever the two no-op invocations printed.
                capfd.readouterr()

                # Traveling two more days in the future before running results in two new ticks
                clock.tick(delta=timedelta(days=2))
                run_scheduler()
                assert instance.get_runs_count() == 3
                found_ticks = stored_ticks()
                assert len(found_ticks) == 3
                successful = [
                    tick for tick in found_ticks
                    if tick.status == ScheduleTickStatus.SUCCESS
                ]
                assert len(successful) == 3

                launched_partitions = {
                    run.tags[PARTITION_NAME_TAG] for run in instance.get_runs()
                }
                assert "2019-02-28" in launched_partitions
                assert "2019-03-01" in launched_partitions

                output = capfd.readouterr().out
                stored_runs = instance.get_runs()

                # The expected log lists the run at index 1 before the run at index 0.
                assert (
                    output ==
                    """2019-03-01 18:00:03 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-03-01 18:00:03 - dagster-scheduler - INFO - Launching 2 runs for simple_schedule at the following times: 2019-03-01 00:00:00+0000, 2019-03-02 00:00:00+0000
2019-03-01 18:00:03 - dagster-scheduler - INFO - Completed scheduled launch of run {first_run_id} for simple_schedule
2019-03-01 18:00:03 - dagster-scheduler - INFO - Completed scheduled launch of run {second_run_id} for simple_schedule
""".format(
                        first_run_id=stored_runs[1].run_id,
                        second_run_id=stored_runs[0].run_id,
                    ))

                # Check idempotence again
                run_scheduler()
                assert instance.get_runs_count() == 3
                assert len(stored_ticks()) == 3
Exemple #28
0
    def reconcile_scheduler_state(self, instance, external_repository):
        """Reconcile the ExternalSchedule list from the repository and ScheduleStorage
        on the instance to ensure there is a 1-1 correlation between ExternalSchedule and
        JobStates of type JobType.SCHEDULE, where the ExternalSchedule list is the source of truth.

        If a new ExternalSchedule is introduced, a new JobState is created for it via
        ``_create_new_schedule_state``.

        For every previously existing ExternalSchedule (where the external origin id is the
        key), the cron schedule and origin are re-persisted in the corresponding JobState and
        the status is left unchanged. If the schedule is running but has no start timestamp,
        the current UTC time is recorded as its start timestamp. Afterwards running schedules
        are refreshed and stopped schedules are stopped again, so that any external artifacts
        (such as a cron job) are kept up to date.

        For every stored schedule JobState of this repository that no longer has a matching
        ExternalSchedule, the schedule is stopped and deleted from storage.

        Args:
            instance: The instance whose schedule storage is reconciled.
            external_repository: The repository providing the source-of-truth schedules.

        Raises:
            DagsterScheduleReconciliationError: If one or more DagsterSchedulerErrors were
                raised while refreshing, stopping or deleting schedules. All schedules are
                still attempted; the collected errors are attached to the exception.
        """

        # Fetch once and materialize: the same collection drives both the update pass and
        # the computation of which stored schedules have to be deleted.
        external_schedules = list(external_repository.get_external_schedules())

        schedules_to_restart = []
        for external_schedule in external_schedules:
            existing_schedule_state = instance.get_job_state(
                external_schedule.get_external_origin_id())
            if not existing_schedule_state:
                self._create_new_schedule_state(instance, external_schedule)
                continue

            new_timestamp = existing_schedule_state.job_specific_data.start_timestamp
            if not new_timestamp and existing_schedule_state.status == JobStatus.RUNNING:
                # A running schedule without a start timestamp gets "now" as its start.
                new_timestamp = get_timestamp_from_utc_datetime(
                    get_current_datetime_in_utc())

            # Keep the status, update target and cron schedule
            schedule_state = JobState(
                external_schedule.get_external_origin(),
                JobType.SCHEDULE,
                existing_schedule_state.status,
                ScheduleJobData(
                    external_schedule.cron_schedule,
                    new_timestamp,
                    scheduler=self.__class__.__name__,
                ),
            )

            instance.update_job_state(schedule_state)
            schedules_to_restart.append(
                (existing_schedule_state, external_schedule))

        # Delete all existing schedules that are not in external schedules
        external_schedule_origin_ids = {
            s.get_external_origin_id() for s in external_schedules
        }
        existing_schedule_origin_ids = {
            job.job_origin_id for job in instance.all_stored_job_state(
                external_repository.get_external_origin_id())
            if job.job_type == JobType.SCHEDULE
        }
        schedule_origin_ids_to_delete = existing_schedule_origin_ids - external_schedule_origin_ids

        # Errors are collected rather than raised immediately so that one failing schedule
        # does not prevent the remaining ones from being reconciled.
        schedule_reconciliation_errors = []
        for schedule_state, external_schedule in schedules_to_restart:
            try:
                if schedule_state.status == JobStatus.RUNNING:
                    # Restart is only needed if the schedule was previously running
                    self.refresh_schedule(instance, external_schedule)
                elif schedule_state.status == JobStatus.STOPPED:
                    self.stop_schedule(
                        instance, external_schedule.get_external_origin_id())
            except DagsterSchedulerError as e:
                schedule_reconciliation_errors.append(e)

        for schedule_origin_id in schedule_origin_ids_to_delete:
            try:
                instance.stop_schedule_and_delete_from_storage(
                    schedule_origin_id)
            except DagsterSchedulerError as e:
                schedule_reconciliation_errors.append(e)

        if schedule_reconciliation_errors:
            raise DagsterScheduleReconciliationError(
                "One or more errors were encountered by the Scheduler while starting or stopping schedules. "
                "Individual error messages follow:",
                errors=schedule_reconciliation_errors,
            )
def test_bad_schedule_mixed_with_good_schedule(external_repo_context):
    """Check that a failing schedule does not interfere with a healthy one.

    ``simple_schedule`` and ``bad_should_execute_schedule_on_odd_days`` are both
    started with the clock frozen at 2019-02-27 00:00 UTC.

    * First scheduler pass: only the good schedule launches a run; the bad
      schedule records a FAILURE tick carrying a ``should_execute`` error.
    * After advancing one day: both schedules launch a run (three runs in
      total), and the tick at index 0 for each schedule is a SUCCESS tick for
      the new time.
    """
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        good_schedule = external_repo.get_external_schedule("simple_schedule")
        bad_schedule = external_repo.get_external_schedule(
            "bad_should_execute_schedule_on_odd_days")

        good_origin = good_schedule.get_origin()
        bad_origin = bad_schedule.get_origin()
        initial_datetime = datetime(
            year=2019, month=2, day=27,
            hour=0, minute=0, second=0,
            tzinfo=get_utc_timezone(),
        )

        def run_scheduler():
            # One scheduler iteration evaluated at the (frozen) current time.
            launch_scheduled_runs(
                instance,
                get_default_scheduler_logger(),
                get_current_datetime_in_utc(),
            )

        with freeze_time(initial_datetime) as frozen_datetime:
            for schedule in (good_schedule, bad_schedule):
                instance.start_schedule_and_update_storage_state(schedule)

            run_scheduler()

            # Day one: only the good schedule produced a run.
            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0], initial_datetime, "2019-02-26")

            good_ticks = instance.get_schedule_ticks(good_origin.get_id())
            assert len(good_ticks) == 1
            validate_tick(
                good_ticks[0],
                good_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            # The bad schedule still records a tick, marked as failed.
            bad_ticks = instance.get_schedule_ticks(bad_origin.get_id())
            assert len(bad_ticks) == 1
            assert bad_ticks[0].status == ScheduleTickStatus.FAILURE

            expected_error = ("Error occurred during the execution of should_execute "
                              "for schedule bad_should_execute_schedule")
            assert expected_error in bad_ticks[0].error.message

            # Day two: move the clock forward by one day and run the scheduler again.
            frozen_datetime.tick(delta=timedelta(days=1))

            new_now = get_current_datetime_in_utc()

            run_scheduler()

            assert instance.get_runs_count() == 3
            wait_for_all_runs_to_start(instance)

            good_schedule_runs = instance.get_runs(
                filters=PipelineRunsFilter.for_schedule(good_schedule))
            assert len(good_schedule_runs) == 2
            validate_run_started(good_schedule_runs[0], new_now, "2019-02-27")

            good_ticks = instance.get_schedule_ticks(good_origin.get_id())
            assert len(good_ticks) == 2
            validate_tick(
                good_ticks[0],
                good_schedule,
                new_now,
                ScheduleTickStatus.SUCCESS,
                good_schedule_runs[0].run_id,
            )

            # This time the previously failing schedule launches a run as well.
            bad_schedule_runs = instance.get_runs(
                filters=PipelineRunsFilter.for_schedule(bad_schedule))
            assert len(bad_schedule_runs) == 1
            validate_run_started(bad_schedule_runs[0], new_now, "2019-02-27")

            bad_ticks = instance.get_schedule_ticks(bad_origin.get_id())
            assert len(bad_ticks) == 2
            validate_tick(
                bad_ticks[0],
                bad_schedule,
                new_now,
                ScheduleTickStatus.SUCCESS,
                bad_schedule_runs[0].run_id,
            )