def test_failure_recovery_between_multi_runs(external_repo_context,
                                             crash_location, crash_signal):
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = create_pendulum_time(year=2019,
                                                month=2,
                                                day=28,
                                                hour=0,
                                                minute=0,
                                                second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule(
            "multi_run_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            wait_for_all_runs_to_start(instance)
            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime)

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0
            assert instance.get_runs_count() == 2
            validate_run_started(instance.get_runs()[0], initial_datetime)
            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                [run.run_id for run in instance.get_runs()],
            )
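
The _test_launch_scheduled_runs_in_subprocess target used throughout these scheduler tests is defined elsewhere in the test module. A minimal sketch of what it presumably does, assuming Dagster's launch_scheduled_runs iteration function and get_default_daemon_logger helper (the import paths and signature are assumptions inferred from how the tests call it):

def _test_launch_scheduled_runs_in_subprocess(instance_ref, execution_datetime, debug_crash_flags):
    # Sketch only: re-hydrate the instance inside the child process, freeze time to
    # match the parent test, and run a single scheduler iteration. debug_crash_flags
    # lets that iteration kill this process at a named location, which the parent
    # test observes as a nonzero exitcode.
    import pendulum
    from dagster.core.instance import DagsterInstance
    from dagster.daemon import get_default_daemon_logger  # assumed import path
    from dagster.scheduler.scheduler import launch_scheduled_runs  # assumed import path

    with DagsterInstance.from_ref(instance_ref) as instance:
        with pendulum.test(execution_datetime):
            launch_scheduled_runs(
                instance,
                get_default_daemon_logger("SchedulerDaemon"),
                pendulum.now("UTC"),
                debug_crash_flags=debug_crash_flags,  # assumed keyword argument
            )
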
Example #2
def test_crash_after_submit(external_repo_context, crash_signal, capfd):
    with instance_for_context(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        external_partition_set = external_repo.get_external_partition_set(
            "simple_partition_set")
        instance.add_backfill(
            PartitionBackfill(
                backfill_id="simple",
                partition_set_origin=external_partition_set.get_external_origin(),
                status=BulkActionStatus.REQUESTED,
                partition_names=["one", "two", "three"],
                from_failure=False,
                reexecution_steps=None,
                tags=None,
                backfill_timestamp=pendulum.now().timestamp(),
            ))
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), {
                "AFTER_SUBMIT": crash_signal
            }],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        assert launch_process.exitcode != 0
        captured = capfd.readouterr()
        assert (
            captured.out.replace("\r\n", "\n") ==
            """2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple
""")

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.REQUESTED
        assert instance.get_runs_count() == 3

        # resume backfill
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        captured = capfd.readouterr()
        assert (
            captured.out.replace("\r\n", "\n") ==
            """2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple
2021-02-16 18:00:00 - BackfillDaemon - INFO - Found 3 existing runs for backfill simple, skipping
2021-02-16 18:00:00 - BackfillDaemon - INFO - Backfill completed for simple for 3 partitions
""")

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.COMPLETED
        assert instance.get_runs_count() == 3
Example #3
def test_before_submit(external_repo_context, crash_signal, capfd):
    with instance_for_context(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        external_partition_set = external_repo.get_external_partition_set("simple_partition_set")
        instance.add_backfill(
            PartitionBackfill(
                backfill_id="simple",
                partition_set_origin=external_partition_set.get_external_origin(),
                status=BulkActionStatus.REQUESTED,
                partition_names=["one", "two", "three"],
                from_failure=False,
                reexecution_steps=None,
                tags=None,
                backfill_timestamp=pendulum.now().timestamp(),
            )
        )
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), {"BEFORE_SUBMIT": crash_signal}],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        assert launch_process.exitcode != 0
        assert (
            get_logger_output_from_capfd(capfd, "dagster.daemon.BackfillDaemon")
            == """2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Starting backfill for simple"""
        )

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.REQUESTED
        assert instance.get_runs_count() == 0

        # resume backfill
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        assert (
            get_logger_output_from_capfd(capfd, "dagster.daemon.BackfillDaemon")
            == """2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Starting backfill for simple
2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Backfill completed for simple for 3 partitions"""
        )

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.COMPLETED
        assert instance.get_runs_count() == 3
Example #4
def test_multi():
    with instance_for_test() as instance:
        pipeline_name = "foo_pipeline"
        pipeline_run = create_run_for_test(instance,
                                           pipeline_name=pipeline_name)

        step_keys = ["A", "B", "C"]

        with instance.compute_log_manager.watch(pipeline_run):
            print("outer 1")  # pylint: disable=print-call
            print("outer 2")  # pylint: disable=print-call
            print("outer 3")  # pylint: disable=print-call

            for step_key in step_keys:
                process = multiprocessing.Process(target=execute_inner,
                                                  args=(step_key, pipeline_run,
                                                        instance.get_ref()))
                process.start()
                process.join()

        for step_key in step_keys:
            stdout = instance.compute_log_manager.read_logs_file(
                pipeline_run.run_id, step_key, ComputeIOType.STDOUT)
            assert normalize_file_content(
                stdout.data) == expected_inner_output(step_key)

        full_out = instance.compute_log_manager.read_logs_file(
            pipeline_run.run_id, pipeline_name, ComputeIOType.STDOUT)

        # The way that multiprocess compute-logging interacts with pytest (which stubs out the
        # sys.stdout fileno) makes this difficult to test. The pytest-captured stdout only captures
        # the stdout from the outer process, not from the inner processes.
        assert normalize_file_content(full_out.data).startswith(
            expected_outer_prefix())
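
The execute_inner worker above is what makes the per-step output readable despite the pytest stdout stubbing described in the comment: each child process re-hydrates the instance from its ref and prints under the compute log manager's watch, so the output lands in the per-step log files returned by read_logs_file rather than in pytest's captured stdout. A sketch of what that worker might look like (the body and the step_key keyword are assumptions; only the calls already used in the test are taken as given):

def execute_inner(step_key, pipeline_run, instance_ref):
    # Sketch: runs in the child process spawned by test_multi above.
    from dagster.core.instance import DagsterInstance

    with DagsterInstance.from_ref(instance_ref) as instance:
        # Writing inside watch() routes this process's stdout into the compute log
        # manager's per-step log file, which the parent test reads back with
        # read_logs_file(run_id, step_key, ComputeIOType.STDOUT).
        with instance.compute_log_manager.watch(pipeline_run, step_key=step_key):
            print("inner {}".format(step_key))  # pylint: disable=print-call
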
Example #5
def test_simple(external_repo_context, capfd):
    with instance_for_context(external_repo_context) as (instance,
                                                         external_repo):
        external_partition_set = external_repo.get_external_partition_set(
            "simple_partition_set")
        instance.add_backfill(
            PartitionBackfill(
                backfill_id="simple",
                partition_set_origin=external_partition_set.get_external_origin(),
                status=BulkActionStatus.REQUESTED,
                partition_names=["one", "two", "three"],
                from_failure=False,
                reexecution_steps=None,
                tags=None,
                backfill_timestamp=pendulum.now().timestamp(),
            ))
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.COMPLETED
        captured = capfd.readouterr()
        assert (
            captured.out.replace("\r\n", "\n") ==
            """2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple
2021-02-16 18:00:00 - BackfillDaemon - INFO - Backfill completed for simple for 3 partitions
""")
Example #6
def execute_child_process_command(command):
    """Execute a ChildProcessCommand in a new process.

    This function starts a new process whose execution target is a ChildProcessCommand wrapped by
    _execute_command_in_child_process; polls the queue for events yielded by the child process
    until the process dies and the queue is empty.

    This function yields a complex set of objects to enable having multiple child process
    executions in flight:
        * None - nothing has happened; yielded to enable cooperative multitasking with
            other iterators

        * ChildProcessEvent - Family of objects that communicates state changes in the child process

        * KeyboardInterrupt - Yielded in the case that an interrupt was received while
            polling the child process. Yielded instead of raised to allow forwarding of the
            interrupt to the child and completion of the iterator for this child and
            any others that may be executing

        * The actual values yielded by the child process command

    Args:
        command (ChildProcessCommand): The command to execute in the child process.

    Warning: if the child process is in an infinite loop, this will also loop
    infinitely. (A usage sketch of consuming this generator follows the function.)
    """

    check.inst_param(command, "command", ChildProcessCommand)

    event_queue = multiprocessing.Queue()
    try:
        process = multiprocessing.Process(
            target=_execute_command_in_child_process,
            args=(event_queue, command))
        process.start()

        completed_properly = False

        while not completed_properly:
            event = _poll_for_event(process, event_queue)

            if event == PROCESS_DEAD_AND_QUEUE_EMPTY:
                break

            yield event

            if isinstance(event, (ChildProcessDoneEvent, ChildProcessSystemErrorEvent)):
                completed_properly = True

        if not completed_properly:
            # TODO Figure out what to do about stderr/stdout
            raise ChildProcessCrashException(exit_code=process.exitcode)

        process.join()
    finally:
        event_queue.close()
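
A usage sketch for the generator above, following the contract in its docstring: skip the None heartbeats, surface interrupts, treat ChildProcessEvent subclasses as state changes, and collect the values the child command actually yields. The collection strategy here is illustrative, not Dagster's own calling code.

def collect_child_process_results(command):
    # Illustrative consumer of execute_child_process_command, per its docstring.
    results = []
    for event in execute_child_process_command(command):
        if event is None:
            # Nothing happened yet; a real caller could service other in-flight
            # child processes here before polling again.
            continue
        if isinstance(event, KeyboardInterrupt):
            # Yielded (not raised) by the generator so the caller decides how to
            # handle the interrupt; here we simply re-raise it.
            raise event
        if isinstance(event, ChildProcessEvent):
            # State change in the child process (start / done / system error).
            continue
        # Anything else is an actual value yielded by the child process command.
        results.append(event)
    return results
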
Example #7
    def test_concurrent_sqlite_event_log_connections(self, storage):
        tmpdir_path = storage._base_dir  # pylint: disable=protected-access
        exceptions = multiprocessing.Queue()
        ps = []
        for _ in range(5):
            ps.append(multiprocessing.Process(target=self.cmd, args=(exceptions, tmpdir_path)))
        for p in ps:
            p.start()

        j = 0
        for p in ps:
            p.join()
            j += 1

        assert j == 5

        excs = []
        while not exceptions.empty():
            excs.append(exceptions.get())
        assert not excs, excs
Example #8
def test_concurrent_sqlite_event_log_connections():
    exceptions = multiprocessing.Queue()
    with tempfile.TemporaryDirectory() as tmpdir_path:

        ps = []
        for _ in range(5):
            ps.append(multiprocessing.Process(target=cmd, args=(exceptions, tmpdir_path)))
        for p in ps:
            p.start()

        j = 0
        for p in ps:
            p.join()
            j += 1

        assert j == 5

        excs = []
        while not exceptions.empty():
            excs.append(exceptions.get())
        assert not excs, excs
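
Both concurrency tests above rely on a cmd worker that opens its own SQLite event-log connection in a separate process and reports any exception back through the shared queue. A sketch of that worker, assuming the SqliteEventLogStorage constructor and get_logs_for_run call (the import path and calls are assumptions about the Dagster version these tests target):

def cmd(exceptions, tmpdir_path):
    # Sketch: runs in a child process. Any failure to open or query the
    # SQLite-backed event log storage is pushed onto the shared queue so the
    # parent test can assert that no concurrency-related exceptions occurred.
    try:
        from dagster.core.storage.event_log import SqliteEventLogStorage  # assumed import path

        storage = SqliteEventLogStorage(tmpdir_path)
        storage.get_logs_for_run("dummy_run_id")  # exercise a connection
    except Exception as exc:  # pylint: disable=broad-except
        exceptions.put(exc)
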
def test_failure_recovery_after_run_created(external_repo_context,
                                            crash_location, crash_signal,
                                            capfd):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = pendulum.datetime(year=2019,
                                             month=2,
                                             day=27,
                                             hour=0,
                                             minute=0,
                                             second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            capfd.readouterr()

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 1

            if crash_location == "RUN_CREATED":
                run = instance.get_runs()[0]
                # Run was created, but hasn't launched yet
                assert run.tags[SCHEDULED_EXECUTION_TIME_TAG] == frozen_datetime.isoformat()
                assert run.tags[PARTITION_NAME_TAG] == "2019-02-26"
                assert run.status == PipelineRunStatus.NOT_STARTED
            else:
                # The run was created and launched - running again should do nothing other than
                # move the tick to the success state.

                # The fact that we need the wait below indicates that there is still a
                # theoretically possible race condition: if the scheduler fails after launching a
                # run and then runs again between when the run was launched and when the executor
                # changes its status to STARTED, we could end up launching the same run twice.
                # Run queueing or some other way to immediately identify that a run was launched
                # would help eliminate this race condition. For now, eliminate the possibility by
                # waiting for the run to start before running the scheduler again.
                wait_for_all_runs_to_start(instance)

                run = instance.get_runs()[0]
                validate_run_started(instance.get_runs()[0], frozen_datetime,
                                     pendulum.datetime(2019, 2, 26))

                assert run.status in [
                    PipelineRunStatus.STARTED, PipelineRunStatus.SUCCESS
                ]

        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):

            # Running again just launches the existing run and marks the tick as success
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 pendulum.datetime(2019, 2, 26))

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            captured = capfd.readouterr()
            if crash_location == "RUN_CREATED":
                assert (
                    "Run {run_id} already created for this execution of simple_schedule"
                    .format(run_id=instance.get_runs()[0].run_id)
                    in captured.out)
            else:
                assert (
                    "Run {run_id} already completed for this execution of simple_schedule"
                    .format(run_id=instance.get_runs()[0].run_id)
                    in captured.out)
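
The debug_crash_flags dicts used throughout these failure-recovery tests map a named crash location (for example RUN_CREATED, RUN_LAUNCHED, TICK_SUCCESS) to a signal. Inside the daemon code under test, a check along the following lines presumably runs at each such location, so the subprocess kills itself at exactly the point being exercised (the helper name and the sleep are assumptions):

import os
import time


def _check_for_debug_crash(debug_crash_flags, key):
    # Sketch of how a crash flag is consumed at a named location: send the configured
    # signal to this process so the test subprocess exits with a nonzero code right at
    # that point in the daemon iteration.
    if not debug_crash_flags:
        return

    kill_signal = debug_crash_flags.get(key)
    if not kill_signal:
        return

    os.kill(os.getpid(), kill_signal)
    time.sleep(10)
    raise Exception("Process didn't terminate after sending crash signal")
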
Example #10
    def StartRun(self, request, _context):
        if self._shutdown_once_executions_finish_event.is_set():
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message="Tried to start a run on a server after telling it to shut down",
                        serializable_error_info=None,
                    )
                )
            )

        try:
            execute_run_args = check.inst(
                deserialize_json_to_dagster_namedtuple(request.serialized_execute_run_args),
                ExecuteExternalPipelineArgs,
            )
            run_id = execute_run_args.pipeline_run_id
            recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=None,
                        serializable_error_info=serializable_error_info_from_exc_info(
                            sys.exc_info()
                        ),
                    )
                )
            )

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=start_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )

        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                execute_run_args.instance_ref,
            )
            self._termination_events[run_id] = termination_event

        success = None
        message = None
        serializable_error_info = None

        while success is None:
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            try:
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    success = False
                    message = (
                        "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                        "exit code {exit_code}".format(
                            run_id=run_id, exit_code=execution_process.exitcode,
                        )
                    )
                    serializable_error_info = serializable_error_info_from_exc_info(sys.exc_info())
            else:
                if isinstance(
                    dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
                ):
                    success = True
                elif isinstance(
                    dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete
                ):
                    continue
                if isinstance(dagster_event_or_ipc_error_message_or_done, IPCErrorMessage):
                    success = False
                    message = dagster_event_or_ipc_error_message_or_done.message
                    serializable_error_info = (
                        dagster_event_or_ipc_error_message_or_done.serializable_error_info
                    )

        # Ensure that if the run failed, we remove it from the executions map before
        # returning so that CanCancel will never return True
        if not success:
            with self._execution_lock:
                self._clear_run(run_id)

        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )
            )
        )
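
For context, the child side of the queue protocol that the polling loop above consumes can be outlined from the event types it handles: the subprocess reports either StartRunInSubprocessSuccessful or an IPCErrorMessage, and RunInSubprocessComplete once it has finished. This is an assumption-level outline of start_run_in_subprocess, not its actual code:

def _start_run_in_subprocess_outline(serialized_execute_run_args, recon_pipeline, event_queue,
                                     termination_event):
    # Sketch of the producer side of the event_queue protocol used by StartRun above.
    try:
        # ... deserialize serialized_execute_run_args, build the run from
        # recon_pipeline, and launch it, honoring termination_event ...
        event_queue.put(StartRunInSubprocessSuccessful())
    except Exception:  # pylint: disable=broad-except
        event_queue.put(
            IPCErrorMessage(
                serializable_error_info=serializable_error_info_from_exc_info(sys.exc_info()),
                message="Error launching run in subprocess",
            )
        )
    finally:
        event_queue.put(RunInSubprocessComplete())
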
Example #11
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal,
                                             capfd):
    with central_timezone():
        # Verify that if the scheduler crashes or is interrupted before a run is created,
        # it will create exactly one tick/run when it is re-launched
        with instance_with_schedules(external_repo_context) as (instance,
                                                                external_repo):
            initial_datetime = datetime(
                year=2019,
                month=2,
                day=27,
                hour=0,
                minute=0,
                second=0,
                tzinfo=get_utc_timezone(),
            )
            external_schedule = external_repo.get_external_schedule(
                "simple_schedule")
            with freeze_time(initial_datetime) as frozen_datetime:
                instance.start_schedule_and_update_storage_state(
                    external_schedule)

                debug_crash_flags = {
                    external_schedule.name: {
                        crash_location: crash_signal
                    }
                }

                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[
                        instance.get_ref(),
                        get_current_datetime_in_utc(), debug_crash_flags
                    ],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)

                assert scheduler_process.exitcode != 0

                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-02-26 18:00:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
""")

                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                assert ticks[0].status == ScheduleTickStatus.STARTED

                assert instance.get_runs_count() == 0

                frozen_datetime.tick(delta=timedelta(minutes=5))

                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[
                        instance.get_ref(),
                        get_current_datetime_in_utc(), None
                    ],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)
                assert scheduler_process.exitcode == 0

                assert instance.get_runs_count() == 1
                wait_for_all_runs_to_start(instance)
                validate_run_started(instance.get_runs()[0], initial_datetime,
                                     "2019-02-26")

                ticks = instance.get_schedule_ticks(
                    external_schedule.get_origin_id())
                assert len(ticks) == 1
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )
                captured = capfd.readouterr()
                assert (
                    captured.out ==
                    """2019-02-26 18:05:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - dagster-scheduler - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - dagster-scheduler - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
            tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    get_current_datetime_in_utc(), debug_crash_flags
                ],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 0

            frozen_datetime.tick(delta=timedelta(minutes=5))

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(),
                      get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 "2019-02-26")

            ticks = instance.get_schedule_ticks(
                external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
Example #13
    def ExecuteRun(self, request, _context):
        if self._shutdown_once_executions_finish_event.is_set():
            yield api_pb2.ExecuteRunEvent(
                serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                    IPCErrorMessage(
                        serializable_error_info=None,
                        message="Tried to start a run on a server after telling it to shut down",
                    )
                )
            )

        try:
            execute_run_args = deserialize_json_to_dagster_namedtuple(
                request.serialized_execute_run_args)
            check.inst_param(execute_run_args, "execute_run_args",
                             ExecuteRunArgs)

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            yield api_pb2.ExecuteRunEvent(
                serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                    IPCErrorMessage(
                        serializable_error_info=serializable_error_info_from_exc_info(
                            sys.exc_info()
                        ),
                        message="Error during RPC setup for ExecuteRun",
                    )
                )
            )
            return

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=execute_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = (
                execution_process,
                execute_run_args.instance_ref,
            )
            self._termination_events[run_id] = termination_event

        done = False
        while not done:
            try:
                # We use `get_nowait()` instead of `get()` so that we can handle the case where the
                # execution process has died unexpectedly -- `get()` would hang forever in that case
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                            IPCErrorMessage(
                                serializable_error_info=serializable_error_info_from_exc_info(
                                    sys.exc_info()
                                ),
                                message=(
                                    "GRPC server: Subprocess for {run_id} terminated unexpectedly"
                                ).format(run_id=run_id),
                            )
                        )
                    )
                    done = True
                time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              RunInSubprocessComplete):
                    done = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                StartRunInSubprocessSuccessful):
                    continue
                else:
                    yield api_pb2.ExecuteRunEvent(
                        serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                            dagster_event_or_ipc_error_message_or_done
                        )
                    )

        with self._execution_lock:
            if run_id in self._executions:
                del self._executions[run_id]
            if run_id in self._termination_events:
                del self._termination_events[run_id]
Example #14
def test_failure_before_run_created(external_repo_context, crash_location,
                                    crash_signal, capfd):
    frozen_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=28,
                             hour=0,
                             minute=0,
                             second=1,
                             tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "simple_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create a tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.SKIPPED
            captured = capfd.readouterr()

            # create a starting tick, but crash
            debug_crash_flags = {
                external_sensor.name: {
                    crash_location: crash_signal
                }
            }
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=31), debug_crash_flags
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            captured = capfd.readouterr()

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.STARTED
            assert not int(ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=62), None
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                f"""2019-02-27 18:01:03 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Launching run for simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor
""")

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == JobTickStatus.SUCCESS
Example #15
def test_failure_after_run_launched(external_repo_context, crash_location,
                                    crash_signal, capfd):
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=28,
            hour=0,
            minute=0,
            second=0,
            tz="UTC",
        ),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
            instance,
            _grpc_server_registry,
            external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "run_key_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create a run, launch but crash
            debug_crash_flags = {
                external_sensor.name: {
                    crash_location: crash_signal
                }
            }
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 1

            run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"
            capfd.readouterr()

            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=1), None
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()

            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out)

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SKIPPED
def test_failure_recovery_after_tick_success(external_repo_context,
                                             crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = pendulum.datetime(year=2019,
                                             month=2,
                                             day=27,
                                             hour=0,
                                             minute=0,
                                             second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            # As above, there is a possible race condition here: if the scheduler crashes right
            # after launching a run and is re-run before that run actually starts, it could
            # launch the same run twice.
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 pendulum.datetime(2019, 2, 26))

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1

            if crash_signal == get_terminate_signal():
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    JobTickStatus.STARTED,
                    None,
                )
            else:
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    JobTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime,
                                 pendulum.datetime(2019, 2, 26))

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
Example #17
    def StartRun(self, request, _context):
        try:
            execute_run_args = check.inst(
                deserialize_json_to_dagster_namedtuple(
                    request.serialized_execute_run_args),
                ExecuteRunArgs,
            )

            run_id = execute_run_args.pipeline_run_id

            recon_pipeline = self._recon_pipeline_from_origin(
                execute_run_args.pipeline_origin)

        except:  # pylint: disable=bare-except
            return api_pb2.StartRunReply(
                serialized_start_run_result=serialize_dagster_namedtuple(
                    StartRunResult(
                        success=False,
                        message=None,
                        serializable_error_info=serializable_error_info_from_exc_info(
                            sys.exc_info()
                        ),
                    )))

        event_queue = multiprocessing.Queue()
        termination_event = multiprocessing.Event()
        execution_process = multiprocessing.Process(
            target=start_run_in_subprocess,
            args=[
                request.serialized_execute_run_args,
                recon_pipeline,
                event_queue,
                termination_event,
            ],
        )
        with self._execution_lock:
            execution_process.start()
            self._executions[run_id] = execution_process
            self._termination_events[run_id] = termination_event

        success = None
        message = None
        serializable_error_info = None

        while success is None:
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            try:
                dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
            except queue.Empty:
                if not execution_process.is_alive():
                    # subprocess died unexpectedly
                    success = False
                    message = (
                        "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                        "exit code {exit_code}".format(
                            run_id=run_id,
                            exit_code=execution_process.exitcode,
                        )
                    )
                    serializable_error_info = serializable_error_info_from_exc_info(
                        sys.exc_info()
                    )
            else:
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              StartRunInSubprocessSuccessful):
                    success = True
                elif isinstance(dagster_event_or_ipc_error_message_or_done,
                                RunInSubprocessComplete):
                    continue
                if isinstance(dagster_event_or_ipc_error_message_or_done,
                              IPCErrorMessage):
                    success = False
                    message = dagster_event_or_ipc_error_message_or_done.message
                    serializable_error_info = (
                        dagster_event_or_ipc_error_message_or_done.serializable_error_info
                    )

        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )))
Example #18
def test_failure_recovery_before_run_created(instance, external_repo,
                                             crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    initial_datetime = to_timezone(
        create_pendulum_time(year=2019,
                             month=2,
                             day=27,
                             hour=0,
                             minute=0,
                             second=0,
                             tz="UTC"),
        "US/Central",
    )

    frozen_datetime = initial_datetime.add()

    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule_and_update_storage_state(external_schedule)

        debug_crash_flags = {
            external_schedule.name: {
                crash_location: crash_signal
            }
        }

        scheduler_process = multiprocessing.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        assert scheduler_process.exitcode != 0

        ticks = instance.get_job_ticks(
            external_schedule.get_external_origin_id())
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.STARTED

        assert instance.get_runs_count() == 0

    frozen_datetime = frozen_datetime.add(minutes=5)
    with pendulum.test(frozen_datetime):
        scheduler_process = multiprocessing.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        assert scheduler_process.exitcode == 0

        assert instance.get_runs_count() == 1
        wait_for_all_runs_to_start(instance)
        validate_run_exists(
            instance.get_runs()[0],
            execution_time=initial_datetime,
            partition_time=create_pendulum_time(2019, 2, 26),
        )

        ticks = instance.get_job_ticks(
            external_schedule.get_external_origin_id())
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )
def test_failure_recovery_before_run_created(external_repo_context,
                                             crash_location, crash_signal,
                                             capfd):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        initial_datetime = pendulum.datetime(year=2019,
                                             month=2,
                                             day=27,
                                             hour=0,
                                             minute=0,
                                             second=0).in_tz("US/Central")

        frozen_datetime = initial_datetime.add()

        external_schedule = external_repo.get_external_schedule(
            "simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {
                external_schedule.name: {
                    crash_location: crash_signal
                }
            }

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:00:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - SchedulerDaemon - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
""")

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 0

        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=initial_datetime,
                partition_time=pendulum.datetime(2019, 2, 26),
            )

            ticks = instance.get_job_ticks(
                external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n") ==
                """2019-02-26 18:05:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id))
Example #20
def test_failure_after_run_created_before_run_launched(external_repo_context,
                                                       crash_location,
                                                       crash_signal, capfd):
    frozen_datetime = pendulum.datetime(
        year=2019,
        month=2,
        day=28,
        hour=0,
        minute=0,
        second=0,
    ).in_tz("US/Central")
    with instance_with_sensors(external_repo_context) as (instance,
                                                          external_repo):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor(
                "run_key_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))

            # create a starting tick, but crash
            debug_crash_flags = {
                external_sensor.name: {
                    crash_location: crash_signal
                }
            }
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 1

            run = instance.get_runs()[0]
            # Run was created, but hasn't launched yet
            assert run.status == PipelineRunStatus.NOT_STARTED
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"

            # clear output
            capfd.readouterr()

            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[
                    instance.get_ref(),
                    frozen_datetime.add(seconds=1), None
                ],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()

            assert (
                f"Run {run.run_id} already created with the run key `only_once` for run_key_sensor"
                in captured.out)

            ticks = instance.get_job_ticks(
                external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SUCCESS