def test_failure_recovery_between_multi_runs(external_repo_context, crash_location, crash_signal):
    # Verify that when the scheduler crashes partway through launching the runs for a
    # multi-run schedule, re-running it launches the remaining run and marks the single
    # tick as SUCCESS with all launched run ids.
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = create_pendulum_time(
            year=2019, month=2, day=28, hour=0, minute=0, second=0
        )
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule("multi_run_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Tell the scheduler subprocess to crash at the given location with the
            # given signal while evaluating this schedule
            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            # Nonzero exit code confirms the injected crash actually happened
            assert scheduler_process.exitcode != 0

            wait_for_all_runs_to_start(instance)

            # Only the first of the schedule's runs was launched before the crash
            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime)

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            # Running again (without crash flags) resumes the interrupted tick
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            # The second run gets launched and the original tick is reused (still one tick)
            assert instance.get_runs_count() == 2
            validate_run_started(instance.get_runs()[0], initial_datetime)

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                [run.run_id for run in instance.get_runs()],
            )
def test_crash_after_submit(external_repo_context, crash_signal, capfd):
    # Verify that if the backfill daemon crashes after submitting all of the runs,
    # resuming it detects the existing runs, skips them, and completes the backfill.
    with instance_for_context(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        external_partition_set = external_repo.get_external_partition_set("simple_partition_set")
        instance.add_backfill(
            PartitionBackfill(
                backfill_id="simple",
                partition_set_origin=external_partition_set.get_external_origin(),
                status=BulkActionStatus.REQUESTED,
                partition_names=["one", "two", "three"],
                from_failure=False,
                reexecution_steps=None,
                tags=None,
                backfill_timestamp=pendulum.now().timestamp(),
            )
        )
        # Crash the daemon subprocess with the given signal right after run submission
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), {"AFTER_SUBMIT": crash_signal}],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        assert launch_process.exitcode != 0
        captured = capfd.readouterr()
        assert (
            captured.out.replace("\r\n", "\n")
            == """2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple
"""
        )

        # Crash happened before the backfill status could be updated, but the runs exist
        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.REQUESTED
        assert instance.get_runs_count() == 3

        # resume backfill
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        launch_process.start()
        launch_process.join(timeout=60)

        captured = capfd.readouterr()
        assert (
            captured.out.replace("\r\n", "\n")
            == """2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple
2021-02-16 18:00:00 - BackfillDaemon - INFO - Found 3 existing runs for backfill simple, skipping
2021-02-16 18:00:00 - BackfillDaemon - INFO - Backfill completed for simple for 3 partitions
"""
        )

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.COMPLETED
        # No new runs were submitted on resume
        assert instance.get_runs_count() == 3
def test_before_submit(external_repo_context, crash_signal, capfd):
    # Verify that if the backfill daemon crashes before submitting any runs,
    # resuming it submits all of the runs and completes the backfill.
    with instance_for_context(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        external_partition_set = external_repo.get_external_partition_set("simple_partition_set")
        instance.add_backfill(
            PartitionBackfill(
                backfill_id="simple",
                partition_set_origin=external_partition_set.get_external_origin(),
                status=BulkActionStatus.REQUESTED,
                partition_names=["one", "two", "three"],
                from_failure=False,
                reexecution_steps=None,
                tags=None,
                backfill_timestamp=pendulum.now().timestamp(),
            )
        )
        # Crash the daemon subprocess before any runs are submitted
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), {"BEFORE_SUBMIT": crash_signal}],
        )
        launch_process.start()
        launch_process.join(timeout=60)
        assert launch_process.exitcode != 0
        assert (
            get_logger_output_from_capfd(capfd, "dagster.daemon.BackfillDaemon")
            == """2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Starting backfill for simple"""
        )

        # No runs were created and the backfill is still pending
        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.REQUESTED
        assert instance.get_runs_count() == 0

        # resume backfill
        launch_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        launch_process.start()
        launch_process.join(timeout=60)

        assert (
            get_logger_output_from_capfd(capfd, "dagster.daemon.BackfillDaemon")
            == """2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Starting backfill for simple
2021-02-16 18:00:00 -0600 - dagster.daemon.BackfillDaemon - INFO - Backfill completed for simple for 3 partitions"""
        )

        backfill = instance.get_backfill("simple")
        assert backfill.status == BulkActionStatus.COMPLETED
        assert instance.get_runs_count() == 3
def test_multi():
    """Run several steps in child processes under one compute-log watch and
    verify each step's captured stdout plus the outer process's prefix."""
    with instance_for_test() as instance:
        run_name = "foo_pipeline"
        run = create_run_for_test(instance, pipeline_name=run_name)
        keys = ["A", "B", "C"]
        manager = instance.compute_log_manager

        with manager.watch(run):
            print("outer 1")  # pylint: disable=print-call
            print("outer 2")  # pylint: disable=print-call
            print("outer 3")  # pylint: disable=print-call

            for key in keys:
                child = multiprocessing.Process(
                    target=execute_inner, args=(key, run, instance.get_ref())
                )
                child.start()
                child.join()

        # Each child process's stdout was captured under its own step key
        for key in keys:
            captured = manager.read_logs_file(run.run_id, key, ComputeIOType.STDOUT)
            assert normalize_file_content(captured.data) == expected_inner_output(key)

        outer = manager.read_logs_file(run.run_id, run_name, ComputeIOType.STDOUT)

        # The way that the multiprocess compute-logging interacts with pytest (which stubs out the
        # sys.stdout fileno) makes this difficult to test. The pytest-captured stdout only captures
        # the stdout from the outer process, not also the inner process
        assert normalize_file_content(outer.data).startswith(expected_outer_prefix())
def test_simple(external_repo_context, capfd):
    """Drive a three-partition backfill to completion in a subprocess and check
    both the final backfill status and the daemon's log output."""
    with instance_for_context(external_repo_context) as (instance, external_repo):
        partition_set = external_repo.get_external_partition_set("simple_partition_set")
        backfill_job = PartitionBackfill(
            backfill_id="simple",
            partition_set_origin=partition_set.get_external_origin(),
            status=BulkActionStatus.REQUESTED,
            partition_names=["one", "two", "three"],
            from_failure=False,
            reexecution_steps=None,
            tags=None,
            backfill_timestamp=pendulum.now().timestamp(),
        )
        instance.add_backfill(backfill_job)

        daemon_process = multiprocessing.Process(
            target=_test_backfill_in_subprocess,
            args=[instance.get_ref(), None],
        )
        daemon_process.start()
        daemon_process.join(timeout=60)

        assert instance.get_backfill("simple").status == BulkActionStatus.COMPLETED

        captured = capfd.readouterr()
        expected = (
            "2021-02-16 18:00:00 - BackfillDaemon - INFO - Starting backfill for simple\n"
            "2021-02-16 18:00:00 - BackfillDaemon - INFO - Backfill completed for simple for 3 partitions\n"
        )
        assert captured.out.replace("\r\n", "\n") == expected
def execute_child_process_command(command):
    """Execute a ChildProcessCommand in a new process.

    This function starts a new process whose execution target is a ChildProcessCommand wrapped by
    _execute_command_in_child_process; polls the queue for events yielded by the child process
    until the process dies and the queue is empty.

    This function yields a complex set of objects to enable having multiple child process
    executions in flight:
        * None - nothing has happened, yielded to enable cooperative multitasking other iterators
        * ChildProcessEvent - Family of objects that communicates state changes in the child
          process
        * KeyboardInterrupt - Yielded in the case that an interrupt was received while polling the
          child process. Yielded instead of raised to allow forwarding of the interrupt to the
          child and completion of the iterator for this child and any others that may be executing
        * The actual values yielded by the child process command

    Args:
        command (ChildProcessCommand): The command to execute in the child process.

    Warning: if the child process is in an infinite loop, this will also infinitely loop.
    """
    check.inst_param(command, "command", ChildProcessCommand)

    event_queue = multiprocessing.Queue()

    try:
        process = multiprocessing.Process(
            target=_execute_command_in_child_process, args=(event_queue, command)
        )
        process.start()

        completed_properly = False

        while not completed_properly:
            event = _poll_for_event(process, event_queue)

            # Sentinel: the child has exited and there is nothing left to drain
            if event == PROCESS_DEAD_AND_QUEUE_EMPTY:
                break

            yield event

            # A done or system-error event marks a clean end of the child's protocol
            if isinstance(event, (ChildProcessDoneEvent, ChildProcessSystemErrorEvent)):
                completed_properly = True

        if not completed_properly:
            # The child died without signalling completion -- surface its exit code
            # TODO Figure out what to do about stderr/stdout
            raise ChildProcessCrashException(exit_code=process.exitcode)

        process.join()
    finally:
        # Always release the queue's feeder resources, even on crash/interrupt
        event_queue.close()
def test_concurrent_sqlite_event_log_connections(self, storage):
    """Hammer one sqlite event-log directory from several processes and assert
    that none of them reported an exception."""
    base_dir = storage._base_dir  # pylint: disable=protected-access
    error_queue = multiprocessing.Queue()

    workers = [
        multiprocessing.Process(target=self.cmd, args=(error_queue, base_dir))
        for _ in range(5)
    ]
    for worker in workers:
        worker.start()

    joined = 0
    for worker in workers:
        worker.join()
        joined += 1
    assert joined == 5

    errors = []
    while not error_queue.empty():
        errors.append(error_queue.get())
    assert not errors, errors
def test_concurrent_sqlite_event_log_connections():
    """Hammer one temporary sqlite event-log directory from several processes
    and assert that none of them reported an exception."""
    error_queue = multiprocessing.Queue()
    with tempfile.TemporaryDirectory() as base_dir:
        workers = [
            multiprocessing.Process(target=cmd, args=(error_queue, base_dir))
            for _ in range(5)
        ]
        for worker in workers:
            worker.start()

        joined = 0
        for worker in workers:
            worker.join()
            joined += 1
        assert joined == 5

        errors = []
        while not error_queue.empty():
            errors.append(error_queue.get())
        assert not errors, errors
def test_failure_recovery_after_run_created(external_repo_context, crash_location, crash_signal, capfd):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = pendulum.datetime(year=2019, month=2, day=27, hour=0, minute=0, second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Crash the scheduler subprocess at the parametrized location/signal
            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode != 0

            # Drain the output captured from the crashed scheduler process
            capfd.readouterr()

            # The tick was left STARTED, and the run was created before the crash
            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED

            assert instance.get_runs_count() == 1

            if crash_location == "RUN_CREATED":
                run = instance.get_runs()[0]
                # Run was created, but hasn't launched yet
                assert run.tags[SCHEDULED_EXECUTION_TIME_TAG] == frozen_datetime.isoformat()
                assert run.tags[PARTITION_NAME_TAG] == "2019-02-26"
                assert run.status == PipelineRunStatus.NOT_STARTED
            else:
                # The run was created and launched - running again should do nothing other than
                # moving the tick to success state.

                # The fact that we need to add this line indicates that there is still a theoretical
                # possible race condition - if the scheduler fails after launching a run
                # and then runs again between when the run was launched and when its status is
                # changed to STARTED by the executor, we could end up launching the same run twice.
                # Run queueing or some other way to immediately identify that a run was launched
                # would help eliminate this race condition. For now, eliminate the possibility by
                # waiting for the run to start before running the scheduler again.
                wait_for_all_runs_to_start(instance)

                run = instance.get_runs()[0]
                validate_run_started(
                    instance.get_runs()[0], frozen_datetime, pendulum.datetime(2019, 2, 26)
                )
                assert run.status in [PipelineRunStatus.STARTED, PipelineRunStatus.SUCCESS]

        frozen_datetime = frozen_datetime.add(minutes=5)
        with pendulum.test(frozen_datetime):
            # Running again just launches the existing run and marks the tick as success
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            # Still exactly one run -- nothing was launched twice
            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0], initial_datetime, pendulum.datetime(2019, 2, 26)
            )

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            captured = capfd.readouterr()

            # The resumed scheduler logs that the run already existed / completed
            if crash_location == "RUN_CREATED":
                assert (
                    "Run {run_id} already created for this execution of simple_schedule".format(
                        run_id=instance.get_runs()[0].run_id
                    )
                    in captured.out
                )
            else:
                assert (
                    "Run {run_id} already completed for this execution of simple_schedule".format(
                        run_id=instance.get_runs()[0].run_id
                    )
                    in captured.out
                )
def StartRun(self, request, _context):
    """gRPC handler: start a pipeline run in a subprocess and report whether startup succeeded.

    Deserializes ExecuteExternalPipelineArgs from the request, spawns the execution
    subprocess, registers it for cancellation, then polls its event queue until the
    subprocess reports success, reports an error, or dies.
    """
    if self._shutdown_once_executions_finish_event.is_set():
        # Refuse new work once the server has been told to wind down
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message="Tried to start a run on a server after telling it to shut down",
                    serializable_error_info=None,
                )
            )
        )

    try:
        execute_run_args = check.inst(
            deserialize_json_to_dagster_namedtuple(request.serialized_execute_run_args),
            ExecuteExternalPipelineArgs,
        )
        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        # Setup failures are reported in-band so the client always gets a reply
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message=None,
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                )
            )
        )

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=start_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    # Register the process under the lock so CanCancel/Cancel see a consistent view
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    success = None
    message = None
    serializable_error_info = None

    # Poll until the subprocess signals startup success or failure
    while success is None:
        time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        # We use `get_nowait()` instead of `get()` so that we can handle the case where the
        # execution process has died unexpectedly -- `get()` would hang forever in that case
        try:
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                success = False
                message = (
                    "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                    "exit code {exit_code}".format(
                        run_id=run_id,
                        exit_code=execution_process.exitcode,
                    )
                )
                serializable_error_info = serializable_error_info_from_exc_info(sys.exc_info())
        else:
            if isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                success = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete
            ):
                continue
            if isinstance(dagster_event_or_ipc_error_message_or_done, IPCErrorMessage):
                success = False
                message = dagster_event_or_ipc_error_message_or_done.message
                serializable_error_info = (
                    dagster_event_or_ipc_error_message_or_done.serializable_error_info
                )

    # Ensure that if the run failed, we remove it from the executions map before
    # returning so that CanCancel will never return True
    if not success:
        with self._execution_lock:
            self._clear_run(run_id)

    return api_pb2.StartRunReply(
        serialized_start_run_result=serialize_dagster_namedtuple(
            StartRunResult(
                success=success,
                message=message,
                serializable_error_info=serializable_error_info,
            )
        )
    )
def test_failure_recovery_before_run_created(external_repo_context, crash_location, crash_signal, capfd):
    with central_timezone():
        # Verify that if the scheduler crashes or is interrupted before a run is created,
        # it will create exactly one tick/run when it is re-launched
        with instance_with_schedules(external_repo_context) as (instance, external_repo):
            initial_datetime = datetime(
                year=2019,
                month=2,
                day=27,
                hour=0,
                minute=0,
                second=0,
                tzinfo=get_utc_timezone(),
            )
            external_schedule = external_repo.get_external_schedule("simple_schedule")
            with freeze_time(initial_datetime) as frozen_datetime:
                instance.start_schedule_and_update_storage_state(external_schedule)

                # Crash the scheduler subprocess at the parametrized location/signal
                debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[instance.get_ref(), get_current_datetime_in_utc(), debug_crash_flags],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)
                assert scheduler_process.exitcode != 0

                captured = capfd.readouterr()
                assert (
                    captured.out
                    == """2019-02-26 18:00:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
"""
                )

                # The tick was recorded as STARTED before the crash, but no run exists yet
                ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
                assert len(ticks) == 1
                assert ticks[0].status == ScheduleTickStatus.STARTED

                assert instance.get_runs_count() == 0

                frozen_datetime.tick(delta=timedelta(minutes=5))

                # Re-launching the scheduler resumes the interrupted tick
                scheduler_process = multiprocessing.Process(
                    target=_test_launch_scheduled_runs_in_subprocess,
                    args=[instance.get_ref(), get_current_datetime_in_utc(), None],
                )
                scheduler_process.start()
                scheduler_process.join(timeout=60)
                assert scheduler_process.exitcode == 0

                assert instance.get_runs_count() == 1
                wait_for_all_runs_to_start(instance)
                validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

                # Still exactly one tick, now SUCCESS with the launched run id
                ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
                assert len(ticks) == 1
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

                captured = capfd.readouterr()
                assert (
                    captured.out
                    == """2019-02-26 18:05:00 - dagster-scheduler - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - dagster-scheduler - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - dagster-scheduler - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - dagster-scheduler - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id)
                )
def test_failure_recovery_before_run_created(external_repo_context, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
            tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Crash the scheduler subprocess at the parametrized location/signal
            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode != 0

            # The tick was recorded as STARTED before the crash, but no run exists yet
            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 0

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Re-launching the scheduler resumes the interrupted tick
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

            # Still exactly one tick, now SUCCESS with the launched run id
            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def ExecuteRun(self, request, _context):
    """gRPC handler: execute a pipeline run in a subprocess, streaming its events back.

    Yields serialized dagster events (or IPCErrorMessages on failure) wrapped in
    ExecuteRunEvent messages until the subprocess reports completion or dies.
    """
    if self._shutdown_once_executions_finish_event.is_set():
        yield api_pb2.ExecuteRunEvent(
            serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                IPCErrorMessage(
                    serializable_error_info=None,
                    message="Tried to start a run on a server after telling it to shut down",
                )
            )
        )
        # FIX: previously execution fell through here and started the run anyway,
        # despite having just reported that the server is shutting down. Mirror the
        # StartRun handler, which returns after reporting the shutdown error.
        return

    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args
        )
        check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)
        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        # Setup failures are streamed in-band so the client always gets an event
        yield api_pb2.ExecuteRunEvent(
            serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                IPCErrorMessage(
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                    message="Error during RPC setup for ExecuteRun",
                )
            )
        )
        return

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=execute_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )
    # Register the process under the lock so cancellation sees a consistent view
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    done = False
    while not done:
        try:
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        IPCErrorMessage(
                            serializable_error_info=serializable_error_info_from_exc_info(
                                sys.exc_info()
                            ),
                            message=(
                                "GRPC server: Subprocess for {run_id} terminated unexpectedly"
                            ).format(run_id=run_id),
                        )
                    )
                )
                done = True
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        else:
            if isinstance(dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete):
                done = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                # Startup marker carries no event payload for the client
                continue
            else:
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        dagster_event_or_ipc_error_message_or_done
                    )
                )

    # Deregister the finished run so cancellation APIs no longer see it
    with self._execution_lock:
        if run_id in self._executions:
            del self._executions[run_id]
        if run_id in self._termination_events:
            del self._termination_events[run_id]
def test_failure_before_run_created(external_repo_context, crash_location, crash_signal, capfd):
    # Verify that a sensor tick interrupted before its run is created leaves a STARTED
    # tick behind, and that the next evaluation resumes from the last successful tick.
    frozen_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, hour=0, minute=0, second=1, tz="UTC"),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR, JobStatus.RUNNING)
            )

            # create a tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.SKIPPED
            captured = capfd.readouterr()

            # create a starting tick, but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=31), debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            assert launch_process.exitcode != 0
            captured = capfd.readouterr()

            # The crashed evaluation left a STARTED tick and created no run
            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.STARTED
            assert not int(ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=62), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n")
                == f"""2019-02-27 18:01:03 - SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Launching run for simple_sensor
2019-02-27 18:01:03 - SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor
"""
            )

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == JobTickStatus.SUCCESS
def test_failure_after_run_launched(external_repo_context, crash_location, crash_signal, capfd):
    # Verify that when a run-key sensor crashes after launching its run, the next
    # evaluation detects the already-used run key and skips instead of re-launching.
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=28,
            hour=0,
            minute=0,
            second=0,
            tz="UTC",
        ),
        "US/Central",
    )
    with instance_with_sensors(external_repo_context) as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("run_key_sensor")
            instance.add_job_state(
                JobState(external_sensor.get_external_origin(), JobType.SENSOR, JobStatus.RUNNING)
            )

            # create a run, launch but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode != 0

            # The crashed evaluation left a STARTED tick, but the run was launched
            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())

            assert len(ticks) == 1
            assert ticks[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 1

            run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"
            # Drain output captured so far before the resumed evaluation
            capfd.readouterr()

            launch_process = multiprocessing.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=1), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            # No second run was launched -- the run key deduplicated it
            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()

            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out
            )

            ticks = instance.get_job_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == JobTickStatus.SKIPPED
def test_failure_recovery_after_tick_success(external_repo_context, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = pendulum.datetime(year=2019, month=2, day=27, hour=0, minute=0, second=0)
        frozen_datetime = initial_datetime.add()
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with pendulum.test(frozen_datetime):
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Crash the scheduler subprocess at the parametrized location/signal
            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode != 0

            # As above there's a possible race condition here if the scheduler crashes
            # and launches the same run twice if we crash right after the launch and re-run
            # before the run actually starts
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(
                instance.get_runs()[0], initial_datetime, pendulum.datetime(2019, 2, 26)
            )

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1

            # A terminate signal interrupts before the tick status was written, leaving
            # it STARTED; other crash signals fire after it was marked SUCCESS
            if crash_signal == get_terminate_signal():
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    JobTickStatus.STARTED,
                    None,
                )
            else:
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    JobTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

        frozen_datetime = frozen_datetime.add(minutes=1)
        with pendulum.test(frozen_datetime):
            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            # Still exactly one run -- nothing was launched twice
            assert instance.get_runs_count() == 1
            validate_run_started(
                instance.get_runs()[0], initial_datetime, pendulum.datetime(2019, 2, 26)
            )

            ticks = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
def StartRun(self, request, _context): execute_run_args = check.inst( deserialize_json_to_dagster_namedtuple( request.serialized_execute_run_args), ExecuteRunArgs, ) try: execute_run_args = check.inst( deserialize_json_to_dagster_namedtuple( request.serialized_execute_run_args), ExecuteRunArgs, ) run_id = execute_run_args.pipeline_run_id recon_pipeline = self._recon_pipeline_from_origin( execute_run_args.pipeline_origin) except: # pylint: disable=bare-except return api_pb2.StartRunReply( serialized_start_run_result=serialize_dagster_namedtuple( StartRunResult( success=False, message=None, serializable_error_info= serializable_error_info_from_exc_info(sys.exc_info()), ))) event_queue = multiprocessing.Queue() termination_event = multiprocessing.Event() execution_process = multiprocessing.Process( target=start_run_in_subprocess, args=[ request.serialized_execute_run_args, recon_pipeline, event_queue, termination_event, ], ) with self._execution_lock: execution_process.start() self._executions[run_id] = execution_process self._termination_events[run_id] = termination_event success = None message = None serializable_error_info = None while success is None: time.sleep(EVENT_QUEUE_POLL_INTERVAL) # We use `get_nowait()` instead of `get()` so that we can handle the case where the # execution process has died unexpectedly -- `get()` would hang forever in that case try: dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait( ) except queue.Empty: if not execution_process.is_alive(): # subprocess died unexpectedly success = False message = ( 'GRPC server: Subprocess for {run_id} terminated unexpectedly with ' 'exit code {exit_code}'.format( run_id=run_id, exit_code=execution_process.exitcode, )) serializable_error_info = serializable_error_info_from_exc_info( sys.exc_info()) else: if isinstance(dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful): success = True elif isinstance(dagster_event_or_ipc_error_message_or_done, 
RunInSubprocessComplete): continue if isinstance(dagster_event_or_ipc_error_message_or_done, IPCErrorMessage): success = False message = dagster_event_or_ipc_error_message_or_done.message serializable_error_info = ( dagster_event_or_ipc_error_message_or_done. serializable_error_info) return api_pb2.StartRunReply( serialized_start_run_result=serialize_dagster_namedtuple( StartRunResult( success=success, message=message, serializable_error_info=serializable_error_info, )))
def test_failure_recovery_before_run_created(instance, external_repo, crash_location, crash_signal): # Verify that if the scheduler crashes or is interrupted before a run is created, # it will create exactly one tick/run when it is re-launched initial_datetime = to_timezone( create_pendulum_time(year=2019, month=2, day=27, hour=0, minute=0, second=0, tz="UTC"), "US/Central", ) frozen_datetime = initial_datetime.add() external_schedule = external_repo.get_external_schedule("simple_schedule") with pendulum.test(frozen_datetime): instance.start_schedule_and_update_storage_state(external_schedule) debug_crash_flags = { external_schedule.name: { crash_location: crash_signal } } scheduler_process = multiprocessing.Process( target=_test_launch_scheduled_runs_in_subprocess, args=[instance.get_ref(), frozen_datetime, debug_crash_flags], ) scheduler_process.start() scheduler_process.join(timeout=60) assert scheduler_process.exitcode != 0 ticks = instance.get_job_ticks( external_schedule.get_external_origin_id()) assert len(ticks) == 1 assert ticks[0].status == TickStatus.STARTED assert instance.get_runs_count() == 0 frozen_datetime = frozen_datetime.add(minutes=5) with pendulum.test(frozen_datetime): scheduler_process = multiprocessing.Process( target=_test_launch_scheduled_runs_in_subprocess, args=[instance.get_ref(), frozen_datetime, None], ) scheduler_process.start() scheduler_process.join(timeout=60) assert scheduler_process.exitcode == 0 assert instance.get_runs_count() == 1 wait_for_all_runs_to_start(instance) validate_run_exists( instance.get_runs()[0], execution_time=initial_datetime, partition_time=create_pendulum_time(2019, 2, 26), ) ticks = instance.get_job_ticks( external_schedule.get_external_origin_id()) assert len(ticks) == 1 validate_tick( ticks[0], external_schedule, initial_datetime, TickStatus.SUCCESS, [instance.get_runs()[0].run_id], )
def test_failure_recovery_before_run_created(external_repo_context, crash_location, crash_signal, capfd):
    # A crash before the run exists must still yield exactly one tick and one
    # run once the scheduler is relaunched.
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        start_time = pendulum.datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0
        ).in_tz("US/Central")
        now = start_time.add()
        external_schedule = external_repo.get_external_schedule("simple_schedule")

        with pendulum.test(now):
            instance.start_schedule_and_update_storage_state(external_schedule)

            # Ask the daemon subprocess to crash itself at the given location/signal.
            crash_flags = {external_schedule.name: {crash_location: crash_signal}}
            daemon_proc = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), now, crash_flags],
            )
            daemon_proc.start()
            daemon_proc.join(timeout=60)
            assert daemon_proc.exitcode != 0

            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n")
                == """2019-02-26 18:00:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:00:00 - SchedulerDaemon - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
"""
            )

            # The tick was recorded as started, but no run was created.
            tick_list = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(tick_list) == 1
            assert tick_list[0].status == JobTickStatus.STARTED
            assert instance.get_runs_count() == 0

        now = now.add(minutes=5)
        with pendulum.test(now):
            # Relaunch with no crash flags; the interrupted tick should complete.
            daemon_proc = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), now, None],
            )
            daemon_proc.start()
            daemon_proc.join(timeout=60)
            assert daemon_proc.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=start_time,
                partition_time=pendulum.datetime(2019, 2, 26),
            )

            tick_list = instance.get_job_ticks(external_schedule.get_external_origin_id())
            assert len(tick_list) == 1
            validate_tick(
                tick_list[0],
                external_schedule,
                start_time,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            captured = capfd.readouterr()
            assert (
                captured.out.replace("\r\n", "\n")
                == """2019-02-26 18:05:00 - SchedulerDaemon - INFO - Checking for new runs for the following schedules: simple_schedule
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Launching run for simple_schedule at 2019-02-27 00:00:00+0000
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Resuming previously interrupted schedule execution
2019-02-26 18:05:00 - SchedulerDaemon - INFO - Completed scheduled launch of run {run_id} for simple_schedule
""".format(run_id=instance.get_runs()[0].run_id)
            )
def test_failure_after_run_created_before_run_launched(external_repo_context, crash_location, crash_signal, capfd): frozen_datetime = pendulum.datetime( year=2019, month=2, day=28, hour=0, minute=0, second=0, ).in_tz("US/Central") with instance_with_sensors(external_repo_context) as (instance, external_repo): with pendulum.test(frozen_datetime): external_sensor = external_repo.get_external_sensor( "run_key_sensor") instance.add_job_state( JobState(external_sensor.get_external_origin(), JobType.SENSOR, JobStatus.RUNNING)) # create a starting tick, but crash debug_crash_flags = { external_sensor.name: { crash_location: crash_signal } } launch_process = multiprocessing.Process( target=_test_launch_sensor_runs_in_subprocess, args=[instance.get_ref(), frozen_datetime, debug_crash_flags], ) launch_process.start() launch_process.join(timeout=60) assert launch_process.exitcode != 0 ticks = instance.get_job_ticks( external_sensor.get_external_origin_id()) assert len(ticks) == 1 assert ticks[0].status == JobTickStatus.STARTED assert instance.get_runs_count() == 1 run = instance.get_runs()[0] # Run was created, but hasn't launched yet assert run.status == PipelineRunStatus.NOT_STARTED assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor" assert run.tags.get(RUN_KEY_TAG) == "only_once" # clear output capfd.readouterr() launch_process = multiprocessing.Process( target=_test_launch_sensor_runs_in_subprocess, args=[ instance.get_ref(), frozen_datetime.add(seconds=1), None ], ) launch_process.start() launch_process.join(timeout=60) assert launch_process.exitcode == 0 wait_for_all_runs_to_start(instance) assert instance.get_runs_count() == 1 run = instance.get_runs()[0] captured = capfd.readouterr() assert ( f"Run {run.run_id} already created with the run key `only_once` for run_key_sensor" in captured.out) ticks = instance.get_job_ticks( external_sensor.get_external_origin_id()) assert len(ticks) == 2 assert ticks[0].status == JobTickStatus.SUCCESS