def start_termination_thread(termination_event):
    """Start a daemon thread that runs ``_kill_on_event`` on the given event.

    Args:
        termination_event: Must be an instance of the type produced by
            ``multiprocessing.Event()``; this is enforced through
            ``check.inst_param`` before the thread is created.
    """
    # The accepted type is taken from a throwaway instance created by
    # calling multiprocessing.Event().
    expected_event_type = type(multiprocessing.Event())
    check.inst_param(termination_event, "termination_event", ttype=expected_event_type)

    watcher = threading.Thread(
        name="kill-on-event",
        target=_kill_on_event,
        args=(termination_event,),
    )
    # Daemon thread, so it does not keep the interpreter alive on exit.
    watcher.daemon = True
    watcher.start()
def execute(self, pipeline_context, execution_plan):
    """Drive ``execution_plan`` by running steps out of process, yielding events.

    This is a generator. It yields an engine event when it starts, every
    non-``None`` event produced by the per-step iterators returned from
    ``self.execute_step_out_of_process``, events for crashed child processes
    and skipped steps, and a closing engine event once everything is done.

    Args:
        pipeline_context: A ``SystemPipelineExecutionContext``.
        execution_plan: The ``ExecutionPlan`` to run.

    Raises:
        DagsterSubprocessError: If any truthy entry was recorded in the
            ``errors`` dict handed to the step subprocesses.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    max_in_flight = self.max_concurrent

    startup_message = (
        "Executing steps using multiprocess engine: parent process (pid: {pid})"
    ).format(pid=os.getpid())
    startup_data = EngineEventData.multiprocess(
        os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
    )
    yield DagsterEvent.engine_event(
        pipeline_context, startup_message, event_specific_data=startup_data
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retries=self.retries) as active_execution:
            running = {}  # step key -> iterator returned by execute_step_out_of_process
            errors = {}  # shared with the step iterators; read back after the loop
            term_events = {}  # step key -> multiprocessing.Event for that step
            stopping = False

            # Keep looping while iterators are still live, or while we are not
            # stopping and the plan still has work left.
            while not (stopping or active_execution.is_complete) or running:
                try:
                    # Launch new steps until the concurrency limit is reached.
                    while not stopping and len(running) < max_in_flight:
                        free_slots = max_in_flight - len(running)
                        ready_steps = active_execution.get_steps_to_execute(limit=free_slots)
                        if not ready_steps:
                            break

                        for step in ready_steps:
                            step_context = pipeline_context.for_step(step)
                            term_events[step.key] = multiprocessing.Event()
                            running[step.key] = self.execute_step_out_of_process(
                                step_context, step, errors, term_events
                            )

                    # Pull one item from every live iterator; remember the
                    # exhausted or crashed ones so they can be removed after
                    # the dict has been fully iterated.
                    finished_keys = []
                    for key, step_iter in running.items():
                        try:
                            step_event = next(step_iter)
                            if step_event is not None:
                                yield step_event
                                active_execution.handle_event(step_event)
                        except ChildProcessCrashException as crash:
                            crash_info = serializable_error_info_from_exc_info(sys.exc_info())
                            crash_message = (
                                "Multiprocess executor: child process for step {step_key} "
                                "unexpectedly exited with code {exit_code}"
                            ).format(step_key=key, exit_code=crash.exit_code)
                            yield DagsterEvent.engine_event(
                                pipeline_context,
                                crash_message,
                                EngineEventData.engine_error(crash_info),
                                step_key=key,
                            )

                            crashed_step = active_execution.get_step_by_key(key)
                            failure_event = DagsterEvent.step_failure_event(
                                step_context=pipeline_context.for_step(crashed_step),
                                step_failure_data=StepFailureData(
                                    error=crash_info, user_failure_data=None
                                ),
                            )
                            active_execution.handle_event(failure_event)
                            yield failure_event
                            finished_keys.append(key)
                        except StopIteration:
                            finished_keys.append(key)

                    # Drop finished iterators and mark their steps complete.
                    for key in finished_keys:
                        running.pop(key)
                        # A set termination event on a finished step switches
                        # the whole loop into stopping mode.
                        if term_events.pop(key).is_set():
                            stopping = True
                        active_execution.verify_complete(pipeline_context, key)

                    # Surface skips caused by failures or uncovered inputs.
                    for skip_event in active_execution.skipped_step_events_iterator(
                        pipeline_context
                    ):
                        yield skip_event

                # In the very small chance that we get interrupted in this coordination
                # section and not while polling the subprocesses for events - try to
                # clean up gracefully
                except KeyboardInterrupt:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events)),
                    )
                    stopping = True
                    for term_event in term_events.values():
                        term_event.set()

            child_errors = {pid: err for pid, err in errors.items() if err}
            if child_errors:
                error_list = "\n".join(
                    [
                        "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                        for pid, err in child_errors.items()
                    ]
                )
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list=error_list
                    ),
                    subprocess_error_infos=list(child_errors.values()),
                )

    # Emitted after the timing scope has been exited.
    exit_message = (
        "Multiprocess engine: parent process exiting after {duration} (pid: {pid})"
    ).format(duration=format_duration(timer_result.millis), pid=os.getpid())
    yield DagsterEvent.engine_event(
        pipeline_context,
        exit_message,
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def StartRun(self, request, _context):
    """Launch a run in a subprocess and report whether it started.

    The subprocess is registered in ``self._executions`` and
    ``self._termination_events`` under the run id, then its event queue is
    polled until the outcome of the start-up is known.

    Args:
        request: Carries ``serialized_execute_run_args``, which must
            deserialize to ``ExecuteExternalPipelineArgs``.
        _context: Unused.

    Returns:
        An ``api_pb2.StartRunReply`` wrapping a serialized ``StartRunResult``.
        ``success`` is ``False`` when the server is shutting down, when
        setup raises, when the subprocess sends an ``IPCErrorMessage``, or
        when the subprocess dies before reporting anything.
    """

    def _reply(success, message, serializable_error_info):
        # Every exit path of this method returns a reply of this shape.
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=success,
                    message=message,
                    serializable_error_info=serializable_error_info,
                )
            )
        )

    if self._shutdown_once_executions_finish_event.is_set():
        return _reply(
            False,
            "Tried to start a run on a server after telling it to shut down",
            None,
        )

    try:
        execute_run_args = check.inst(
            deserialize_json_to_dagster_namedtuple(request.serialized_execute_run_args),
            ExecuteExternalPipelineArgs,
        )
        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        return _reply(False, None, serializable_error_info_from_exc_info(sys.exc_info()))

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=start_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (execution_process, execute_run_args.instance_ref)
        self._termination_events[run_id] = termination_event

    success = None
    message = None
    serializable_error_info = None

    while success is None:
        time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        # We use `get_nowait()` instead of `get()` so that we can handle the case where the
        # execution process has died unexpectedly -- `get()` would hang forever in that case
        try:
            received = event_queue.get_nowait()
        except queue.Empty:
            if execution_process.is_alive():
                continue
            # subprocess died unexpectedly
            success = False
            message = (
                "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                "exit code {exit_code}".format(
                    run_id=run_id,
                    exit_code=execution_process.exitcode,
                )
            )
            serializable_error_info = serializable_error_info_from_exc_info(sys.exc_info())
            continue

        if isinstance(received, StartRunInSubprocessSuccessful):
            success = True
        elif isinstance(received, RunInSubprocessComplete):
            continue

        if isinstance(received, IPCErrorMessage):
            success = False
            message = received.message
            serializable_error_info = received.serializable_error_info

    # Ensure that if the run failed, we remove it from the executions map before
    # returning so that CanCancel will never return True
    if not success:
        with self._execution_lock:
            self._clear_run(run_id)

    return _reply(success, message, serializable_error_info)
def ExecuteRun(self, request, _context):
    """Execute a run in a subprocess and stream its events to the caller.

    This is a generator. Each yielded item is an ``api_pb2.ExecuteRunEvent``
    wrapping either a message taken from the subprocess event queue or an
    ``IPCErrorMessage`` describing a failure detected here.

    The stream ends when:
      * the server has been told to shut down (one error event is yielded
        and no subprocess is started),
      * setup raises (one error event is yielded),
      * the subprocess sends ``RunInSubprocessComplete``, or
      * the subprocess is found dead while the queue is empty (one error
        event is yielded).

    ``StartRunInSubprocessSuccessful`` messages are not forwarded.

    Args:
        request: Carries ``serialized_execute_run_args``, which must
            deserialize to ``ExecuteRunArgs``.
        _context: Unused.
    """
    if self._shutdown_once_executions_finish_event.is_set():
        yield api_pb2.ExecuteRunEvent(
            serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                IPCErrorMessage(
                    serializable_error_info=None,
                    message="Tried to start a run on a server after telling it to shut down",
                )
            )
        )
        # Stop here. Falling through would launch the run right after
        # reporting to the caller that it was refused.
        return

    try:
        execute_run_args = deserialize_json_to_dagster_namedtuple(
            request.serialized_execute_run_args
        )
        check.inst_param(execute_run_args, "execute_run_args", ExecuteRunArgs)

        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        yield api_pb2.ExecuteRunEvent(
            serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                IPCErrorMessage(
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                    message="Error during RPC setup for ExecuteRun",
                )
            )
        )
        return

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=execute_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    # Start and register under the lock so the bookkeeping maps are updated
    # together with the process start.
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    done = False
    while not done:
        try:
            # We use `get_nowait()` instead of `get()` so that we can handle the case where the
            # execution process has died unexpectedly -- `get()` would hang forever in that case
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        IPCErrorMessage(
                            serializable_error_info=serializable_error_info_from_exc_info(
                                sys.exc_info()
                            ),
                            message=(
                                "GRPC server: Subprocess for {run_id} terminated unexpectedly"
                            ).format(run_id=run_id),
                        )
                    )
                )
                done = True
            # Nothing was queued; wait before polling again.
            time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        else:
            if isinstance(dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete):
                done = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                continue
            else:
                yield api_pb2.ExecuteRunEvent(
                    serialized_dagster_event_or_ipc_error_message=serialize_dagster_namedtuple(
                        dagster_event_or_ipc_error_message_or_done
                    )
                )

    # The run is over: remove its bookkeeping entries if they are still present.
    with self._execution_lock:
        if run_id in self._executions:
            del self._executions[run_id]
        if run_id in self._termination_events:
            del self._termination_events[run_id]
def execute(self, pipeline_context, execution_plan):
    """Drive ``execution_plan`` by running steps out of process, yielding events.

    This is a generator. It yields an engine event when it starts, every
    non-``None`` event produced by the per-step iterators returned from
    ``self.execute_step_out_of_process``, events for crashed child processes,
    the events from ``active_execution.plan_events_iterator`` and a closing
    engine event once everything is done.

    Args:
        pipeline_context: A ``SystemPipelineExecutionContext``.
        execution_plan: The ``ExecutionPlan`` to run.

    Raises:
        DagsterExecutionInterruptedError: If termination was requested, no
            step iterator is left, and every recorded child error has
            ``cls_name == "DagsterExecutionInterruptedError"`` (including
            the case of no recorded errors at all).
        DagsterSubprocessError: Otherwise, if any truthy entry was recorded
            in the ``errors`` dict handed to the step subprocesses.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    max_in_flight = self.max_concurrent

    startup_message = (
        "Executing steps using multiprocess executor: parent process (pid: {pid})"
    ).format(pid=os.getpid())
    startup_data = EngineEventData.multiprocess(
        os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
    )
    yield DagsterEvent.engine_event(
        pipeline_context, startup_message, event_specific_data=startup_data
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        with execution_plan.start(retry_mode=self.retries) as active_execution:
            running = {}  # step key -> iterator returned by execute_step_out_of_process
            errors = {}  # shared with the step iterators; read back after the loop
            term_events = {}  # step key -> multiprocessing.Event for that step
            stopping = False

            # Keep looping while iterators are still live, or while we are not
            # stopping and the plan still has work left.
            while not (stopping or active_execution.is_complete) or running:
                if active_execution.check_for_interrupts():
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        "Multiprocess executor: received termination signal - "
                        "forwarding to active child processes",
                        EngineEventData.interrupted(list(term_events)),
                    )
                    stopping = True
                    active_execution.mark_interrupted()
                    for term_event in term_events.values():
                        term_event.set()

                # Launch new steps until the concurrency limit is reached.
                while not stopping and len(running) < max_in_flight:
                    free_slots = max_in_flight - len(running)
                    ready_steps = active_execution.get_steps_to_execute(limit=free_slots)
                    if not ready_steps:
                        break

                    for step in ready_steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        running[step.key] = self.execute_step_out_of_process(
                            step_context,
                            step,
                            errors,
                            term_events,
                            active_execution.get_known_state(),
                        )

                # Pull one item from every live iterator; remember the exhausted
                # or crashed ones so they can be removed after the dict has been
                # fully iterated.
                finished_keys = []
                for key, step_iter in running.items():
                    try:
                        step_event = next(step_iter)
                        if step_event is not None:
                            yield step_event
                            active_execution.handle_event(step_event)
                    except ChildProcessCrashException as crash:
                        crash_info = serializable_error_info_from_exc_info(sys.exc_info())
                        crash_message = (
                            "Multiprocess executor: child process for step {step_key} "
                            "unexpectedly exited with code {exit_code}"
                        ).format(step_key=key, exit_code=crash.exit_code)
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            crash_message,
                            EngineEventData.engine_error(crash_info),
                            step_handle=active_execution.get_step_by_key(key).handle,
                        )

                        crashed_step = active_execution.get_step_by_key(key)
                        failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(crashed_step),
                            step_failure_data=StepFailureData(
                                error=crash_info, user_failure_data=None
                            ),
                        )
                        active_execution.handle_event(failure_event)
                        yield failure_event
                        finished_keys.append(key)
                    except StopIteration:
                        finished_keys.append(key)

                # Drop finished iterators and mark their steps complete.
                for key in finished_keys:
                    running.pop(key)
                    term_events.pop(key)
                    active_execution.verify_complete(pipeline_context, key)

                # process skipped and abandoned steps
                yield from active_execution.plan_events_iterator(pipeline_context)

            child_errors = {pid: err for pid, err in errors.items() if err}

            # After termination starts, raise an interrupted exception once all subprocesses
            # have finished cleaning up (and the only errors were from being interrupted)
            only_interrupt_errors = (
                stopping
                and not running
                and all(
                    [
                        err_info.cls_name == "DagsterExecutionInterruptedError"
                        for err_info in child_errors.values()
                    ]
                )
            )
            if only_interrupt_errors:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: interrupted all active child processes",
                    event_specific_data=EngineEventData(),
                )
                raise DagsterExecutionInterruptedError()

            if child_errors:
                error_list = "\n".join(
                    [
                        "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                        for pid, err in child_errors.items()
                    ]
                )
                raise DagsterSubprocessError(
                    "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                        error_list=error_list
                    ),
                    subprocess_error_infos=list(child_errors.values()),
                )

    # Emitted after the timing scope has been exited.
    exit_message = (
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})"
    ).format(duration=format_duration(timer_result.millis), pid=os.getpid())
    yield DagsterEvent.engine_event(
        pipeline_context,
        exit_message,
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def StartRun(self, request, _context):
    """Launch a run in a subprocess and report whether it started.

    The subprocess is registered in ``self._executions`` and
    ``self._termination_events`` under the run id, then its event queue is
    polled until the outcome of the start-up is known.

    Args:
        request: Carries ``serialized_execute_run_args``, which must
            deserialize to ``ExecuteRunArgs``.
        _context: Unused.

    Returns:
        An ``api_pb2.StartRunReply`` wrapping a serialized ``StartRunResult``.
        ``success`` is ``True`` once the subprocess sends
        ``StartRunInSubprocessSuccessful``. It is ``False`` when setup
        raises, when the subprocess sends an ``IPCErrorMessage``, or when
        the subprocess dies before reporting anything.
    """
    # All request parsing happens inside the try block, so a malformed or
    # wrongly-typed payload is reported through StartRunReply rather than
    # raised out of the handler.
    try:
        execute_run_args = check.inst(
            deserialize_json_to_dagster_namedtuple(request.serialized_execute_run_args),
            ExecuteRunArgs,
        )

        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message=None,
                    serializable_error_info=serializable_error_info_from_exc_info(
                        sys.exc_info()
                    ),
                )
            )
        )

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=start_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    # Start and register under the lock so the bookkeeping maps are updated
    # together with the process start.
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = execution_process
        self._termination_events[run_id] = termination_event

    success = None
    message = None
    serializable_error_info = None

    while success is None:
        time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        # We use `get_nowait()` instead of `get()` so that we can handle the case where the
        # execution process has died unexpectedly -- `get()` would hang forever in that case
        try:
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                success = False
                message = (
                    'GRPC server: Subprocess for {run_id} terminated unexpectedly with '
                    'exit code {exit_code}'.format(
                        run_id=run_id,
                        exit_code=execution_process.exitcode,
                    )
                )
                serializable_error_info = serializable_error_info_from_exc_info(
                    sys.exc_info()
                )
        else:
            if isinstance(
                dagster_event_or_ipc_error_message_or_done, StartRunInSubprocessSuccessful
            ):
                success = True
            elif isinstance(
                dagster_event_or_ipc_error_message_or_done, RunInSubprocessComplete
            ):
                continue
            if isinstance(dagster_event_or_ipc_error_message_or_done, IPCErrorMessage):
                success = False
                message = dagster_event_or_ipc_error_message_or_done.message
                serializable_error_info = (
                    dagster_event_or_ipc_error_message_or_done.serializable_error_info
                )
            # Any other message type leaves `success` unset, so polling continues.

    return api_pb2.StartRunReply(
        serialized_start_run_result=serialize_dagster_namedtuple(
            StartRunResult(
                success=success,
                message=message,
                serializable_error_info=serializable_error_info,
            )
        )
    )