def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Drive a Celery-backed execution of ``execution_plan``, yielding events.

    Steps released by the plan's active execution are submitted through
    ``step_execution_fn``. The returned result handles are polled once per
    ``TICK_SECONDS``; the serialized events they produce are deserialized,
    yielded, and fed back into the active execution.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context whose
            ``executor_config`` must be a ``CeleryConfig`` or
            ``CeleryK8sJobConfig`` and whose system storage is persistent.
        execution_plan (ExecutionPlan): The plan to execute.
        step_execution_fn (Callable): Invoked as
            ``step_execution_fn(app, pipeline_context, step, queue, priority)``.
            The object it returns is polled with ``ready()`` and read with
            ``get()``.

    Yields:
        Deserialized step events, skip events from the active execution, and
        an engine event for every task submission.

    Raises:
        DagsterSubprocessError: Once all outstanding results have drained, if
            reading any result raised.
        Exception: Any error raised during task submission is re-raised after
            an engine error event has been yielded.
    """
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    def priority_for_step(step):
        # Negated so that an ascending sort visits the highest priority first.
        return (
            -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
            + -1 * _get_run_priority(pipeline_context)
        )

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, result handle returned by step_execution_fn]
    step_errors = {}  # Dict[step_key, serializable error info]

    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )

    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if not stopping:
            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue
                        ),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )

                    # Get the Celery priority for this step
                    priority = _get_step_priority(pipeline_context, step)

                    # Submit the Celery tasks
                    step_results[step.key] = step_execution_fn(
                        app, pipeline_context, step, queue, priority
                    )

                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                    raise

        # Sleep on every pass, including while draining outstanding results
        # after an error; otherwise the loop spins on result.ready() without
        # pausing until the remaining tasks finish.
        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def execute(self, pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):
    """Run ``execution_plan`` by delegating each step to the step handler.

    On every pass the loop forwards interrupts to running steps, collects new
    events via ``self._pop_events``, periodically asks the step handler for
    step health, launches newly available steps, and then yields the
    gathered events while updating the active execution.

    Args:
        pipeline_context (PlanOrchestrationContext): Orchestration context;
            its ``plan_data`` supplies the instance and run id used to pop
            events.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        DagsterEvent: Engine events emitted by this executor, events produced
        by the step handler / popped for the run, and plan events from the
        active execution.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanOrchestrationContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    self._event_cursor = -1  # pylint: disable=attribute-defined-outside-init

    yield DagsterEvent.engine_event(
        pipeline_context,
        f"Starting execution with step handler {self._step_handler.name}",
        EngineEventData(),
    )

    with execution_plan.start(retry_mode=self.retries) as active_execution:
        stopping = False
        running_steps: Dict[str, ExecutionStep] = {}

        last_check_step_health_time = pendulum.now("UTC")
        while (not active_execution.is_complete and not stopping) or running_steps:
            events = []

            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Executor received termination signal, forwarding to steps",
                    EngineEventData.interrupted(list(running_steps.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for step in running_steps.values():
                    events.extend(
                        self._log_new_events(
                            self._step_handler.terminate_step(
                                self._get_step_handler_context(
                                    pipeline_context, [step], active_execution
                                )
                            ),
                            pipeline_context,
                            running_steps,
                        )
                    )
                running_steps.clear()

            events.extend(
                self._pop_events(
                    pipeline_context.plan_data.instance,
                    pipeline_context.plan_data.pipeline_run.run_id,
                )
            )

            if not stopping:
                curr_time = pendulum.now("UTC")
                if (
                    curr_time - last_check_step_health_time
                ).total_seconds() >= self._check_step_health_interval_seconds:
                    last_check_step_health_time = curr_time
                    for step in running_steps.values():
                        events.extend(
                            self._log_new_events(
                                self._step_handler.check_step_health(
                                    self._get_step_handler_context(
                                        pipeline_context, [step], active_execution
                                    )
                                ),
                                pipeline_context,
                                running_steps,
                            )
                        )

                for step in active_execution.get_steps_to_execute():
                    running_steps[step.key] = step
                    events.extend(
                        self._log_new_events(
                            self._step_handler.launch_step(
                                self._get_step_handler_context(
                                    pipeline_context, [step], active_execution
                                )
                            ),
                            pipeline_context,
                            running_steps,
                        )
                    )

            for dagster_event in events:
                yield dagster_event
                active_execution.handle_event(dagster_event)

                if (
                    dagster_event.is_step_success
                    or dagster_event.is_step_failure
                    or dagster_event.is_step_skipped
                ):
                    assert isinstance(dagster_event.step_key, str)
                    if stopping:
                        # running_steps was cleared when the interrupt was
                        # forwarded, so a terminal event gathered in the same
                        # pass refers to a key that is already gone.
                        running_steps.pop(dagster_event.step_key, None)
                    else:
                        del running_steps[dagster_event.step_key]
                    active_execution.verify_complete(pipeline_context, dagster_event.step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                yield event

            time.sleep(self._sleep_seconds)
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Execute the plan's steps sequentially in the current process.

    Steps are visited level by level in topological order. A step is skipped,
    and a skip event yielded, when one of its dependencies has already failed
    or been skipped, or when the intermediates manager reports uncovered
    inputs for it.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Execution context;
            its ``executor_config`` must be an ``ExecutorConfig``.
        execution_plan (ExecutionPlan): The plan to execute.
        step_keys_to_execute (Optional[List[str]]): When non-empty, only steps
            whose key is listed are visited.

    Yields:
        DagsterEvent: An opening engine event, the events of each executed
        step, skip events, and a closing engine event with the elapsed time.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    if step_keys_to_execute is None:
        step_key_set = None
    else:
        step_key_set = set(step_keys_to_execute)

    start_message = 'Executing steps in process (pid: {pid})'.format(pid=os.getpid())
    yield DagsterEvent.engine_event(
        pipeline_context,
        start_message,
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        # Keys of steps that failed or were skipped; anything depending on
        # them is skipped in turn.
        unavailable_keys = set()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for level in execution_plan.topological_step_levels():
            for step in level:
                selected = not step_key_set or step.key in step_key_set
                if not selected:
                    continue

                step_context = pipeline_context.for_step(step)

                blocked_by = [
                    dependency_key
                    for step_input in step.step_inputs
                    for dependency_key in unavailable_keys.intersection(
                        step_input.dependency_keys
                    )
                ]

                if blocked_by:
                    step_context.log.info(
                        'Dependencies for step {step} failed: {failed_inputs}. Not executing.'.format(
                            step=step.key, failed_inputs=blocked_by
                        )
                    )
                    unavailable_keys.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                else:
                    missing = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step
                    )
                    if missing:
                        # In partial pipeline execution, we may end up here without having
                        # validated the missing dependent outputs were optional
                        _assert_missing_inputs_optional(missing, execution_plan, step.key)

                        step_context.log.info(
                            (
                                'Not all inputs covered for {step}. Not executing. Output missing for '
                                'inputs: {uncovered_inputs}'
                            ).format(uncovered_inputs=missing, step=step.key)
                        )
                        unavailable_keys.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                    else:
                        step_events = check.generator(
                            dagster_event_sequence_for_step(step_context)
                        )
                        for step_event in step_events:
                            check.inst(step_event, DagsterEvent)
                            if step_event.is_step_failure:
                                unavailable_keys.add(step.key)
                            yield step_event

    finish_message = 'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
        pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
    )
    yield DagsterEvent.engine_event(
        pipeline_context,
        finish_message,
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
def execute(self, pipeline_context, execution_plan):
    """Execute the plan's steps in child processes, at most ``self.max_concurrent`` at a time.

    Each launched step is represented by an iterator obtained from
    ``self.execute_step_out_of_process``. The loop starts iterators while
    capacity is available, advances every active iterator by one item per
    pass, and retires iterators that are exhausted or whose child crashed.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Execution context.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        DagsterEvent: Engine events from this parent process, the events
        produced by the step iterators, a synthesized step failure event when
        a child process crashes, and skip events from the active execution.

    Raises:
        DagsterSubprocessError: After the loop finishes, if any entry in the
            shared ``errors`` mapping is truthy.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:

        active_execution = execution_plan.start(retries=self.retries)
        active_iters = {}  # step key -> iterator over that step's events
        errors = {}  # handed to execute_step_out_of_process; inspected after the loop
        term_events = {}  # step key -> multiprocessing.Event used to signal termination
        stopping = False

        while (not stopping and not active_execution.is_complete) or active_iters:
            try:
                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_steps_to_execute(
                        limit=(limit - len(active_iters))
                    )

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = multiprocessing.Event()
                        active_iters[step.key] = self.execute_step_out_of_process(
                            step_context, step, errors, term_events
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        if event_or_none is None:
                            continue
                        else:
                            yield event_or_none
                            active_execution.handle_event(event_or_none)

                    except ChildProcessCrashException as crash:
                        serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            (
                                'Multiprocess executor: child process for step {step_key} '
                                'unexpectedly exited with code {exit_code}'
                            ).format(step_key=key, exit_code=crash.exit_code),
                            EngineEventData.engine_error(serializable_error),
                            step_key=key,
                        )
                        step_failure_event = DagsterEvent.step_failure_event(
                            step_context=pipeline_context.for_step(
                                active_execution.get_step_by_key(key)
                            ),
                            step_failure_data=StepFailureData(
                                error=serializable_error, user_failure_data=None
                            ),
                        )
                        yield step_failure_event
                        # Feed the synthesized failure to the active execution
                        # exactly like child-reported events above, so the
                        # step's outcome is recorded from an explicit failure
                        # event rather than left for verify_complete to infer.
                        active_execution.handle_event(step_failure_event)
                        empty_iters.append(key)
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    if term_events[key].is_set():
                        stopping = True
                    del term_events[key]
                    active_execution.verify_complete(pipeline_context, key)

                # process skips from failures or uncovered inputs
                for event in active_execution.skipped_step_events_iterator(pipeline_context):
                    yield event

            # In the very small chance that we get interrupted in this coordination section and not
            # polling the subprocesses for events - try to clean up gracefully
            except KeyboardInterrupt:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                stopping = True
                for event in term_events.values():
                    event.set()

        errs = {pid: err for pid, err in errors.items() if err}
        if errs:
            raise DagsterSubprocessError(
                'During multiprocess execution errors occurred in child processes:\n{error_list}'.format(
                    error_list='\n'.join(
                        [
                            'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(pipeline_context, execution_plan):
    """Execute the plan in the current process, one step at a time.

    The active execution hands out the next step; its inputs are checked
    against the intermediates manager and the step is either skipped or run
    while its I/O is mirrored via ``mirror_step_io``.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Execution context;
            its ``executor_config`` must be an ``ExecutorConfig``.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        DagsterEvent: An opening engine event, events from copying required
        intermediates, per-step events, skip events, and a closing engine
        event carrying the elapsed time.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    step_keys_to_execute = execution_plan.step_keys_to_execute
    for_step_key = execution_plan.step_key_for_single_step_plans()

    start_message = 'Executing steps in process (pid: {pid})'.format(pid=os.getpid())
    start_data = EngineEventData.in_process(
        os.getpid(),
        step_keys_to_execute,
        marker_end=pipeline_context.executor_config.marker_to_close,
    )
    yield DagsterEvent.engine_event(
        pipeline_context,
        start_message,
        event_specific_data=start_data,
        step_key=for_step_key,
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        for copy_event in copy_required_intermediates_for_execution(
            pipeline_context, execution_plan
        ):
            yield copy_event

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        active_execution = execution_plan.start(
            retries=pipeline_context.executor_config.retries
        )

        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = pipeline_context.for_step(step)

            has_all_resources = all(
                hasattr(step_context.resources, resource_key)
                for resource_key in step_context.required_resource_keys
            )
            check.invariant(
                has_all_resources,
                'expected step context to have all required resources',
            )

            # capture all of the logs for this step
            with mirror_step_io(step_context):
                missing = pipeline_context.intermediates_manager.uncovered_inputs(
                    step_context, step
                )
                if not missing:
                    step_events = check.generator(dagster_event_sequence_for_step(step_context))
                    for step_event in step_events:
                        check.inst(step_event, DagsterEvent)
                        yield step_event
                        active_execution.handle_event(step_event)
                else:
                    # In partial pipeline execution, we may end up here without having
                    # validated the missing dependent outputs were optional
                    _assert_missing_inputs_optional(missing, execution_plan, step.key)

                    step_context.log.info(
                        (
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=missing, step=step.key)
                    )
                    yield DagsterEvent.step_skipped_event(step_context)
                    active_execution.mark_skipped(step.key)

                active_execution.verify_complete(pipeline_context, step.key)

            # process skips from failures or uncovered inputs
            for skip_event in active_execution.skipped_step_events_iterator(pipeline_context):
                yield skip_event

    finish_message = 'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
        pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
    )
    yield DagsterEvent.engine_event(
        pipeline_context,
        finish_message,
        event_specific_data=EngineEventData.in_process(os.getpid(), step_keys_to_execute),
        step_key=for_step_key,
    )
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    """Drive a Celery-backed execution of ``execution_plan``, yielding events.

    Steps released by the active execution are submitted through
    ``step_execution_fn``. The returned result handles are polled once per
    ``TICK_SECONDS``; their serialized events are deserialized, yielded, and
    fed back into the active execution. On an interrupt every outstanding
    result is revoked and no further steps are submitted.

    Args:
        pipeline_context (PlanOrchestrationContext): Orchestration context;
            its ``executor`` supplies the Celery app arguments and retry mode.
        execution_plan (ExecutionPlan): The plan to execute. If it has steps
            to execute, its artifacts must be persisted.
        step_execution_fn (Callable): Invoked as ``step_execution_fn(app,
            pipeline_context, step, queue, priority, known_state)``. The
            object it returns is used via ``ready()``, ``get()`` and
            ``revoke()``.

    Yields:
        DagsterEvent: Deserialized step events, plan events from the active
        execution, and engine events for submissions, revocations and
        interrupts.

    Raises:
        DagsterSubprocessError: Once all outstanding results have drained, if
            reading any result raised something other than
            ``TaskRevokedError``.
        Exception: Any error raised during task submission is re-raised after
            an engine error event has been yielded.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanOrchestrationContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.callable_param(step_execution_fn, "step_execution_fn")

    executor = pipeline_context.executor

    # If there are no step keys to execute, then any io managers will not be used.
    if len(execution_plan.step_keys_to_execute) > 0:
        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            execution_plan.artifacts_persisted,
            "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or "
            "similar system that allows files to be available to all nodes), S3, or GCS",
        )

    app = make_app(executor.app_args())

    def priority_for_step(step):
        # Negated so that an ascending sort visits the highest priority first.
        return (
            -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
            + -1 * _get_run_priority(pipeline_context)
        )

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, result handle returned by step_execution_fn]
    step_errors = {}  # Dict[step_key, serializable error info]

    with execution_plan.start(
        retry_mode=pipeline_context.executor.retries,
        sort_key_fn=priority_for_step,
    ) as active_execution:

        stopping = False

        while (not active_execution.is_complete and not stopping) or step_results:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Celery executor: received termination signal - revoking active tasks from workers",
                    EngineEventData.interrupted(list(step_results.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                for result in step_results.values():
                    result.revoke()

            results_to_pop = []
            for step_key, result in sorted(
                step_results.items(), key=lambda x: priority_for_key(x[0])
            ):
                if result.ready():
                    try:
                        step_events = result.get()
                    except TaskRevokedError:
                        step_events = []
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'celery task for running step "{step_key}" was revoked.'.format(
                                step_key=step_key,
                            ),
                            EngineEventData(marker_end=DELEGATE_MARKER),
                            step_handle=active_execution.get_step_by_key(step_key).handle,
                        )
                    except Exception:  # pylint: disable=broad-except
                        # We will want to do more to handle the exception here.. maybe subclass Task
                        # Certainly yield an engine or pipeline event
                        step_events = []
                        step_errors[step_key] = serializable_error_info_from_exc_info(
                            sys.exc_info()
                        )
                    for step_event in step_events:
                        event = deserialize_json_to_dagster_namedtuple(step_event)
                        yield event
                        active_execution.handle_event(event)

                    results_to_pop.append(step_key)

            for step_key in results_to_pop:
                if step_key in step_results:
                    del step_results[step_key]
                    active_execution.verify_complete(pipeline_context, step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                yield event

            # don't add any new steps if we are stopping
            if not (stopping or step_errors):
                # This is a slight refinement. If we have n workers idle and schedule m > n steps
                # for execution, the first n steps will be picked up by the idle workers in the
                # order in which they are scheduled (and the following m-n steps will be executed
                # in priority order, provided that it takes longer to execute a step than to
                # schedule it). The test case has m >> n to exhibit this behavior in the absence
                # of this sort step.
                for step in active_execution.get_steps_to_execute():
                    try:
                        queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                                step_key=step.key, queue=queue
                            ),
                            EngineEventData(marker_start=DELEGATE_MARKER),
                            step_handle=step.handle,
                        )

                        # Get the Celery priority for this step
                        priority = _get_step_priority(pipeline_context, step)

                        # Submit the Celery tasks
                        step_results[step.key] = step_execution_fn(
                            app,
                            pipeline_context,
                            step,
                            queue,
                            priority,
                            active_execution.get_known_state(),
                        )

                    except Exception:
                        yield DagsterEvent.engine_event(
                            pipeline_context,
                            "Encountered error during celery task submission.",
                            event_specific_data=EngineEventData.engine_error(
                                serializable_error_info_from_exc_info(sys.exc_info()),
                            ),
                        )
                        raise

            # Sleep on every pass, including while draining outstanding
            # results after an interrupt or worker error; otherwise the loop
            # spins on result.ready() without pausing until they finish.
            time.sleep(TICK_SECONDS)

        if step_errors:
            raise DagsterSubprocessError(
                "During celery execution errors occurred in workers:\n{error_list}".format(
                    error_list="\n".join(
                        [
                            "[{step}]: {err}".format(step=key, err=err.to_string())
                            for key, err in step_errors.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(step_errors.values()),
            )
def pipeline_execution_iterator(
    pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan
) -> Iterator[DagsterEvent]:
    """A complete execution of a pipeline. Yields pipeline start, success,
    and failure events.

    A pipeline start event is yielded first unless
    ``pipeline_context.resume_from_failure`` is set. Every event produced by
    ``pipeline_context.executor.execute`` is then passed through, and exactly
    one closing event is chosen in the ``finally`` block: canceled, failure,
    success, or an informational engine event. The closing event is not
    yielded if the generator was closed by its consumer.

    Args:
        pipeline_context (PlanOrchestrationContext): Supplies the executor,
            the instance and run id used to reload the run, and the
            ``raise_on_error`` / ``resume_from_failure`` flags.
        execution_plan (ExecutionPlan): The plan handed to the executor.

    Yields:
        DagsterEvent: The start event (when applicable), the executor's
        events, and the closing event.

    Raises:
        BaseException: Whatever the executor raised, re-raised only when
            ``pipeline_context.raise_on_error`` is true; otherwise the error
            is reported through the closing event.
    """

    # TODO: restart event?
    if not pipeline_context.resume_from_failure:
        yield DagsterEvent.pipeline_start(pipeline_context)

    pipeline_exception_info = None
    pipeline_canceled_info = None
    failed_steps = []
    generator_closed = False
    try:
        for event in pipeline_context.executor.execute(pipeline_context, execution_plan):
            if event.is_step_failure:
                failed_steps.append(event.step_key)

            yield event
    except GeneratorExit:
        # Shouldn't happen, but avoid runtime-exception in case this generator gets GC-ed
        # (see https://amir.rachum.com/blog/2017/03/03/generator-cleanup/).
        generator_closed = True
        pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise
    except (KeyboardInterrupt, DagsterExecutionInterruptedError):
        # Interrupts are tracked separately so the finally block can tell a
        # requested cancellation apart from an unexpected interruption.
        pipeline_canceled_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise
    except BaseException:
        pipeline_exception_info = serializable_error_info_from_exc_info(sys.exc_info())
        if pipeline_context.raise_on_error:
            raise  # finally block will run before this is re-raised
    finally:
        if pipeline_canceled_info:
            # Reload the run to see what status it currently has.
            reloaded_run = pipeline_context.instance.get_run_by_id(pipeline_context.run_id)
            if reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELING:
                event = DagsterEvent.pipeline_canceled(pipeline_context, pipeline_canceled_info)
            elif reloaded_run and reloaded_run.status == PipelineRunStatus.CANCELED:
                # This happens if the run was force-terminated but was still able to send
                # a cancellation request
                event = DagsterEvent.engine_event(
                    pipeline_context,
                    "Computational resources were cleaned up after the run was forcibly marked as canceled.",
                    EngineEventData(),
                )
            elif pipeline_context.instance.run_will_resume(pipeline_context.run_id):
                event = DagsterEvent.engine_event(
                    pipeline_context,
                    "Execution was interrupted unexpectedly. "
                    "No user initiated termination request was found, not treating as failure because run will be resumed.",
                    EngineEventData(),
                )
            else:
                event = DagsterEvent.pipeline_failure(
                    pipeline_context,
                    "Execution was interrupted unexpectedly. "
                    "No user initiated termination request was found, treating as failure.",
                    pipeline_canceled_info,
                )
        elif pipeline_exception_info:
            event = DagsterEvent.pipeline_failure(
                pipeline_context,
                "An exception was thrown during execution.",
                pipeline_exception_info,
            )
        elif failed_steps:
            event = DagsterEvent.pipeline_failure(
                pipeline_context,
                "Steps failed: {}.".format(failed_steps),
            )
        else:
            event = DagsterEvent.pipeline_success(pipeline_context)
        # Skip the closing yield once the consumer has closed the generator.
        if not generator_closed:
            yield event
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Execute the plan level by level, running each level's steps in parallel.

    For every topological level, the steps whose inputs are all covered are
    collected and handed to ``bounded_parallel_executor`` with the configured
    ``max_concurrent`` limit. Steps with uncovered inputs are logged as errors
    and left out.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Execution context;
            supplies the intermediates manager and
            ``executor_config.max_concurrent``.
        execution_plan (ExecutionPlan): The plan to execute.
        step_keys_to_execute (Optional[List[str]]): When non-empty, only steps
            whose key is listed are considered.

    Yields:
        DagsterEvent: An opening engine event, the events produced by
        ``bounded_parallel_executor`` for each level, and a closing engine
        event carrying the elapsed time.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    step_levels = execution_plan.topological_step_levels()

    intermediates_manager = pipeline_context.intermediates_manager

    limit = pipeline_context.executor_config.max_concurrent

    step_key_set = None if step_keys_to_execute is None else set(step_keys_to_execute)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=step_key_set
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:
        for step_level in step_levels:
            step_contexts_to_execute = []
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                if not intermediates_manager.all_inputs_covered(step_context, step):
                    uncovered_inputs = intermediates_manager.uncovered_inputs(step_context, step)
                    # The trailing space after "Not executing." keeps the two
                    # adjacent literals from running together in the log line.
                    step_context.log.error(
                        (
                            'Not all inputs covered for {step}. Not executing. '
                            'Output missing for inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    continue

                step_contexts_to_execute.append(step_context)

            for step_event in bounded_parallel_executor(step_contexts_to_execute, limit):
                yield step_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(pipeline_context, execution_plan):
    """Execute the plan by submitting one Celery task per step and polling for results.

    A task signature and its ``apply_async`` keyword arguments (priority,
    queue, routing key) are prepared up front for every step key in the plan.
    The loop then submits steps as the active execution makes them available,
    polls outstanding results once per ``TICK_SECONDS``, and yields the
    deserialized events of finished tasks.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Execution context;
            its ``executor_config`` must be a ``CeleryConfig``.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        DagsterEvent: Deserialized events from finished tasks, plus an engine
        error event when reading a task result or submitting a task fails.

    Raises:
        Exception: Any error raised during task submission is re-raised after
            an engine error event has been yielded.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    pipeline_name = pipeline_context.pipeline_def.name

    handle_dict = pipeline_context.execution_target_handle.to_dict()

    instance_ref_dict = pipeline_context.instance.get_ref().to_dict()

    # Copy of the environment dict with the execution section replaced by the
    # in-process executor for the submitted per-step runs.
    environment_dict = dict(pipeline_context.environment_dict, execution={'in_process': {}})

    mode = pipeline_context.mode_def.name

    run_id = pipeline_context.pipeline_run.run_id

    app = make_app(celery_config)

    task_signatures = {}  # Dict[step_key, celery.Signature]
    apply_kwargs = defaultdict(dict)  # Dict[step_key, Dict[str, Any]]

    def priority_for_step(step):
        # Negated so that an ascending sort visits the highest priority first.
        return -1 * step.metadata.get('dagster-celery/priority', task_default_priority)

    def priority_for_key(step_key):
        return -1 * apply_kwargs[step_key]['priority']

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        priority = step.metadata.get('dagster-celery/priority', task_default_priority)
        queue = step.metadata.get('dagster-celery/queue', task_default_queue)
        task = create_task(app)

        variables = {
            'executionParams': {
                'selector': {'name': pipeline_name},
                'environmentConfigData': environment_dict,
                'mode': mode,
                'executionMetadata': {'runId': run_id},
                'stepKeys': [step_key],
            }
        }
        task_signatures[step_key] = task.si(handle_dict, variables, instance_ref_dict)
        apply_kwargs[step_key] = {
            'priority': priority,
            'queue': queue,
            'routing_key': '{queue}.execute_query'.format(queue=queue),
        }

    step_results = {}  # Dict[step_key, celery.AsyncResult]

    active_execution = execution_plan.start(sort_key_fn=priority_for_step)

    while not active_execution.is_complete or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # The step is still treated as finished, as before, but
                    # the worker error is now surfaced as an engine event
                    # instead of being dropped without a trace.
                    step_events = []
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error while retrieving celery task result for step '
                        '"{step_key}".'.format(step_key=step_key),
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                for step_event in step_events:
                    yield deserialize_json_to_dagster_namedtuple(step_event)
                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.mark_complete(step_key)

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_available_steps():
            try:
                step_results[step.key] = task_signatures[step.key].apply_async(
                    **apply_kwargs[step.key]
                )
            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)
def inner_plan_execution_iterator(
    pipeline_context: PlanExecutionContext, execution_plan: ExecutionPlan
) -> Iterator[DagsterEvent]:
    """Execute the steps of ``execution_plan`` one after another, yielding their events.

    For each step handed out by the active execution, compute-log capture is
    set up on a best-effort basis (a failure to set it up or tear it down is
    reported as an engine event instead of aborting), the step's events are
    yielded and recorded, and the events gathered for that step are finally
    passed to ``_trigger_hook``.

    Args:
        pipeline_context (PlanExecutionContext): Context used to build each
            step context and to reach the compute log manager.
        execution_plan (ExecutionPlan): The plan to execute.

    Yields:
        DagsterEvent: Log-capture events, step events, plan events from the
        active execution, and hook events.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    with execution_plan.start(retry_mode=pipeline_context.retry_mode) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            attempt_count = active_execution.retry_state.get_attempt_count(step.key)
            step_context = cast(
                StepExecutionContext,
                pipeline_context.for_step(step, attempt_count),
            )
            events_for_hooks = []

            absent_resources = [
                key
                for key in step_context.required_resource_keys
                if not hasattr(step_context.resources, key)
            ]
            resource_message = (
                "Expected step context for solid {solid_name} to have all required resources, but "
                "missing {missing_resources}."
            ).format(solid_name=step_context.solid.name, missing_resources=absent_resources)
            check.invariant(not absent_resources, resource_message)

            # capture all of the logs for this step
            with ExitStack() as stack:
                try:
                    stack.enter_context(
                        pipeline_context.instance.compute_log_manager.watch(
                            step_context.pipeline_run, step_context.step.key
                        )
                    )
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context=pipeline_context,
                        message="Exception while setting up compute log capture",
                        event_specific_data=EngineEventData(
                            error=serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        step_handle=step_context.step.handle,
                    )
                else:
                    # Only announce log capture when it was set up successfully.
                    yield DagsterEvent.capture_logs(
                        step_context, log_key=step_context.step.key, steps=[step_context.step]
                    )

                step_events = check.generator(dagster_event_sequence_for_step(step_context))
                for step_event in step_events:
                    check.inst(step_event, DagsterEvent)
                    events_for_hooks.append(step_event)
                    yield step_event
                    active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

                # Close explicitly so a teardown failure can be reported as an event.
                try:
                    stack.close()
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context=pipeline_context,
                        message="Exception while cleaning up compute log capture",
                        event_specific_data=EngineEventData(
                            error=serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        step_handle=step_context.step.handle,
                    )

            # process skips from failures or uncovered inputs
            for plan_event in active_execution.plan_events_iterator(pipeline_context):
                events_for_hooks.append(plan_event)
                yield plan_event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, events_for_hooks):
                yield hook_event
def execute(self, pipeline_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):
    """Run ``execution_plan`` by delegating each step to ``self._step_handler``.

    The generator polls in a loop: it reacts to interrupts, collects events from
    the step handler (``pop_events`` plus one ``check_step_health`` call per
    running step), yields them while updating ``active_execution``, launches any
    newly available steps, then sleeps for ``self._sleep_seconds``.

    Args:
        pipeline_context: Orchestration context; per-step contexts are derived
            from it with ``for_step``.
        execution_plan: Plan whose steps are launched through the step handler.

    Yields:
        DagsterEvent: engine events emitted here, events returned by the step
        handler, and plan events from ``active_execution``.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanOrchestrationContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    yield DagsterEvent.engine_event(
        pipeline_context,
        f"Starting execution with step handler {self._step_handler.name}",
        EngineEventData(),
    )

    self._step_handler.initialize_for_execution(pipeline_context)

    with execution_plan.start(retry_mode=self.retries) as active_execution:
        # Once set, no new loop iterations start unless steps are still running.
        stopping = False
        # Steps that were launched and have not yet reported success/failure/skip,
        # keyed by step key.
        running_steps: Dict[str, ExecutionStep] = {}

        # Keep looping while there is work left (and we are not stopping), or
        # while launched steps are still being drained.
        while (not active_execution.is_complete and not stopping) or running_steps:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Core executor: received termination signal - " "forwarding to steps",
                    EngineEventData(),
                )
                stopping = True
                active_execution.mark_interrupted()
                # One terminate call per running step key.
                for step_key in running_steps:
                    self._step_handler.terminate_steps([step_key])

            events = self._step_handler.pop_events()

            # Health checks may contribute additional events for running steps.
            for step in running_steps.values():
                events.extend(
                    self._step_handler.check_step_health(
                        [pipeline_context.for_step(step)],
                        active_execution.get_known_state(),
                    ))

            for dagster_event in events:
                yield dagster_event
                active_execution.handle_event(dagster_event)

                # A terminal event means the step is no longer running.
                if (dagster_event.is_step_success or dagster_event.is_step_failure
                        or dagster_event.is_step_skipped):
                    assert isinstance(dagster_event.step_key, str)
                    del running_steps[dagster_event.step_key]
                    active_execution.verify_complete(
                        pipeline_context, dagster_event.step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    pipeline_context):
                yield event

            # Launch everything that became runnable, one step per launch call.
            for step in active_execution.get_steps_to_execute():
                running_steps[step.key] = step
                self._step_handler.launch_steps(
                    [pipeline_context.for_step(step)],
                    active_execution.get_known_state(),
                )

            time.sleep(self._sleep_seconds)

        # NOTE(review): the nesting of this final sweep could not be recovered from
        # the collapsed source; it is placed after the polling loop and inside the
        # ``with`` block, which is the reading that lets the loop make progress.
        # Confirm against the upstream file.
        for step_handle in execution_plan.step_dict:
            active_execution.verify_complete(pipeline_context,
                                             step_handle.to_key())
def execute(self, pipeline_context, execution_plan):
    """Run the plan with at most ``self.max_concurrent`` steps active at once.

    Each launched step is represented by the iterator returned from
    ``self.execute_step_in_thread``. The main loop advances every active iterator
    by one item per pass, yields the events it receives, and retires iterators
    that are exhausted or whose thread crashed.

    Args:
        pipeline_context: Context used to derive per-step contexts.
        execution_plan: Plan whose steps are executed.

    Yields:
        DagsterEvent: an opening engine event, the events of every step, plan
        events from ``active_execution`` and a closing engine event.

    Raises:
        DagsterThreadError: if any entry collected in ``errors`` is truthy once
            the loop has finished.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    max_active = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multithread executor (pid: {pid})".format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )

    with time_execution_scope() as timer_result, execution_plan.start(
        retries=self.retries
    ) as active_execution:
        active_iters = {}
        errors = {}

        while not active_execution.is_complete or active_iters:
            # start iterators: fill the free slots until none are left or no step is ready
            while len(active_iters) < max_active:
                ready_steps = active_execution.get_steps_to_execute(
                    limit=(max_active - len(active_iters))
                )
                if not ready_steps:
                    break

                for step in ready_steps:
                    step_context = pipeline_context.for_step(step)
                    active_iters[step.key] = self.execute_step_in_thread(
                        step.key, step_context, errors
                    )

            # process active iterators: pull a single item from each one
            finished_keys = []
            for key, step_iter in active_iters.items():
                try:
                    event_or_none = next(step_iter)
                    # ``None`` means nothing to report on this pass.
                    if event_or_none is not None:
                        yield event_or_none
                        active_execution.handle_event(event_or_none)

                except ThreadCrashException:
                    serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        f"Multithread executor: thread for step {key} exited unexpectedly",
                        EngineEventData.engine_error(serializable_error),
                    )
                    # Report the crash as a failure of the step itself as well.
                    step_failure_event = DagsterEvent.step_failure_event(
                        step_context=pipeline_context.for_step(
                            active_execution.get_step_by_key(key)
                        ),
                        step_failure_data=StepFailureData(
                            error=serializable_error, user_failure_data=None
                        ),
                    )
                    active_execution.handle_event(step_failure_event)
                    yield step_failure_event
                    finished_keys.append(key)

                except StopIteration:
                    finished_keys.append(key)

            # clear and mark complete finished iterators (done after the loop above
            # so the dict is not mutated while it is being iterated)
            for key in finished_keys:
                del active_iters[key]
                active_execution.verify_complete(pipeline_context, key)

            # process skipped and abandoned steps
            for plan_event in active_execution.plan_events_iterator(pipeline_context):
                yield plan_event

        errs = {tid: err for tid, err in errors.items() if err}
        if errs:
            error_list = "\n".join(
                "In thread {tid}: {err}".format(tid=tid, err=err.to_string())
                for tid, err in errs.items()
            )
            raise DagsterThreadError(
                "During multithread execution errors occurred in threads:\n{error_list}".format(
                    error_list=error_list
                ),
                thread_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multithread executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(self, plan_context: PlanOrchestrationContext, execution_plan: ExecutionPlan):
    """Run ``execution_plan`` by launching steps through ``self._step_handler``.

    When ``plan_context.resume_from_failure`` is set, the previously recorded
    events are re-yielded and used to rebuild ``active_execution`` before the
    polling loop starts. The loop then handles interrupts, consumes new events
    from ``self._pop_events``, periodically health-checks running steps and
    launches newly available steps, sleeping ``self._sleep_seconds`` per pass.

    Args:
        plan_context: Orchestration context for the run.
        execution_plan: Plan whose steps are launched through the step handler.

    Yields:
        DagsterEvent: engine events emitted here, events read via
        ``self._pop_events`` (except step-skipped ones) and plan events from
        ``active_execution``.
    """
    check.inst_param(plan_context, "plan_context", PlanOrchestrationContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # Reset the cursor for this execution.
    # NOTE(review): presumably consumed by ``_pop_events`` -- confirm.
    self._event_cursor = -1  # pylint: disable=attribute-defined-outside-init

    yield DagsterEvent.engine_event(
        plan_context,
        f"Starting execution with step handler {self._step_handler.name}",
        EngineEventData(),
    )

    with execution_plan.start(retry_mode=self.retries) as active_execution:
        # Launched steps that have not yet reported success or failure, by step key.
        running_steps: Dict[str, ExecutionStep] = {}

        if plan_context.resume_from_failure:
            yield DagsterEvent.engine_event(
                plan_context,
                "Resuming execution from failure",
                EngineEventData(),
            )

            # Replay what was already recorded for this run ...
            prior_events = self._pop_events(
                plan_context.instance,
                plan_context.run_id,
            )
            for dagster_event in prior_events:
                yield dagster_event

            # ... and let the active execution reconstruct its state from it.
            possibly_in_flight_steps = active_execution.rebuild_from_events(
                prior_events)
            for step in possibly_in_flight_steps:

                yield DagsterEvent.engine_event(
                    plan_context,
                    "Checking on status of possibly launched steps",
                    EngineEventData(),
                    step.handle,
                )

                # TODO: check if failure event included. For now, hacky assumption that
                # we don't log anything on successful check
                if self._step_handler.check_step_health(
                        self._get_step_handler_context(
                            plan_context, [step], active_execution)):
                    # health check failed, launch the step
                    self._log_new_events(
                        self._step_handler.launch_step(
                            self._get_step_handler_context(
                                plan_context, [step], active_execution)),
                        plan_context,
                        {
                            step.key: step
                            for step in possibly_in_flight_steps
                        },
                    )

                # NOTE(review): nesting recovered from a collapsed source line; this
                # is read as applying to every possibly-in-flight step, not only
                # the relaunched ones -- confirm against the upstream file.
                running_steps[step.key] = step

        last_check_step_health_time = pendulum.now("UTC")

        # Order of events is important here. During an iteration, we call handle_event, then get_steps_to_execute,
        # then is_complete. get_steps_to_execute updates the state of ActiveExecution, and without it
        # is_complete can return true when we're just between steps.
        while not active_execution.is_complete:

            if active_execution.check_for_interrupts():
                if not plan_context.instance.run_will_resume(
                        plan_context.run_id):
                    yield DagsterEvent.engine_event(
                        plan_context,
                        "Executor received termination signal, forwarding to steps",
                        EngineEventData.interrupted(
                            list(running_steps.keys())),
                    )
                    active_execution.mark_interrupted()
                    # Ask the step handler to terminate each running step.
                    for _, step in running_steps.items():
                        self._log_new_events(
                            self._step_handler.terminate_step(
                                self._get_step_handler_context(
                                    plan_context, [step], active_execution)),
                            plan_context,
                            running_steps,
                        )
                else:
                    # The run will be resumed: leave the steps alone.
                    yield DagsterEvent.engine_event(
                        plan_context,
                        "Executor received termination signal, not forwarding to steps because "
                        "run will be resumed",
                        EngineEventData(metadata_entries=[
                            MetadataEntry("steps_in_flight",
                                          value=str(running_steps.keys()))
                        ]),
                    )
                    active_execution.mark_interrupted()

                # NOTE(review): read as belonging to the interrupt branch as a whole
                # (both cases stop the generator) -- confirm against upstream.
                return

            for dagster_event in self._pop_events(
                    plan_context.instance,
                    plan_context.run_id,
            ):  # type: ignore

                # STEP_SKIPPED events are only emitted by ActiveExecution, which already handles
                # and yields them.
                if dagster_event.is_step_skipped:
                    assert isinstance(dagster_event.step_key, str)
                    active_execution.verify_complete(
                        plan_context, dagster_event.step_key)
                else:
                    yield dagster_event
                    active_execution.handle_event(dagster_event)

                    # Success or failure ends the step: stop tracking it.
                    if dagster_event.is_step_success or dagster_event.is_step_failure:
                        assert isinstance(dagster_event.step_key, str)
                        del running_steps[dagster_event.step_key]
                        active_execution.verify_complete(
                            plan_context, dagster_event.step_key)

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(
                    plan_context):
                yield event

            # Health-check running steps at most once per configured interval.
            curr_time = pendulum.now("UTC")
            if (curr_time - last_check_step_health_time).total_seconds(
            ) >= self._check_step_health_interval_seconds:
                last_check_step_health_time = curr_time
                for _, step in running_steps.items():
                    self._log_new_events(
                        self._step_handler.check_step_health(
                            self._get_step_handler_context(
                                plan_context, [step], active_execution)),
                        plan_context,
                        running_steps,
                    )

            # Launch every step that is now ready, tracking it as running first.
            for step in active_execution.get_steps_to_execute():
                running_steps[step.key] = step
                self._log_new_events(
                    self._step_handler.launch_step(
                        self._get_step_handler_context(
                            plan_context, [step], active_execution)),
                    plan_context,
                    running_steps,
                )

            time.sleep(self._sleep_seconds)
def execute(self, pipeline_context, execution_plan):
    """Run the plan with up to ``self.max_concurrent`` steps in child processes.

    Every launched step gets a ``multiprocessing.Event`` (stored in
    ``term_events``) and an iterator from ``self.execute_step_out_of_process``.
    On interrupt, all termination events are set, no new steps are started and
    the remaining iterators are drained.

    Args:
        pipeline_context: Context used to derive per-step contexts.
        execution_plan: Plan whose steps are executed.

    Yields:
        DagsterEvent: an opening engine event, step events, plan events and a
        closing engine event.

    Raises:
        DagsterExecutionInterruptedError: after an interrupt, once no iterator is
            active and every collected error is an interruption error.
        DagsterSubprocessError: if any other error was collected from a child.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    limit = self.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Executing steps using multiprocess executor: parent process (pid: {pid})".format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result, execution_plan.start(
        retry_mode=self.retries
    ) as active_execution:
        active_iters = {}
        errors = {}
        term_events = {}
        stopping = False

        while (not stopping and not active_execution.is_complete) or active_iters:
            if active_execution.check_for_interrupts():
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    "Multiprocess executor: received termination signal - "
                    "forwarding to active child processes",
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                stopping = True
                active_execution.mark_interrupted()
                # Signal every child that still has a termination event registered.
                for term_event in term_events.values():
                    term_event.set()

            # start iterators
            while len(active_iters) < limit and not stopping:
                ready_steps = active_execution.get_steps_to_execute(
                    limit=(limit - len(active_iters))
                )
                if not ready_steps:
                    break

                for step in ready_steps:
                    step_context = pipeline_context.for_step(step)
                    term_events[step.key] = multiprocessing.Event()
                    active_iters[step.key] = self.execute_step_out_of_process(
                        step_context,
                        step,
                        errors,
                        term_events,
                        active_execution.get_known_state(),
                    )

            # process active iterators: one item from each per pass
            finished_keys = []
            for key, step_iter in active_iters.items():
                try:
                    event_or_none = next(step_iter)
                    # ``None`` means the child had nothing to report this time.
                    if event_or_none is not None:
                        yield event_or_none
                        active_execution.handle_event(event_or_none)

                except ChildProcessCrashException as crash:
                    serializable_error = serializable_error_info_from_exc_info(sys.exc_info())
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        (
                            "Multiprocess executor: child process for step {step_key} "
                            "unexpectedly exited with code {exit_code}"
                        ).format(step_key=key, exit_code=crash.exit_code),
                        EngineEventData.engine_error(serializable_error),
                        step_handle=active_execution.get_step_by_key(key).handle,
                    )
                    # The crash is also recorded as a failure of the step.
                    step_failure_event = DagsterEvent.step_failure_event(
                        step_context=pipeline_context.for_step(
                            active_execution.get_step_by_key(key)
                        ),
                        step_failure_data=StepFailureData(
                            error=serializable_error, user_failure_data=None
                        ),
                    )
                    active_execution.handle_event(step_failure_event)
                    yield step_failure_event
                    finished_keys.append(key)

                except StopIteration:
                    finished_keys.append(key)

            # clear and mark complete finished iterators
            for key in finished_keys:
                del active_iters[key]
                del term_events[key]
                active_execution.verify_complete(pipeline_context, key)

            # process skipped and abandoned steps
            yield from active_execution.plan_events_iterator(pipeline_context)

        errs = {pid: err for pid, err in errors.items() if err}

        # After termination starts, raise an interrupted exception once all subprocesses
        # have finished cleaning up (and the only errors were from being interrupted)
        interrupted_cleanly = (
            stopping
            and not active_iters
            and all(
                err_info.cls_name == "DagsterExecutionInterruptedError"
                for err_info in errs.values()
            )
        )
        if interrupted_cleanly:
            yield DagsterEvent.engine_event(
                pipeline_context,
                "Multiprocess executor: interrupted all active child processes",
                event_specific_data=EngineEventData(),
            )
            raise DagsterExecutionInterruptedError()

        if errs:
            error_list = "\n".join(
                "In process {pid}: {err}".format(pid=pid, err=err.to_string())
                for pid, err in errs.items()
            )
            raise DagsterSubprocessError(
                "During multiprocess execution errors occurred in child processes:\n{error_list}".format(
                    error_list=error_list
                ),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        "Multiprocess executor: parent process exiting after {duration} (pid: {pid})".format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )
def execute(pipeline_context, execution_plan):
    """Execute the plan by submitting each step as a Celery task and polling for results.

    Steps are submitted in priority order (step priority tag plus run priority,
    negated so that higher priorities sort first). On every tick the in-flight
    results are polled; events returned by finished tasks are deserialized,
    yielded and fed back into ``active_execution``. If fetching a result raises,
    the error is recorded, no further steps are submitted, and the remaining
    in-flight tasks are drained before the errors are raised.

    Args:
        pipeline_context: Context whose ``executor_config`` must be a ``CeleryConfig``.
        execution_plan: Plan whose steps are submitted.

    Yields:
        DagsterEvent: engine events for each submission, the events returned by
        the workers, and skipped-step events from ``active_execution``.

    Raises:
        DagsterSubprocessError: if fetching the result of any task raised.
        Exception: whatever ``_submit_task`` raises is re-raised after an engine
            event describing the failure has been yielded.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    check.param_invariant(
        isinstance(pipeline_context.executor_config, CeleryConfig),
        'pipeline_context',
        'Expected executor_config to be CeleryConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # NOTE(review): ``storage`` is None when the environment dict has no 'storage'
    # entry, in which case the ``storage.get`` calls below raise AttributeError
    # rather than a readable invariant failure -- confirm that upstream config
    # validation always provides it.
    storage = pipeline_context.environment_dict.get('storage')

    if (celery_config.broker and not is_local_uri(celery_config.broker)) or (
        celery_config.backend and not is_local_uri(celery_config.backend)
    ):
        check.invariant(
            storage.get('s3') or storage.get('gcs'),
            'Must use S3 or GCS storage with non-local Celery broker: {broker} '
            'and backend: {backend}'.format(
                broker=celery_config.broker, backend=celery_config.backend
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Celery, use filesystem, S3, or GCS',
        )

    app = make_app(celery_config)

    def priority_for_step(step):
        # Negated so that an ascending sort puts the highest priority first.
        return (
            -1 * int(step.tags.get('dagster-celery/priority', task_default_priority))
            + -1 * _get_run_priority(pipeline_context)
        )

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[step_key, celery.AsyncResult]
    step_errors = {}
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(
            step_results.items(), key=lambda x: priority_for_key(x[0])
        ):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here.. maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True

                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if not stopping:
            # This is a slight refinement. If we have n workers idle and schedule m > n steps for
            # execution, the first n steps will be picked up by the idle workers in the order in
            # which they are scheduled (and the following m-n steps will be executed in priority
            # order, provided that it takes longer to execute a step than to schedule it). The test
            # case has m >> n to exhibit this behavior in the absence of this sort step.
            for step in active_execution.get_steps_to_execute():
                try:
                    queue = step.tags.get('dagster-celery/queue', task_default_queue)
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                            step_key=step.key, queue=queue
                        ),
                        EngineEventData(marker_start=DELEGATE_MARKER),
                        step_key=step.key,
                    )
                    step_results[step.key] = _submit_task(app, pipeline_context, step, queue)
                except Exception:
                    yield DagsterEvent.engine_event(
                        pipeline_context,
                        'Encountered error during celery task submission.',
                        event_specific_data=EngineEventData.engine_error(
                            serializable_error_info_from_exc_info(sys.exc_info()),
                        ),
                    )
                    raise

        # Sleep on every pass, including while draining in-flight tasks after a
        # failure; skipping the sleep there would poll the result backend in a
        # tight loop.
        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    '[{step}]: {err}'.format(step=key, err=err.to_string())
                    for key, err in step_errors.items()
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )
def execute(pipeline_context, execution_plan):
    """Execute the plan's steps sequentially in the current process.

    One step is taken from ``active_execution`` at a time. A step is skipped,
    with a ``step_skipped`` event, when one of its dependencies failed or was
    skipped, or when some of its inputs are not covered by the intermediates
    manager; otherwise its events are yielded as they are produced.

    Args:
        pipeline_context: Context whose ``executor_config`` must be an ``ExecutorConfig``.
        execution_plan: Plan whose steps are executed.

    Yields:
        DagsterEvent: an opening engine event, intermediate-copy events, step or
        skip events for every step, and a closing engine event.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        for copy_event in copy_required_intermediates_for_execution(
            pipeline_context, execution_plan
        ):
            yield copy_event

        # Keys of steps that failed or were skipped; their dependents get skipped.
        failed_or_skipped_steps = set()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        active_execution = execution_plan.start()

        while not active_execution.is_complete:
            available = active_execution.get_available_steps(limit=1)
            check.invariant(
                len(available) == 1,
                'Invariant Violation: expected step to be available to execute',
            )
            step = available[0]
            step_context = pipeline_context.for_step(step)

            with mirror_step_io(step_context):
                # capture all of the logs for this step
                failed_inputs = [
                    dependency_key
                    for step_input in step.step_inputs
                    for dependency_key in failed_or_skipped_steps.intersection(
                        step_input.dependency_keys
                    )
                ]

                if failed_inputs:
                    step_context.log.info(
                        (
                            'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                        ).format(step=step.key, failed_inputs=failed_inputs)
                    )
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                else:
                    uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step
                    )
                    if uncovered_inputs:
                        # In partial pipeline execution, we may end up here without having validated the
                        # missing dependent outputs were optional
                        _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                        step_context.log.info(
                            (
                                'Not all inputs covered for {step}. Not executing. Output missing for '
                                'inputs: {uncovered_inputs}'
                            ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                        )
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                    else:
                        for step_event in check.generator(
                            dagster_event_sequence_for_step(step_context)
                        ):
                            check.inst(step_event, DagsterEvent)
                            if step_event.is_step_failure:
                                failed_or_skipped_steps.add(step.key)
                            yield step_event

                # Reached on all three paths above, after their events were yielded.
                active_execution.mark_complete(step.key)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(
            os.getpid(), execution_plan.step_keys_to_execute
        ),
    )
def execute(pipeline_context, execution_plan):
    """Execute the plan with up to ``executor_config.max_concurrent`` child processes.

    Each launched step gets a termination ``Event`` (stored in ``term_events``)
    and an iterator from ``execute_step_out_of_process``. The loop advances every
    active iterator by one item per pass and retires exhausted ones. If a retired
    step's termination event is set, no further steps are started.

    Args:
        pipeline_context: Context providing the executor config and per-step contexts.
        execution_plan: Plan whose steps are executed.

    Yields:
        DagsterEvent: an opening engine event, intermediate-copy events, the events
        of every step, and a closing engine event.

    Raises:
        DagsterSubprocessError: if any entry collected in ``errors`` is truthy once
            the loop has finished.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    limit = pipeline_context.executor_config.max_concurrent

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps using multiprocess engine: parent process (pid: {pid})'.format(
            pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(
            os.getpid(), step_keys_to_execute=execution_plan.step_keys_to_execute
        ),
    )

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    with time_execution_scope() as timer_result:

        for event in copy_required_intermediates_for_execution(pipeline_context, execution_plan):
            yield event

        active_execution = execution_plan.start()
        active_iters = {}
        errors = {}
        term_events = {}
        stopping = False

        while (not stopping and not active_execution.is_complete) or active_iters:
            try:
                # start iterators
                while len(active_iters) < limit and not stopping:
                    steps = active_execution.get_available_steps(
                        limit=(limit - len(active_iters))
                    )

                    if not steps:
                        break

                    for step in steps:
                        step_context = pipeline_context.for_step(step)
                        term_events[step.key] = get_multiprocessing_context().Event()
                        active_iters[step.key] = execute_step_out_of_process(
                            step_context, step, errors, term_events
                        )

                # process active iterators
                empty_iters = []
                for key, step_iter in active_iters.items():
                    try:
                        event_or_none = next(step_iter)
                        # ``None`` means nothing to report on this pass.
                        if event_or_none is not None:
                            yield event_or_none
                    except StopIteration:
                        empty_iters.append(key)

                # clear and mark complete finished iterators
                for key in empty_iters:
                    del active_iters[key]
                    # A set termination event on a finished step stops new launches.
                    if term_events[key].is_set():
                        stopping = True
                    del term_events[key]
                    active_execution.mark_complete(key)

            # In the very small chance that we get interrupted in this coordination section and not
            # polling the subprocesses for events - try to clean up gracefully
            except KeyboardInterrupt:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Multiprocess engine: received KeyboardInterrupt - forwarding to active child processes',
                    EngineEventData.interrupted(list(term_events.keys())),
                )
                for event in term_events.values():
                    event.set()

        errs = {pid: err for pid, err in errors.items() if err}
        if errs:
            raise DagsterSubprocessError(
                'During multiprocess execution errors occured in child processes:\n{error_list}'.format(
                    error_list='\n'.join(
                        [
                            'In process {pid}: {err}'.format(pid=pid, err=err.to_string())
                            for pid, err in errs.items()
                        ]
                    )
                ),
                subprocess_error_infos=list(errs.values()),
            )

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Multiprocess engine: parent process exiting after {duration} (pid: {pid})'.format(
            duration=format_duration(timer_result.millis), pid=os.getpid()
        ),
        event_specific_data=EngineEventData.multiprocess(os.getpid()),
    )