def _user_event_sequence_for_step_compute_fn(step_context, evaluated_inputs):
    """Invoke the step's compute function inside a user-code error boundary.

    Yields whatever events the compute function yields; yields nothing when
    the compute function does not produce a generator.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    with user_code_error_boundary(
        DagsterExecutionStepExecutionError,
        control_flow_exceptions=[Failure, RetryRequested],
        msg_fn=lambda: """Error occurred during the execution of step:
        step key: "{key}"
        solid invocation: "{solid}"
        solid definition: "{solid_def}"
        """.format(
            key=step_context.step.key,
            solid_def=step_context.solid_def.name,
            solid=step_context.solid.name,
        ),
        step_key=step_context.step.key,
        solid_def_name=step_context.solid_def.name,
        solid_name=step_context.solid.name,
    ):
        compute_gen = check.opt_generator(
            step_context.step.compute_fn(step_context, evaluated_inputs)
        )

        if not compute_gen:
            return

        # Allow interrupts again during each step of the execution
        yield from iterate_with_context(raise_interrupts_immediately, compute_gen)
def _user_event_sequence_for_step_compute_fn(
    step_context: SystemStepExecutionContext, evaluated_inputs: Dict[str, Any]
) -> Iterator[SolidOutputUnion]:
    """Run the solid's core compute and stream its events.

    Each pull from the underlying compute generator happens inside a fresh
    user-code error boundary so user failures are wrapped as
    DagsterExecutionStepExecutionError.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    compute_events = execute_core_compute(
        step_context.for_compute(),
        evaluated_inputs,
        step_context.solid_def.compute_fn,
    )

    def _enter_error_boundary():
        # A new boundary context is created for every pull from the generator.
        return user_code_error_boundary(
            DagsterExecutionStepExecutionError,
            control_flow_exceptions=[Failure, RetryRequested],
            msg_fn=lambda: """Error occurred during the execution of step:
        step key: "{key}"
        solid invocation: "{solid}"
        solid definition: "{solid_def}"
        """.format(
                key=step_context.step.key,
                solid_def=step_context.solid_def.name,
                solid=step_context.solid.name,
            ),
            step_key=step_context.step.key,
            solid_def_name=step_context.solid_def.name,
            solid_name=step_context.solid.name,
        )

    yield from iterate_with_context(_enter_error_boundary, compute_events)
def _user_event_sequence_for_step_compute_fn(
    step_context: StepExecutionContext, evaluated_inputs: Dict[str, Any]
) -> Iterator[SolidOutputUnion]:
    """Run the solid's core compute and stream its events.

    Each pull from the compute generator is wrapped in a solid execution
    error boundary so user failures surface as
    DagsterExecutionStepExecutionError.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    compute_events = execute_core_compute(
        step_context,
        evaluated_inputs,
        step_context.solid_def.compute_fn,
    )

    def _enter_error_boundary():
        return solid_execution_error_boundary(
            DagsterExecutionStepExecutionError,
            msg_fn=lambda: f'Error occurred while executing solid "{step_context.solid.name}":',
            step_context=step_context,
            step_key=step_context.step.key,
            solid_def_name=step_context.solid_def.name,
            solid_name=step_context.solid.name,
        )

    yield from iterate_with_context(_enter_error_boundary, compute_events)
def _yield_compute_results(
    step_context: StepExecutionContext, inputs: Dict[str, Any], compute_fn: Callable
) -> Iterator[SolidOutputUnion]:
    """Call the user compute function and yield its validated events.

    Raises DagsterInvariantViolationError when the compute function returns an
    Output instead of yielding it. Async generators are adapted to synchronous
    ones before iteration.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)

    events = compute_fn(SolidExecutionContext(step_context), inputs)

    if isinstance(events, Output):
        raise DagsterInvariantViolationError(
            f"Compute function for solid {step_context.step.solid_handle} returned a Output "
            "rather than yielding it. The compute_fn of the core SolidDefinition must yield "
            "its results"
        )

    if events is None:
        return

    if inspect.isasyncgen(events):
        # Adapt an async generator so it can be consumed synchronously.
        events = gen_from_async_gen(events)

    def _enter_error_boundary():
        return solid_execution_error_boundary(
            DagsterExecutionStepExecutionError,
            msg_fn=lambda: f'Error occurred while executing solid "{step_context.solid.name}":',
            step_context=step_context,
            step_key=step_context.step.key,
            solid_def_name=step_context.solid_def.name,
            solid_name=step_context.solid.name,
        )

    for event in iterate_with_context(_enter_error_boundary, events):
        yield _validate_event(event, step_context.step.solid_handle)
def _yield_compute_results(
    step_context: StepExecutionContext, inputs: Dict[str, Any], compute_fn: Callable
) -> Iterator[SolidOutputUnion]:
    """Call the user compute function, interleaving events logged on the context.

    Events the user logged via the execution context are flushed before each
    compute event is yielded, and once more after the user generator finishes.
    """
    check.inst_param(step_context, "step_context", StepExecutionContext)

    compute_context = SolidExecutionContext(step_context)
    events = compute_fn(compute_context, inputs)

    if isinstance(events, Output):
        raise DagsterInvariantViolationError(
            "Compute function for {described_node} returned an Output rather than "
            "yielding it. The compute_fn of the {node_type} must yield "
            "its results".format(
                described_node=step_context.describe_op(),
                node_type=step_context.solid_def.node_type_str,
            )
        )

    if events is None:
        return

    if inspect.isasyncgen(events):
        # Adapt an async generator so it can be consumed synchronously.
        events = gen_from_async_gen(events)

    op_label = step_context.describe_op()

    def _enter_error_boundary():
        return solid_execution_error_boundary(
            DagsterExecutionStepExecutionError,
            msg_fn=lambda: f"Error occurred while executing {op_label}:",
            step_context=step_context,
            step_key=step_context.step.key,
            op_def_name=step_context.solid_def.name,
            op_name=step_context.solid.name,
        )

    for event in iterate_with_context(_enter_error_boundary, events):
        # Flush anything the user logged on the context before the event itself.
        if compute_context.has_events():
            yield from compute_context.consume_events()
        yield _validate_event(event, step_context)

    # Flush events logged after the final yield of the user generator.
    if compute_context.has_events():
        yield from compute_context.consume_events()
def execute(self, pipeline_context, execution_plan):
    """Execute the plan on a Dask cluster, yielding DagsterEvents as steps complete.

    Builds (or connects to) the cluster named by ``self.cluster_type``, submits
    one Dask future per execution step (with futures for its upstream steps as
    dependencies so Dask enforces ordering), then streams back each step's
    events as its future resolves.

    Raises:
        ValueError: if ``self.cluster_type`` is not one of the supported kinds.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(pipeline_context.executor),
    )
    # Workers run in separate processes, so run/event storage must be shared,
    # which requires a persistent (non-ephemeral) instance.
    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    # Cluster backends are imported lazily, per branch, so that only the
    # extra package for the configured cluster type needs to be installed.
    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', "
            f"'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        # step key -> future, used to look up dependency futures below.
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                # Each submitted step executes in-process on its worker.
                run_config = dict(pipeline_context.run_config, execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        futures = dask.distributed.as_completed(execution_futures, with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(raise_interrupts_immediately, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)
                yield step_event
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:
    """Hand the output value to its IO manager and emit the resulting events.

    Streams any DagsterEvents the IO manager (or the output context) produced,
    collects materializations and metadata it yielded, emits asset
    materialization events, and finally emits the handled-output event.

    Raises:
        DagsterInvariantViolationError: when the IO manager yields an
            unsupported value type, or when a materialization carries its own
            metadata while context.add_output_metadata was also used.
    """
    output_def = step_context.solid_def.output_def_named(step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    manager_materializations = []
    manager_metadata_entries: List[Union[PartitionMetadataEntry, MetadataEntry]] = []

    # output_manager.handle_output is either a generator function, or a normal function with or
    # without a return value. In the case that handle_output is a normal function, we need to
    # catch errors should they be raised before a return value. We can do this by wrapping
    # handle_output in a generator so that errors will be caught within iterate_with_context.

    if not inspect.isgeneratorfunction(output_manager.handle_output):

        def _gen_fn():
            gen_output = output_manager.handle_output(output_context, output.value)
            # Flush context-logged events before surfacing the return value.
            for event in output_context.consume_events():
                yield event
            if gen_output:
                yield gen_output

        handle_output_gen = _gen_fn()
    else:
        handle_output_gen = output_manager.handle_output(output_context, output.value)

    # Each pull from the IO manager generator runs inside the error boundary so
    # user failures are wrapped as DagsterExecutionHandleOutputError.
    for elt in iterate_with_context(
        lambda: solid_execution_error_boundary(
            DagsterExecutionHandleOutputError,
            msg_fn=lambda: (
                f'Error occurred while handling output "{output_context.name}" of '
                f'step "{step_context.step.key}":'
            ),
            step_context=step_context,
            step_key=step_context.step.key,
            output_name=output_context.name,
        ),
        handle_output_gen,
    ):
        # Flush context-logged events accumulated since the last pull.
        for event in output_context.consume_events():
            yield event

        manager_metadata_entries.extend(output_context.consume_logged_metadata_entries())
        if isinstance(elt, DagsterEvent):
            yield elt
        elif isinstance(elt, AssetMaterialization):
            manager_materializations.append(elt)
        elif isinstance(elt, (MetadataEntry, PartitionMetadataEntry)):
            experimental_functionality_warning(
                "Yielding metadata from an IOManager's handle_output() function"
            )
            manager_metadata_entries.append(elt)
        else:
            raise DagsterInvariantViolationError(
                f"IO manager on output {output_def.name} has returned "
                f"value {elt} of type {type(elt).__name__}. The return type can only be "
                "one of AssetMaterialization, MetadataEntry, PartitionMetadataEntry."
            )

    # Final flush: events/metadata logged after the generator's last yield.
    for event in output_context.consume_events():
        yield event

    manager_metadata_entries.extend(output_context.consume_logged_metadata_entries())
    # do not alter explicitly created AssetMaterializations
    for materialization in manager_materializations:
        if materialization.metadata_entries and manager_metadata_entries:
            raise DagsterInvariantViolationError(
                f"When handling output '{output_context.name}' of {output_context.solid_def.node_type_str} '{output_context.solid_def.name}', received a materialization with metadata, while context.add_output_metadata was used within the same call to handle_output. Due to potential conflicts, this is not allowed. Please specify metadata in one place within the `handle_output` function."
            )

        if manager_metadata_entries:
            # Rebuild the materialization so context-logged metadata is attached.
            materialization = AssetMaterialization(
                asset_key=materialization.asset_key,
                description=materialization.description,
                metadata_entries=manager_metadata_entries,
                partition=materialization.partition,
                tags=materialization.tags,
                metadata=None,
            )

        yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager
    )
    if asset_key:
        for materialization in _get_output_asset_materializations(
            asset_key,
            partitions,
            output,
            output_def,
            manager_metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        metadata_entries=[
            entry for entry in manager_metadata_entries if isinstance(entry, MetadataEntry)
        ],
    )
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:
    """Hand the output value to its IO manager and emit the resulting events.

    Collects materializations and metadata entries yielded by the IO manager's
    ``handle_output``, emits asset materialization events for them (and for any
    asset key associated with the output), and finally emits the
    handled-output event.

    Raises:
        DagsterInvariantViolationError: when ``handle_output`` yields a value
            that is not an AssetMaterialization, EventMetadataEntry, or
            PartitionMetadataEntry.
    """
    output_def = step_context.solid_def.output_def_named(step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)
    # NOTE(review): this call runs outside the error boundary below, so errors
    # raised by a non-generator handle_output are not wrapped as
    # DagsterExecutionHandleOutputError — confirm whether that is intended.
    handle_output_res = output_manager.handle_output(output_context, output.value)

    manager_materializations = []
    manager_metadata_entries = []
    if handle_output_res is not None:
        # Each pull from the (possibly wrapped) generator runs inside the error
        # boundary so user failures surface as DagsterExecutionHandleOutputError.
        for elt in iterate_with_context(
            lambda: solid_execution_error_boundary(
                DagsterExecutionHandleOutputError,
                msg_fn=lambda: (
                    f'Error occurred while handling output "{output_context.name}" of '
                    f'step "{step_context.step.key}":'
                ),
                step_context=step_context,
                step_key=step_context.step.key,
                output_name=output_context.name,
            ),
            ensure_gen(handle_output_res),
        ):
            if isinstance(elt, AssetMaterialization):
                manager_materializations.append(elt)
            elif isinstance(elt, (EventMetadataEntry, PartitionMetadataEntry)):
                experimental_functionality_warning(
                    "Yielding metadata from an IOManager's handle_output() function"
                )
                manager_metadata_entries.append(elt)
            else:
                raise DagsterInvariantViolationError(
                    f"IO manager on output {output_def.name} has returned "
                    f"value {elt} of type {type(elt).__name__}. The return type can only be "
                    "one of AssetMaterialization, EventMetadataEntry, PartitionMetadataEntry."
                )

    # do not alter explicitly created AssetMaterializations
    for materialization in manager_materializations:
        yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager
    )
    if asset_key:
        for materialization in _get_output_asset_materializations(
            asset_key,
            partitions,
            output,
            output_def,
            manager_metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        # BUGFIX: message previously said 'Handled input ...' even though this
        # function handles an output.
        message_override=f'Handled output "{step_output_handle.output_name}" using intermediate storage'
        if isinstance(output_manager, IntermediateStorageAdapter)
        else None,
        metadata_entries=[
            entry for entry in manager_metadata_entries if isinstance(entry, EventMetadataEntry)
        ],
    )
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:
    """Hand the output value to its IO manager and emit the resulting events.

    Collects materializations and metadata entries yielded by the IO manager's
    ``handle_output``, emits asset materialization events for them (and for any
    asset key associated with the output), then emits the handled-output event.
    """
    output_def = step_context.solid_def.output_def_named(step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    materializations = []
    metadata_entries = []

    # output_manager.handle_output is either a generator function, or a normal function with or
    # without a return value. In the case that handle_output is a normal function, we need to
    # catch errors should they be raised before a return value. We can do this by wrapping
    # handle_output in a generator so that errors will be caught within iterate_with_context.
    if inspect.isgeneratorfunction(output_manager.handle_output):
        handle_output_gen = output_manager.handle_output(output_context, output.value)
    else:

        def _wrapped_handle_output():
            returned = output_manager.handle_output(output_context, output.value)
            if returned:
                yield returned

        handle_output_gen = _wrapped_handle_output()

    def _enter_error_boundary():
        return solid_execution_error_boundary(
            DagsterExecutionHandleOutputError,
            msg_fn=lambda: (
                f'Error occurred while handling output "{output_context.name}" of '
                f'step "{step_context.step.key}":'
            ),
            step_context=step_context,
            step_key=step_context.step.key,
            output_name=output_context.name,
        )

    for user_event in iterate_with_context(_enter_error_boundary, handle_output_gen):
        if isinstance(user_event, AssetMaterialization):
            materializations.append(user_event)
        elif isinstance(user_event, (EventMetadataEntry, PartitionMetadataEntry)):
            experimental_functionality_warning(
                "Yielding metadata from an IOManager's handle_output() function"
            )
            metadata_entries.append(user_event)
        else:
            raise DagsterInvariantViolationError(
                f"IO manager on output {output_def.name} has returned "
                f"value {user_event} of type {type(user_event).__name__}. The return type can only be "
                "one of AssetMaterialization, EventMetadataEntry, PartitionMetadataEntry."
            )

    # do not alter explicitly created AssetMaterializations
    for materialization in materializations:
        yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager
    )
    if asset_key:
        for materialization in _get_output_asset_materializations(
            asset_key,
            partitions,
            output,
            output_def,
            metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        metadata_entries=[
            entry for entry in metadata_entries if isinstance(entry, EventMetadataEntry)
        ],
    )