Example #1
def _user_event_sequence_for_step_compute_fn(step_context, evaluated_inputs):
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    with user_code_error_boundary(
            DagsterExecutionStepExecutionError,
            control_flow_exceptions=[Failure, RetryRequested],
            msg_fn=lambda: """Error occurred during the execution of step:
        step key: "{key}"
        solid invocation: "{solid}"
        solid definition: "{solid_def}"
        """.format(
                key=step_context.step.key,
                solid_def=step_context.solid_def.name,
                solid=step_context.solid.name,
            ),
            step_key=step_context.step.key,
            solid_def_name=step_context.solid_def.name,
            solid_name=step_context.solid.name,
    ):
        gen = check.opt_generator(
            step_context.step.compute_fn(step_context, evaluated_inputs))
        if not gen:
            return

        # Allow interrupts again during each step of the execution
        for event in iterate_with_context(raise_interrupts_immediately, gen):
            yield event
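
For orientation, iterate_with_context drains the wrapped generator one step at a time, re-entering the supplied context around each advance. A minimal sketch of that behavior (an assumption about the helper, not the Dagster source itself):

def iterate_with_context_sketch(context_fn, iterator):
    # Re-enter the context (an interrupt window above, an error boundary in
    # the later examples) around every next() call on the underlying generator.
    while True:
        with context_fn():
            try:
                next_output = next(iterator)
            except StopIteration:
                return
        yield next_output
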
Example #2
def _user_event_sequence_for_step_compute_fn(
        step_context: SystemStepExecutionContext,
        evaluated_inputs: Dict[str, Any]) -> Iterator[SolidOutputUnion]:
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    gen = execute_core_compute(
        step_context.for_compute(),
        evaluated_inputs,
        step_context.solid_def.compute_fn,
    )

    for event in iterate_with_context(
            lambda: user_code_error_boundary(
                DagsterExecutionStepExecutionError,
                control_flow_exceptions=[Failure, RetryRequested],
                msg_fn=lambda: """Error occurred during the execution of step:
            step key: "{key}"
            solid invocation: "{solid}"
            solid definition: "{solid_def}"
            """.format(
                    key=step_context.step.key,
                    solid_def=step_context.solid_def.name,
                    solid=step_context.solid.name,
                ),
                step_key=step_context.step.key,
                solid_def_name=step_context.solid_def.name,
                solid_name=step_context.solid.name,
            ),
            gen,
    ):
        yield event
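
The lambda passed to iterate_with_context here builds a fresh error boundary for each iteration step. A hypothetical sketch of what such a boundary could look like (the constructor shape and the user_exception keyword are assumptions, not the actual user_code_error_boundary):

import sys
from contextlib import contextmanager

@contextmanager
def error_boundary_sketch(error_cls, msg_fn, **metadata):
    # Run the enclosed block and wrap any user exception in the framework
    # error type; a real boundary would additionally re-raise control-flow
    # exceptions such as Failure and RetryRequested unchanged.
    try:
        yield
    except Exception as exc:
        raise error_cls(msg_fn(), user_exception=exc, **metadata) from exc
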
Example #3
def _user_event_sequence_for_step_compute_fn(
        step_context: StepExecutionContext,
        evaluated_inputs: Dict[str, Any]) -> Iterator[SolidOutputUnion]:
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.dict_param(evaluated_inputs, "evaluated_inputs", key_type=str)

    gen = execute_core_compute(
        step_context,
        evaluated_inputs,
        step_context.solid_def.compute_fn,
    )

    for event in iterate_with_context(
            lambda: solid_execution_error_boundary(
                DagsterExecutionStepExecutionError,
                msg_fn=lambda:
                f'Error occurred while executing solid "{step_context.solid.name}":',
                step_context=step_context,
                step_key=step_context.step.key,
                solid_def_name=step_context.solid_def.name,
                solid_name=step_context.solid.name,
            ),
            gen,
    ):
        yield event
Example #4
def _yield_compute_results(
    step_context: StepExecutionContext, inputs: Dict[str, Any], compute_fn: Callable
) -> Iterator[SolidOutputUnion]:
    check.inst_param(step_context, "step_context", StepExecutionContext)

    user_event_generator = compute_fn(SolidExecutionContext(step_context), inputs)

    if isinstance(user_event_generator, Output):
        raise DagsterInvariantViolationError(
            (
                "Compute function for solid {solid_name} returned a Output rather than "
                "yielding it. The compute_fn of the core SolidDefinition must yield "
                "its results"
            ).format(solid_name=str(step_context.step.solid_handle))
        )

    if user_event_generator is None:
        return

    if inspect.isasyncgen(user_event_generator):
        user_event_generator = gen_from_async_gen(user_event_generator)

    for event in iterate_with_context(
        lambda: solid_execution_error_boundary(
            DagsterExecutionStepExecutionError,
            msg_fn=lambda: f'Error occurred while executing solid "{step_context.solid.name}":',
            step_context=step_context,
            step_key=step_context.step.key,
            solid_def_name=step_context.solid_def.name,
            solid_name=step_context.solid.name,
        ),
        user_event_generator,
    ):
        yield _validate_event(event, step_context.step.solid_handle)
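
When the user's compute function is an async generator, it is first adapted to a synchronous one. A plausible sketch of a gen_from_async_gen-style adapter (an assumption about the helper, not the library's code):

import asyncio

def gen_from_async_gen_sketch(async_gen):
    # Drive the async generator on a private event loop, yielding each item
    # synchronously so the rest of the step machinery can iterate it.
    loop = asyncio.new_event_loop()
    try:
        while True:
            try:
                yield loop.run_until_complete(async_gen.__anext__())
            except StopAsyncIteration:
                return
    finally:
        loop.close()
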
Example #5
def _yield_compute_results(step_context: StepExecutionContext,
                           inputs: Dict[str, Any],
                           compute_fn: Callable) -> Iterator[SolidOutputUnion]:
    check.inst_param(step_context, "step_context", StepExecutionContext)

    context = SolidExecutionContext(step_context)
    user_event_generator = compute_fn(context, inputs)

    if isinstance(user_event_generator, Output):
        raise DagsterInvariantViolationError((
            "Compute function for {described_node} returned an Output rather than "
            "yielding it. The compute_fn of the {node_type} must yield "
            "its results").format(
                described_node=step_context.describe_op(),
                node_type=step_context.solid_def.node_type_str,
            ))

    if user_event_generator is None:
        return

    if inspect.isasyncgen(user_event_generator):
        user_event_generator = gen_from_async_gen(user_event_generator)

    op_label = step_context.describe_op()

    for event in iterate_with_context(
            lambda: solid_execution_error_boundary(
                DagsterExecutionStepExecutionError,
                msg_fn=lambda: f"Error occurred while executing {op_label}:",
                step_context=step_context,
                step_key=step_context.step.key,
                op_def_name=step_context.solid_def.name,
                op_name=step_context.solid.name,
            ),
            user_event_generator,
    ):
        if context.has_events():
            yield from context.consume_events()
        yield _validate_event(event, step_context)

    if context.has_events():
        yield from context.consume_events()
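
This variant interleaves framework events logged on the context with the user's own yields. A hypothetical sketch of the buffering that has_events()/consume_events() imply (the real SolidExecutionContext internals may differ):

class BufferedEventContextSketch:
    def __init__(self):
        self._events = []

    def log_event(self, event):
        # User-facing calls append to the buffer...
        self._events.append(event)

    def has_events(self):
        return bool(self._events)

    def consume_events(self):
        # ...and the framework drains it before and after each compute event.
        events, self._events = self._events, []
        return events
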
Example #6
    def execute(self, pipeline_context, execution_plan):
        check.inst_param(pipeline_context, "pipeline_context",
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
        check.param_invariant(
            isinstance(pipeline_context.executor, DaskExecutor),
            "pipeline_context",
            "Expected executor to be DaskExecutor got {}".format(
                pipeline_context.executor),
        )

        check.invariant(
            pipeline_context.instance.is_persistent,
            "Dask execution requires a persistent DagsterInstance",
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        cluster_type = self.cluster_type
        if cluster_type == "local":
            from dask.distributed import LocalCluster

            cluster = LocalCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "yarn":
            from dask_yarn import YarnCluster

            cluster = YarnCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "ssh":
            from dask.distributed import SSHCluster

            cluster = SSHCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "pbs":
            from dask_jobqueue import PBSCluster

            cluster = PBSCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "moab":
            from dask_jobqueue import MoabCluster

            cluster = MoabCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "sge":
            from dask_jobqueue import SGECluster

            cluster = SGECluster(**self.build_dict(pipeline_name))
        elif cluster_type == "lsf":
            from dask_jobqueue import LSFCluster

            cluster = LSFCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster

            cluster = SLURMCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "oar":
            from dask_jobqueue import OARCluster

            cluster = OARCluster(**self.build_dict(pipeline_name))
        elif cluster_type == "kube":
            from dask_kubernetes import KubeCluster

            cluster = KubeCluster(**self.build_dict(pipeline_name))
        else:
            raise ValueError(
                f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
            )

        with dask.distributed.Client(cluster) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    run_config = dict(pipeline_context.run_config,
                                      execution={"in_process": {}})
                    recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                    )

                    dask_task_name = "%s.%s" % (pipeline_name, step.key)

                    recon_pipeline = recon_repo.get_reconstructable_pipeline(
                        pipeline_name)

                    future = client.submit(
                        query_on_dask_worker,
                        dependencies,
                        recon_pipeline,
                        pipeline_context.pipeline_run,
                        run_config,
                        [step.key],
                        pipeline_context.mode_def.name,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their
            # results on the master
            futures = dask.distributed.as_completed(execution_futures,
                                                    with_results=True)

            # Allow interrupts while waiting for the results from Dask
            for future, result in iterate_with_context(
                    raise_interrupts_immediately, futures):
                for step_event in result:
                    check.inst(step_event, DagsterEvent)
                    yield step_event
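
The dependency wiring above relies on Dask resolving futures passed as arguments. A self-contained sketch of the same pattern (hypothetical step function; assumes dask.distributed is installed):

from dask.distributed import Client, LocalCluster, as_completed

def run_step(upstream_results, name):
    # Stand-in for query_on_dask_worker: consume dependency results, do work.
    return f"{name} ran after {upstream_results}"

if __name__ == "__main__":
    with Client(LocalCluster(n_workers=1)) as client:
        a = client.submit(run_step, [], "a", key="pipeline.a")
        # Passing the future `a` as an argument makes Dask run `b` only
        # after `a` completes, mirroring execution_futures_dict above.
        b = client.submit(run_step, [a], "b", key="pipeline.b")
        for future, result in as_completed([a, b], with_results=True):
            print(future.key, result)
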
Example #7
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:

    output_def = step_context.solid_def.output_def_named(step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    manager_materializations = []
    manager_metadata_entries: List[Union[PartitionMetadataEntry, MetadataEntry]] = []

    # output_manager.handle_output is either a generator function, or a normal function with or
    # without a return value. In the case that handle_output is a normal function, we need to
    # catch errors should they be raised before a return value. We can do this by wrapping
    # handle_output in a generator so that errors will be caught within iterate_with_context.

    if not inspect.isgeneratorfunction(output_manager.handle_output):

        def _gen_fn():
            gen_output = output_manager.handle_output(output_context, output.value)
            for event in output_context.consume_events():
                yield event
            if gen_output:
                yield gen_output

        handle_output_gen = _gen_fn()
    else:
        handle_output_gen = output_manager.handle_output(output_context, output.value)

    for elt in iterate_with_context(
        lambda: solid_execution_error_boundary(
            DagsterExecutionHandleOutputError,
            msg_fn=lambda: (
                f'Error occurred while handling output "{output_context.name}" of '
                f'step "{step_context.step.key}":'
            ),
            step_context=step_context,
            step_key=step_context.step.key,
            output_name=output_context.name,
        ),
        handle_output_gen,
    ):
        for event in output_context.consume_events():
            yield event

        manager_metadata_entries.extend(output_context.consume_logged_metadata_entries())
        if isinstance(elt, DagsterEvent):
            yield elt
        elif isinstance(elt, AssetMaterialization):
            manager_materializations.append(elt)
        elif isinstance(elt, (MetadataEntry, PartitionMetadataEntry)):
            experimental_functionality_warning(
                "Yielding metadata from an IOManager's handle_output() function"
            )
            manager_metadata_entries.append(elt)
        else:
            raise DagsterInvariantViolationError(
                f"IO manager on output {output_def.name} has returned "
                f"value {elt} of type {type(elt).__name__}. The return type can only be "
                "one of AssetMaterialization, MetadataEntry, PartitionMetadataEntry."
            )

    for event in output_context.consume_events():
        yield event

    manager_metadata_entries.extend(output_context.consume_logged_metadata_entries())
    # do not alter explicitly created AssetMaterializations
    for materialization in manager_materializations:
        if materialization.metadata_entries and manager_metadata_entries:
            raise DagsterInvariantViolationError(
                f"When handling output '{output_context.name}' of {output_context.solid_def.node_type_str} '{output_context.solid_def.name}', received a materialization with metadata, while context.add_output_metadata was used within the same call to handle_output. Due to potential conflicts, this is not allowed. Please specify metadata in one place within the `handle_output` function."
            )
        if manager_metadata_entries:
            materialization = AssetMaterialization(
                asset_key=materialization.asset_key,
                description=materialization.description,
                metadata_entries=manager_metadata_entries,
                partition=materialization.partition,
                tags=materialization.tags,
                metadata=None,
            )
        yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager
    )
    if asset_key:
        for materialization in _get_output_asset_materializations(
            asset_key,
            partitions,
            output,
            output_def,
            manager_metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        metadata_entries=[
            entry for entry in manager_metadata_entries if isinstance(entry, MetadataEntry)
        ],
    )
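
The generator-wrapping comment in this example hinges on the fact that a generator function's body does not run until the first next(). A tiny standalone illustration (with a hypothetical handle_output stand-in):

def flaky_handle_output(context, obj):
    raise RuntimeError("boom")

def _gen_fn():
    gen_output = flaky_handle_output(None, 1)
    if gen_output:
        yield gen_output

gen = _gen_fn()  # no error yet: the body has not started executing
try:
    next(gen)    # the RuntimeError surfaces here, during iteration, where
                 # iterate_with_context's error boundary can catch it
except RuntimeError as exc:
    print(f"caught during iteration: {exc}")
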
Example #8
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:

    output_def = step_context.solid_def.output_def_named(
        step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    handle_output_res = output_manager.handle_output(output_context,
                                                     output.value)

    manager_materializations = []
    manager_metadata_entries = []
    if handle_output_res is not None:
        for elt in iterate_with_context(
                lambda: solid_execution_error_boundary(
                    DagsterExecutionHandleOutputError,
                    msg_fn=lambda:
                    (f'Error occurred while handling output "{output_context.name}" of '
                     f'step "{step_context.step.key}":'),
                    step_context=step_context,
                    step_key=step_context.step.key,
                    output_name=output_context.name,
                ),
                ensure_gen(handle_output_res),
        ):
            if isinstance(elt, AssetMaterialization):
                manager_materializations.append(elt)
            elif isinstance(elt, (EventMetadataEntry, PartitionMetadataEntry)):
                experimental_functionality_warning(
                    "Yielding metadata from an IOManager's handle_output() function"
                )
                manager_metadata_entries.append(elt)
            else:
                raise DagsterInvariantViolationError(
                    f"IO manager on output {output_def.name} has returned "
                    f"value {elt} of type {type(elt).__name__}. The return type can only be "
                    "one of AssetMaterialization, EventMetadataEntry, PartitionMetadataEntry."
                )

    # do not alter explicitly created AssetMaterializations
    for materialization in manager_materializations:
        yield DagsterEvent.asset_materialization(step_context, materialization,
                                                 input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager)
    if asset_key:
        for materialization in _get_output_asset_materializations(
                asset_key,
                partitions,
                output,
                output_def,
                manager_metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context,
                                                     materialization,
                                                     input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        message_override=(
            f'Handled output "{step_output_handle.output_name}" using intermediate storage'
            if isinstance(output_manager, IntermediateStorageAdapter)
            else None
        ),
        metadata_entries=[
            entry for entry in manager_metadata_entries
            if isinstance(entry, EventMetadataEntry)
        ],
    )
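
ensure_gen above normalizes handle_output's return value so both a generator and a single returned value can be iterated uniformly. A sketch of such a helper under that assumption (not the verbatim utility):

import inspect

def ensure_gen_sketch(value_or_gen):
    # Pass generators through unchanged; wrap a plain value (e.g. a single
    # AssetMaterialization) in a one-item generator.
    if inspect.isgenerator(value_or_gen):
        return value_or_gen

    def _gen():
        yield value_or_gen

    return _gen()
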
Example #9
def _store_output(
    step_context: StepExecutionContext,
    step_output_handle: StepOutputHandle,
    output: Union[Output, DynamicOutput],
    input_lineage: List[AssetLineageInfo],
) -> Iterator[DagsterEvent]:

    output_def = step_context.solid_def.output_def_named(step_output_handle.output_name)
    output_manager = step_context.get_io_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    manager_materializations = []
    manager_metadata_entries = []

    # output_manager.handle_output is either a generator function, or a normal function with or
    # without a return value. In the case that handle_output is a normal function, we need to
    # catch errors should they be raised before a return value. We can do this by wrapping
    # handle_output in a generator so that errors will be caught within iterate_with_context.

    if not inspect.isgeneratorfunction(output_manager.handle_output):

        def _gen_fn():
            gen_output = output_manager.handle_output(output_context, output.value)
            if gen_output:
                yield gen_output

        handle_output_gen = _gen_fn()
    else:
        handle_output_gen = output_manager.handle_output(output_context, output.value)

    for elt in iterate_with_context(
        lambda: solid_execution_error_boundary(
            DagsterExecutionHandleOutputError,
            msg_fn=lambda: (
                f'Error occurred while handling output "{output_context.name}" of '
                f'step "{step_context.step.key}":'
            ),
            step_context=step_context,
            step_key=step_context.step.key,
            output_name=output_context.name,
        ),
        handle_output_gen,
    ):
        if isinstance(elt, AssetMaterialization):
            manager_materializations.append(elt)
        elif isinstance(elt, (EventMetadataEntry, PartitionMetadataEntry)):
            experimental_functionality_warning(
                "Yielding metadata from an IOManager's handle_output() function"
            )
            manager_metadata_entries.append(elt)
        else:
            raise DagsterInvariantViolationError(
                f"IO manager on output {output_def.name} has returned "
                f"value {elt} of type {type(elt).__name__}. The return type can only be "
                "one of AssetMaterialization, EventMetadataEntry, PartitionMetadataEntry."
            )

    # do not alter explicitly created AssetMaterializations
    for materialization in manager_materializations:
        yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    asset_key, partitions = _asset_key_and_partitions_for_output(
        output_context, output_def, output_manager
    )
    if asset_key:
        for materialization in _get_output_asset_materializations(
            asset_key,
            partitions,
            output,
            output_def,
            manager_metadata_entries,
        ):
            yield DagsterEvent.asset_materialization(step_context, materialization, input_lineage)

    yield DagsterEvent.handled_output(
        step_context,
        output_name=step_output_handle.output_name,
        manager_key=output_def.io_manager_key,
        metadata_entries=[
            entry for entry in manager_metadata_entries if isinstance(entry, EventMetadataEntry)
        ],
    )
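
For a usage-side view, an IO manager whose handle_output is a generator yields exactly the event kinds these loops accept. A minimal sketch (assumes the public dagster IOManager and AssetMaterialization APIs; the class and asset names are illustrative):

from dagster import AssetMaterialization, IOManager

class SketchIOManager(IOManager):
    def handle_output(self, context, obj):
        # Persisting obj is elided; the yielded materialization is what the
        # iterate_with_context loops above collect.
        yield AssetMaterialization(asset_key="my_asset", description="stored obj")

    def load_input(self, context):
        raise NotImplementedError("illustration only")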