Example #1
def test_intermediate_storage_def_to_io_manager_def():
    called = {}

    @intermediate_storage()
    def no_config_intermediate_storage(init_context):
        called["ran"] = True
        object_store = InMemoryObjectStore()
        return build_intermediate_storage_from_object_store(
            object_store=object_store, init_context=init_context)

    @solid
    def return_one(_):
        return 1

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "io_manager":
                io_manager_from_intermediate_storage(
                    no_config_intermediate_storage)
            })
    ])
    def foo():
        return_one()

    assert execute_pipeline(foo).success
Example #2
    def _get_config_schema(
        self,
        resource_defs: Optional[Dict[str, ResourceDefinition]],
        executor_def: "ExecutorDefinition",
    ) -> ConfigType:
        from .pipeline import PipelineDefinition

        return (PipelineDefinition(
            name=self.name,
            graph_def=self,
            mode_defs=[
                ModeDefinition(resource_defs=resource_defs,
                               executor_defs=[executor_def])
            ],
        ).get_run_config_schema("default").run_config_schema_type)
Example #3
    def _get_config_schema(
        self,
        resource_defs: Optional[Dict[str, ResourceDefinition]],
        executor_def: "ExecutorDefinition",
        logger_defs: Optional[Dict[str, LoggerDefinition]],
    ) -> ConfigType:
        from .job_definition import JobDefinition

        return (JobDefinition(
            name=self.name,
            graph_def=self,
            mode_def=ModeDefinition(
                resource_defs=resource_defs,
                executor_defs=[executor_def],
                logger_defs=logger_defs,
            ),
        ).get_run_config_schema("default").run_config_schema_type)
Example #4
import pytest

from dagster import ModeDefinition, execute_pipeline, pipeline, reconstructable, solid
from dagster.core.errors import DagsterUnmetExecutorRequirementsError
from dagster.core.storage.fs_io_manager import fs_io_manager
from dagster.core.test_utils import create_run_for_test, instance_for_test
from dagster.grpc.types import ExecuteStepArgs
from dagster_k8s.executor import K8sStepHandler, k8s_job_executor
from dagster_k8s.job import DagsterK8sJobConfig, UserDefinedDagsterK8sConfig


@solid
def foo():
    return 1


@pipeline(
    mode_defs=[
        ModeDefinition(
            executor_defs=[k8s_job_executor], resource_defs={"io_manager": fs_io_manager}
        )
    ]
)
def bar():
    foo()


def test_requires_k8s_launcher_fail():
    with instance_for_test() as instance:
        with pytest.raises(
            DagsterUnmetExecutorRequirementsError,
            match="This engine is only compatible with a K8sRunLauncher",
        ):
            execute_pipeline(reconstructable(bar), instance=instance)
Example #5
@solid
def bar_solid(_):
    return "bar"


@solid(tags={"foo": "bar"})
def baz_solid(_, bar):
    return bar * 2


@pipeline(mode_defs=[
    ModeDefinition(
        executor_defs=[test_step_delegating_executor],
        resource_defs={"io_manager": fs_io_manager},
    )
])
def foo_pipline():
    baz_solid(bar_solid())
    bar_solid()


def test_execute():
    TestStepHandler.reset()
    with instance_for_test() as instance:
        result = execute_pipeline(
            reconstructable(foo_pipline),
            instance=instance,
            run_config={
                "execution": {
Example #6
    materialize,
    get_materialization_lock,
    get_materialize_sensor_lock,
    lock_materialization_process,
    release_materialization_process,
)


@discord_message_on_failure
@discord_message_on_success
@pipeline(
    mode_defs=[
        ModeDefinition(
            "dev",
            resource_defs={
                "discord_webhook": discord_webhook,
                "timezone_config": timezone_config,
            },
        ),
    ],
    tags={
        "pipeline": "update_managed_materialized_views",
        "dagster-k8s/config": {
            "container_config": {
                "resources": {
                    "requests": {
                        "cpu": "50m",
                        "memory": "100Mi"
                    },
                    "limits": {
                        "cpu": "500m",
Example #7
    def execute_in_process(
        self,
        run_config: Any = None,
        instance: Optional["DagsterInstance"] = None,
        resources: Optional[Dict[str, Any]] = None,
        raise_on_error: bool = True,
        op_selection: Optional[List[str]] = None,
    ) -> "ExecuteInProcessResult":
        """
        Execute this graph in-process, collecting results in-memory.

        Args:
            run_config (Optional[Dict[str, Any]]):
                Run config to provide to execution. The configuration for the underlying graph
                should exist under the "ops" key.
            instance (Optional[DagsterInstance]):
                The instance to execute against. An ephemeral one will be used if none is provided.
            resources (Optional[Dict[str, Any]]):
                The resources required for execution, if any. Either resource instances or resource
                definitions can be provided.
            raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
                Defaults to ``True``.
            op_selection (Optional[List[str]]): A list of op selection queries (including single op
                names) to execute. For example:
                * ``['some_op']``: selects ``some_op`` itself.
                * ``['*some_op']``: select ``some_op`` and all its ancestors (upstream dependencies).
                * ``['*some_op+++']``: select ``some_op``, all its ancestors, and its descendants
                (downstream dependencies) within 3 levels down.
                * ``['*some_op', 'other_op_a', 'other_op_b+']``: select ``some_op`` and all its
                ancestors, ``other_op_a`` itself, and ``other_op_b`` and its direct child ops.

        Returns:
            :py:class:`~dagster.ExecuteInProcessResult`
        """
        from dagster.core.execution.build_resources import wrap_resources_for_execution
        from dagster.core.execution.execute_in_process import core_execute_in_process
        from dagster.core.instance import DagsterInstance
        from .job_definition import JobDefinition
        from .executor_definition import execute_in_process_executor

        instance = check.opt_inst_param(instance, "instance", DagsterInstance)
        resources = check.opt_dict_param(resources, "resources", key_type=str)

        resource_defs = wrap_resources_for_execution(resources)
        in_proc_mode = ModeDefinition(
            executor_defs=[execute_in_process_executor],
            resource_defs=resource_defs)
        ephemeral_job = JobDefinition(
            name=self._name, graph_def=self,
            mode_def=in_proc_mode).get_job_def_for_op_selection(op_selection)

        run_config = run_config if run_config is not None else {}
        op_selection = check.opt_list_param(op_selection, "op_selection", str)

        return core_execute_in_process(
            node=self,
            ephemeral_pipeline=ephemeral_job,
            run_config=run_config,
            instance=instance,
            output_capturing_enabled=True,
            raise_on_error=raise_on_error,
        )
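
A minimal usage sketch for the `execute_in_process` method above, assuming the `@op` and `@graph` decorators from the same dagster release; the `scale` op, `scaling_graph` graph, and the `offset` resource key are hypothetical names used only for illustration.

from dagster import graph, op


@op(config_schema={"factor": int}, required_resource_keys={"offset"})
def scale(context):
    # Op config lives under the "ops" key of run_config, per the docstring above.
    return context.op_config["factor"] * 2 + context.resources.offset


@graph
def scaling_graph():
    scale()


# Plain values passed via `resources` are wrapped into resource definitions for the
# ephemeral in-process job; `op_selection` restricts execution to the named op.
result = scaling_graph.execute_in_process(
    run_config={"ops": {"scale": {"config": {"factor": 21}}}},
    resources={"offset": 0},
    op_selection=["scale"],
)
assert result.success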
Example #8
    def to_job(
        self,
        name: Optional[str] = None,
        description: Optional[str] = None,
        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
        config: Optional[Union[ConfigMapping, Dict[str, Any],
                               "PartitionedConfig"]] = None,
        tags: Optional[Dict[str, Any]] = None,
        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
        executor_def: Optional["ExecutorDefinition"] = None,
        hooks: Optional[AbstractSet[HookDefinition]] = None,
        op_retry_policy: Optional[RetryPolicy] = None,
        version_strategy: Optional[VersionStrategy] = None,
        op_selection: Optional[List[str]] = None,
        partitions_def: Optional["PartitionsDefinition"] = None,
    ) -> "JobDefinition":
        """
        Make this graph into an executable Job by providing the remaining components required for execution.

        Args:
            name (Optional[str]):
                The name for the Job. Defaults to the name of this graph.
            resource_defs (Optional[Dict[str, ResourceDefinition]]):
                Resources that are required by this graph for execution.
                If not defined, `io_manager` will default to filesystem.
            config:
                Describes how the job is parameterized at runtime.

                If no value is provided, then the schema for the job's run config is a standard
                format based on its solids and resources.

                If a dictionary is provided, then it must conform to the standard config schema, and
                it will be used as the job's run config for the job whenever the job is executed.
                The values provided will be viewable and editable in the Dagit playground, so be
                careful with secrets.

                If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is
                determined by the config mapping, whose mapping function should return configuration
                in the standard format to configure the job.

                If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config
                values that can parameterize the job, as well as a function for mapping those
                values to the base config. The values provided will be viewable and editable in the
                Dagit playground, so be careful with secrets.
            tags (Optional[Dict[str, Any]]):
                Arbitrary metadata for any execution of the Job.
                Values that are not strings will be json encoded and must meet the criteria that
                `json.loads(json.dumps(value)) == value`.  These tag values may be overwritten by tag
                values provided at invocation time.
            logger_defs (Optional[Dict[str, LoggerDefinition]]):
                A dictionary of string logger identifiers to their implementations.
            executor_def (Optional[ExecutorDefinition]):
                How this Job will be executed. Defaults to :py:class:`multi_or_in_process_executor`,
                which can be switched between multi-process and in-process modes of execution. The
                default mode of execution is multi-process.
            op_retry_policy (Optional[RetryPolicy]): The default retry policy for all ops in this job.
                Only used if retry policy is not defined on the op definition or op invocation.
            version_strategy (Optional[VersionStrategy]):
                Defines how each solid (and optionally, resource) in the job can be versioned. If
                provided, memoization will be enabled for this job.
            partitions_def (Optional[PartitionsDefinition]): Defines a discrete set of partition
                keys that can parameterize the job. If this argument is supplied, the config
                argument can't also be supplied.

        Returns:
            JobDefinition
        """
        from .job_definition import JobDefinition
        from .partition import PartitionedConfig, PartitionsDefinition
        from .executor_definition import ExecutorDefinition, multi_or_in_process_executor

        job_name = check_valid_name(name or self.name)

        tags = check.opt_dict_param(tags, "tags", key_type=str)
        executor_def = check.opt_inst_param(
            executor_def,
            "executor_def",
            ExecutorDefinition,
            default=multi_or_in_process_executor)

        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            resource_defs_with_defaults = merge_dicts(
                {"io_manager": default_job_io_manager}, resource_defs or {})

        hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
        op_retry_policy = check.opt_inst_param(op_retry_policy,
                                               "op_retry_policy", RetryPolicy)
        op_selection = check.opt_list_param(op_selection,
                                            "op_selection",
                                            of_type=str)
        presets = []
        config_mapping = None
        partitioned_config = None

        if partitions_def:
            check.inst_param(partitions_def, "partitions_def",
                             PartitionsDefinition)
            check.invariant(
                config is None,
                "Can't supply both the 'config' and 'partitions_def' arguments"
            )
            partitioned_config = PartitionedConfig(partitions_def,
                                                   lambda _: {})

        if isinstance(config, ConfigMapping):
            config_mapping = config
        elif isinstance(config, PartitionedConfig):
            partitioned_config = config
        elif isinstance(config, dict):
            presets = [PresetDefinition(name="default", run_config=config)]
            # Using config mapping here is a trick to make it so that the preset will be used even
            # when no config is supplied for the job.
            config_mapping = _config_mapping_with_default_value(
                self._get_config_schema(resource_defs_with_defaults,
                                        executor_def, logger_defs),
                config,
                job_name,
                self.name,
            )
        elif config is not None:
            check.failed(
                f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
                f"is an object of type {type(config)}")

        return JobDefinition(
            name=job_name,
            description=description or self.description,
            graph_def=self,
            mode_def=ModeDefinition(
                resource_defs=resource_defs_with_defaults,
                logger_defs=logger_defs,
                executor_defs=[executor_def],
                _config_mapping=config_mapping,
                _partitioned_config=partitioned_config,
            ),
            preset_defs=presets,
            tags=tags,
            hook_defs=hooks,
            version_strategy=version_strategy,
            op_retry_policy=op_retry_policy,
        ).get_job_def_for_op_selection(op_selection)
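
A hedged usage sketch for the `to_job` method above; the `fetch` op, `fetch_graph` graph, job name, and tag values are hypothetical. A dict passed as `config` must conform to the standard run-config schema and is baked in as the "default" preset via the config-mapping trick noted in the code.

from dagster import graph, op


@op(config_schema={"limit": int})
def fetch(context):
    return list(range(context.op_config["limit"]))


@graph
def fetch_graph():
    fetch()


# The dict form of `config` is stored as a "default" preset and routed through a
# config mapping, so the job runs with this config even when no run config is
# supplied at launch time. `config` and `partitions_def` are mutually exclusive.
fetch_job = fetch_graph.to_job(
    name="fetch_job",
    config={"ops": {"fetch": {"config": {"limit": 10}}}},
    tags={"team": "data"},
)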
Example #9
    def to_job(
        self,
        name: Optional[str] = None,
        description: Optional[str] = None,
        resource_defs: Optional[Dict[str, ResourceDefinition]] = None,
        config: Optional[Union[ConfigMapping, Dict[str, Any],
                               "PartitionedConfig"]] = None,
        tags: Optional[Dict[str, str]] = None,
        logger_defs: Optional[Dict[str, LoggerDefinition]] = None,
        executor_def: Optional["ExecutorDefinition"] = None,
        hooks: Optional[AbstractSet[HookDefinition]] = None,
    ) -> "PipelineDefinition":
        """
        Make this graph into an executable Job by providing the remaining components required for execution.

        Args:
            name (Optional[str]):
                The name for the Job. Defaults to the name of this graph.
            resource_defs (Optional[Dict[str, ResourceDefinition]]):
                Resources that are required by this graph for execution.
                If not defined, `io_manager` will default to filesystem.
            config:
                Describes how the job is parameterized at runtime.

                If no value is provided, then the schema for the job's run config is a standard
                format based on its solids and resources.

                If a dictionary is provided, then it must conform to the standard config schema, and
                it will be used as the job's run config for the job whenever the job is executed.
                The values provided will be viewable and editable in the Dagit playground, so be
                careful with secrets.

                If a :py:class:`ConfigMapping` object is provided, then the schema for the job's run config is
                determined by the config mapping, whose mapping function should return configuration
                in the standard format to configure the job.

                If a :py:class:`PartitionedConfig` object is provided, then it defines a discrete set of config
                values that can parameterize the pipeline, as well as a function for mapping those
                values to the base config. The values provided will be viewable and editable in the
                Dagit playground, so be careful with secrets.
            tags (Optional[Dict[str, str]]):
                Arbitrary metadata for any execution of the Job.
                Values that are not strings will be json encoded and must meet the criteria that
                `json.loads(json.dumps(value)) == value`.  These tag values may be overwritten by tag
                values provided at invocation time.
            logger_defs (Optional[Dict[str, LoggerDefinition]]):
                A dictionary of string logger identifiers to their implementations.
            executor_def (Optional[ExecutorDefinition]):
                How this Job will be executed. Defaults to :py:class:`multiprocess_executor`.

        Returns:
            PipelineDefinition: The "Job" currently implemented as a single-mode pipeline
        """
        from .pipeline import PipelineDefinition
        from .partition import PartitionedConfig
        from .executor import ExecutorDefinition, multiprocess_executor

        tags = check.opt_dict_param(tags, "tags", key_type=str, value_type=str)
        executor_def = check.opt_inst_param(executor_def,
                                            "executor_def",
                                            ExecutorDefinition,
                                            default=multiprocess_executor)

        if resource_defs and "io_manager" in resource_defs:
            resource_defs_with_defaults = resource_defs
        else:
            resource_defs_with_defaults = merge_dicts(
                {"io_manager": default_job_io_manager}, resource_defs or {})

        hooks = check.opt_set_param(hooks, "hooks", of_type=HookDefinition)
        presets = []
        config_mapping = None
        partitioned_config = None

        if isinstance(config, ConfigMapping):
            config_mapping = config
        elif isinstance(config, PartitionedConfig):
            partitioned_config = config
        elif isinstance(config, dict):
            presets = [PresetDefinition(name="default", run_config=config)]
            # Using config mapping here is a trick to make it so that the preset will be used even
            # when no config is supplied for the job.
            config_mapping = _config_mapping_with_default_value(
                self._get_config_schema(resource_defs_with_defaults,
                                        executor_def), config)
        elif config is not None:
            check.failed(
                f"config param must be a ConfigMapping, a PartitionedConfig, or a dictionary, but "
                f"is an object of type {type(config)}")

        job_name = name or self.name

        return PipelineDefinition(
            name=job_name,
            description=description,
            graph_def=self,
            mode_defs=[
                ModeDefinition(
                    resource_defs=resource_defs_with_defaults,
                    logger_defs=logger_defs,
                    executor_defs=[executor_def],
                    _config_mapping=config_mapping,
                    _partitioned_config=partitioned_config,
                )
            ],
            preset_defs=presets,
            tags=tags,
            hook_defs=hooks,
        )
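
A sketch of the `ConfigMapping` branch of this older `to_job`, assuming `ConfigMapping` and the `@op`/`@graph` decorators are available in the same release; the `load` op, `load_graph` graph, and the simplified `verbose` schema are hypothetical. The mapped run config targets the default "console" logger, which is present when no `logger_defs` are supplied.

from dagster import ConfigMapping, graph, op


@op
def load():
    return "loaded"


@graph
def load_graph():
    load()


# The ConfigMapping exposes a simplified schema ({"verbose": bool}) and maps it onto
# run config in the standard format; to_job installs it as the mode's config mapping.
simplified = ConfigMapping(
    config_schema={"verbose": bool},
    config_fn=lambda outer: {
        "loggers": {
            "console": {"config": {"log_level": "DEBUG" if outer["verbose"] else "INFO"}}
        }
    },
)

load_job = load_graph.to_job(name="load_job", config=simplified)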