Example 1
def from_dagster_event_record(event_record, pipeline_name,
                              external_execution_plan):
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, 'event_record', EventRecord)
    check.param_invariant(event_record.is_dagster_event, 'event_record')
    check.str_param(pipeline_name, 'pipeline_name')
    check.opt_inst_param(external_execution_plan, 'external_execution_plan',
                         ExternalExecutionPlan)

    # circular ref at module scope
    from .errors import DauphinPythonError

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(event_record,
                                          external_execution_plan)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return DauphinExecutionStepStartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return DauphinExecutionStepSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        return DauphinExecutionStepUpForRetryEvent(
            error=dagster_event.step_retry_data.error,
            secondsToWait=dagster_event.step_retry_data.seconds_to_wait,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_RESTARTED:
        return DauphinExecutionStepRestartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return DauphinExecutionStepSuccessEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return DauphinExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return DauphinExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return DauphinStepMaterializationEvent(materialization=materialization,
                                               **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return DauphinStepExpectationResultEvent(
            expectation_result=expectation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return DauphinExecutionStepFailureEvent(
            error=DauphinPythonError(dagster_event.step_failure_data.error),
            failureMetadata=dagster_event.step_failure_data.user_failure_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return DauphinPipelineStartEvent(pipelineName=pipeline_name,
                                         **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return DauphinPipelineSuccessEvent(pipelineName=pipeline_name,
                                           **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return DauphinPipelineFailureEvent(pipelineName=pipeline_name,
                                           **basic_params)

    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return DauphinPipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=DauphinPythonError(
                dagster_event.pipeline_init_failure_data.error),
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return DauphinObjectStoreOperationEvent(
            operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return DauphinEngineEvent(
            metadataEntries=_to_dauphin_metadata_entries(
                dagster_event.engine_event_data.metadata_entries),
            error=DauphinPythonError(dagster_event.engine_event_data.error)
            if dagster_event.engine_event_data.error else None,
            marker_start=dagster_event.engine_event_data.marker_start,
            marker_end=dagster_event.engine_event_data.marker_end,
            **basic_params)
    else:
        raise Exception(
            'Unknown DAGSTER_EVENT type {inner_type} found in logs'.format(
                inner_type=dagster_event.event_type))
Example 2
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        serialization_strategy=None,
        auto_plugins=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None),
                        "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key),
                "key",
                "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            check.param_invariant(
                bool(name),
                "name",
                "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(materializer, "materializer",
                                                 DagsterTypeMaterializer)

        self.serialization_strategy = check.opt_inst_param(
            serialization_strategy,
            "serialization_strategy",
            SerializationStrategy,
            PickleSerializationStrategy(),
        )
        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        auto_plugins = check.opt_list_param(auto_plugins,
                                            "auto_plugins",
                                            of_type=type)

        check.param_invariant(
            all(
                issubclass(auto_plugin_type, TypeStoragePlugin)
                for auto_plugin_type in auto_plugins),
            "auto_plugins",
        )

        self.auto_plugins = auto_plugins

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".
            format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
Example 3
    def __init__(self, target_folder, overwrite=False):
        super(FSFileCache, self).__init__(overwrite=overwrite)
        check.str_param(target_folder, 'target_folder')
        check.param_invariant(os.path.isdir(target_folder), 'target_folder')

        self.target_folder = target_folder
Example 4
 def __init__(self, config_type):
     check.inst_param(config_type, 'config_type', ConfigType)
     check.param_invariant(config_type.kind == ConfigTypeKind.ENUM, 'config_type')
     self._config_type = config_type
     super(DauphinEnumConfigType, self).__init__(**_ctor_kwargs(config_type))
Example 5
def ensure_single_item(ddict):
    check.dict_param(ddict, 'ddict')
    check.param_invariant(len(ddict) == 1, 'ddict', 'Expected dict with single item')
    return list(ddict.items())[0]
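A brief usage sketch (values are illustrative, not from the source): the helper returns the single (key, value) pair, and the param_invariant fails for any other size.

item = ensure_single_item({'only_key': 1})      # -> ('only_key', 1)
ensure_single_item({'a': 1, 'b': 2})            # fails the check: 'Expected dict with single item'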
Example 6
def resolve_to_config_type(dagster_type):
    from .field_utils import convert_fields_to_dict_type

    # Short circuit if it's already a Config Type
    if isinstance(dagster_type, ConfigType):
        return dagster_type

    if isinstance(dagster_type, dict):
        return convert_fields_to_dict_type(dagster_type)

    if isinstance(dagster_type, list):
        if len(dagster_type) != 1:
            raise DagsterInvalidDefinitionError(
                "Array specifications must only be of length 1")

        inner_type = resolve_to_config_type(dagster_type[0])

        if not inner_type:
            raise DagsterInvalidDefinitionError(
                "Invalid member of array specification: {value} in list {the_list}"
                .format(value=repr(dagster_type[0]), the_list=dagster_type))
        return Array(inner_type)

    from dagster.core.types.dagster_type import DagsterType, List, ListType
    from dagster.core.types.python_set import Set, _TypedPythonSet
    from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple

    if _is_config_type_class(dagster_type):
        check.param_invariant(
            False,
            "dagster_type",
            f"Cannot pass config type class {dagster_type} to resolve_to_config_type. "
            "This error usually occurs when you pass a dagster config type class instead of a class instance into "
            'another dagster config type. E.g. "Noneable(Permissive)" should instead be "Noneable(Permissive())".',
        )

    if isinstance(dagster_type, type) and issubclass(dagster_type,
                                                     DagsterType):
        raise DagsterInvalidDefinitionError(
            "You have passed a DagsterType class {dagster_type} to the config system. "
            "The DagsterType and config schema systems are separate. "
            "Valid config values are:\n{desc}".format(
                dagster_type=repr(dagster_type),
                desc=VALID_CONFIG_DESC,
            ))

    if is_closed_python_optional_type(dagster_type):
        raise DagsterInvalidDefinitionError(
            "Cannot use typing.Optional as a config type. If you want this field to be "
            "optional, please use Field(<type>, is_required=False), and if you want this field to "
            "be required, but accept a value of None, use dagster.Noneable(<type>)."
        )

    if is_typing_type(dagster_type):
        raise DagsterInvalidDefinitionError((
            "You have passed in {dagster_type} to the config system. Types from "
            "the typing module in python are not allowed in the config system. "
            "You must use types that are imported from dagster or primitive types "
            "such as bool, int, etc.").format(dagster_type=dagster_type))

    if dagster_type is List or isinstance(dagster_type, ListType):
        raise DagsterInvalidDefinitionError(
            "Cannot use List in the context of config. " +
            helpful_list_error_string())

    if dagster_type is Set or isinstance(dagster_type, _TypedPythonSet):
        raise DagsterInvalidDefinitionError(
            "Cannot use Set in the context of a config field. " +
            helpful_list_error_string())

    if dagster_type is Tuple or isinstance(dagster_type, _TypedPythonTuple):
        raise DagsterInvalidDefinitionError(
            "Cannot use Tuple in the context of a config field. " +
            helpful_list_error_string())

    if isinstance(dagster_type, DagsterType):
        raise DagsterInvalidDefinitionError((
            "You have passed an instance of DagsterType {type_name} to the config "
            "system (Repr of type: {dagster_type}). "
            "The DagsterType and config schema systems are separate. "
            "Valid config values are:\n{desc}").format(
                type_name=dagster_type.display_name,
                dagster_type=repr(dagster_type),
                desc=VALID_CONFIG_DESC,
            ), )

    # If we get to this point, one of the following is true:
    #  1) We have been passed a python builtin
    #  2) We have been passed a dagster wrapping type that needs to be converted to its
    #     config variant, e.g. dagster.List
    #  3) We have been passed an invalid thing. We return False to signify this. It is
    #     up to callers to report a reasonable error.

    from dagster.primitive_mapping import (
        remap_python_builtin_for_config,
        is_supported_config_python_builtin,
    )

    if is_supported_config_python_builtin(dagster_type):
        return remap_python_builtin_for_config(dagster_type)

    if dagster_type is None:
        return ConfigAnyInstance
    if BuiltinEnum.contains(dagster_type):
        return ConfigType.from_builtin_enum(dagster_type)

    # This means that this is an error and we are returning False to the callsite.
    # We do the error reporting there because those callsites have more context.
    return False
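A hedged usage sketch of the resolution rules above (inputs are illustrative and rely on the imports already visible in the function):

array_type = resolve_to_config_type([str])     # single-element list -> Array of the resolved inner type
any_type = resolve_to_config_type(None)        # None -> ConfigAnyInstance
unresolved = resolve_to_config_type(object())  # unrecognized values fall through and return False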
Example 7
 def __new__(cls, dagster_type, incoming_fields):
     check.param_invariant(dagster_type.is_selector, 'dagster_type')
     return super(SelectorTypeErrorData, cls).__new__(
         cls, dagster_type, check.list_param(incoming_fields, 'incoming_fields', of_type=str)
     )
Example 8
    def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.opt_list_param(step_keys_to_execute,
                             'step_keys_to_execute',
                             of_type=str)

        step_key_set = None if step_keys_to_execute is None else set(
            step_keys_to_execute)

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )

        with time_execution_scope() as timer_result:
            check.param_invariant(
                isinstance(pipeline_context.executor_config, ExecutorConfig),
                'pipeline_context',
                'Expected executor_config to be ExecutorConfig got {}'.format(
                    pipeline_context.executor_config),
            )

            failed_or_skipped_steps = set()

            step_levels = execution_plan.topological_step_levels()

            # It would be good to implement a reference tracking algorithm here to
            # garbage collect results that are no longer needed by any steps
            # https://github.com/dagster-io/dagster/issues/811
            for step_level in step_levels:
                for step in step_level:
                    if step_key_set and step.key not in step_key_set:
                        continue

                    step_context = pipeline_context.for_step(step)

                    failed_inputs = []
                    for step_input in step.step_inputs:
                        failed_inputs.extend(
                            failed_or_skipped_steps.intersection(
                                step_input.dependency_keys))

                    if failed_inputs:
                        step_context.log.info((
                            'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                        ).format(step=step.key, failed_inputs=failed_inputs))
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step)
                    if uncovered_inputs:
                        # In partial pipeline execution, we may end up here without having validated the
                        # missing dependent outputs were optional
                        _assert_missing_inputs_optional(
                            uncovered_inputs, execution_plan, step.key)

                        step_context.log.info((
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}').format(
                                uncovered_inputs=uncovered_inputs,
                                step=step.key))
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    for step_event in check.generator(
                            dagster_event_sequence_for_step(step_context)):
                        check.inst(step_event, DagsterEvent)
                        if step_event.is_step_failure:
                            failed_or_skipped_steps.add(step.key)

                        yield step_event

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
                pid=os.getpid(),
                duration_ms=format_duration(timer_result.millis)),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )
Example 9
 def uri_for_paths(self, paths, protocol=None):
     check.list_param(paths, "paths", of_type=str)
     check.param_invariant(len(paths) > 0, "paths")
     key = self.key_for_paths(paths)
     return self.object_store.uri_for_key(key, protocol)
Example 10
def define_python_dagster_type(
    python_type,
    name=None,
    description=None,
    input_hydration_config=None,
    output_materialization_config=None,
    serialization_strategy=None,
    auto_plugins=None,
    type_check=None,
):
    '''Core machinery for defining a Dagster type corresponding to an existing python type.

    Users should generally use the :py:func:`@dagster_type` decorator or :py:func:`as_dagster_type`,
    both of which defer to this function.

    Args:
        python_type (cls): The python type to wrap as a Dagster type.
        name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of
            the ``python_type`` will be used.
        description (Optional[str]): A user-readable description of the type.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class constructed
            using the :py:func:`@input_hydration_config <dagster.InputHydrationConfig>` decorator
            that can map config data to a value of this type.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            constructed using the
            :py:func:`@output_materialization_config <dagster.output_materialization_config>`
            decorator that can persist values of this type.
        serialization_strategy (Optional[SerializationStrategy]): An instance of a class that
            inherits from :py:class:`SerializationStrategy`. The default strategy for serializing
            this value when automatically persisting it between execution steps. You should set
            this value if the ordinary serialization machinery (e.g., pickle) will not be adequate
            for this type.
        auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized differently
            depending on the storage being used for intermediates, they should specify this
            argument. In these cases the serialization_strategy argument is not sufficient because
            serialization requires specialized API calls, e.g. to call an S3 API directly instead
            of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.
        type_check (Optional[Callable[[Any], Union[bool, TypeCheck]]]): If specified, this function
            will be called in place of the default isinstance type check. This function should
            return ``True`` if the type check succeeds, ``False`` if it fails, or, if additional
            metadata should be emitted along with the type check success or failure, an instance of
            :py:class:`TypeCheck` with the ``success`` field set appropriately.
    '''

    check.type_param(python_type, 'python_type')
    check.opt_str_param(name, 'name', python_type.__name__)
    check.opt_str_param(description, 'description')
    check.opt_inst_param(input_hydration_config, 'input_hydration_config',
                         InputHydrationConfig)
    check.opt_inst_param(output_materialization_config,
                         'output_materialization_config',
                         OutputMaterializationConfig)
    check.opt_inst_param(
        serialization_strategy,
        'serialization_strategy',
        SerializationStrategy,
        default=PickleSerializationStrategy(),
    )

    auto_plugins = check.opt_list_param(auto_plugins,
                                        'auto_plugins',
                                        of_type=type)
    check.param_invariant(
        all(
            issubclass(auto_plugin_type, TypeStoragePlugin)
            for auto_plugin_type in auto_plugins),
        'auto_plugins',
    )

    check.opt_callable_param(type_check, 'type_check')

    class _ObjectType(PythonObjectType):
        def __init__(self):
            super(_ObjectType, self).__init__(
                python_type=python_type,
                name=name,
                description=description,
                input_hydration_config=input_hydration_config,
                output_materialization_config=output_materialization_config,
                serialization_strategy=serialization_strategy,
                auto_plugins=auto_plugins,
                type_check=type_check,
            )

    return _ObjectType
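A minimal usage sketch based on the docstring above; the Order class and the lambda type check are hypothetical, not part of the source:

class Order(object):
    def __init__(self, order_id):
        self.order_id = order_id

# Returns a PythonObjectType subclass wrapping Order with a custom isinstance-style check.
OrderType = define_python_dagster_type(
    python_type=Order,
    name='Order',
    description='A hypothetical domain object wrapped as a Dagster type.',
    type_check=lambda value: isinstance(value, Order),
)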
Example 11
    def has_object(self, key):
        check.str_param(key, 'key')
        check.param_invariant(len(key) > 0, 'key')

        return os.path.exists(key)
Example 12
    def __init__(
        self,
        type_check_fn,
        key=None,
        name=None,
        is_builtin=False,
        description=None,
        loader=None,
        materializer=None,
        required_resource_keys=None,
        kind=DagsterTypeKind.REGULAR,
        typing_type=None,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None),
                        "Must set key or name")

        if name is None:
            check.param_invariant(
                bool(key),
                "key",
                "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            check.param_invariant(
                bool(name),
                "name",
                "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(materializer, "materializer",
                                                 DagsterTypeMaterializer)

        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn,
                                                   "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".
            format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)

        self.typing_type = typing_type
Example 13
 def __getitem__(self, *args):
     check.param_invariant(len(args[0]) == 2, "args", "Must be two parameters")
     return create_typed_runtime_dict(args[0][0], args[0][1])
Example 14
    def execute(pipeline_context, execution_plan):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        dask_config = pipeline_context.executor_config

        check.param_invariant(
            isinstance(pipeline_context.executor_config, DaskConfig),
            'pipeline_context',
            'Expected executor_config to be DaskConfig got {}'.format(
                pipeline_context.executor_config),
        )

        # Checks to ensure storage is compatible with Dask configuration
        storage = pipeline_context.environment_dict.get('storage')
        check.invariant(storage.keys(),
                        'Must specify storage to use Dask execution')

        check.invariant(
            pipeline_context.instance.is_persistent,
            'Dask execution requires a persistent DagsterInstance',
        )

        # https://github.com/dagster-io/dagster/issues/2440
        check.invariant(
            pipeline_context.system_storage_def.is_persistent,
            'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
        )

        step_levels = execution_plan.execution_step_levels()

        pipeline_name = pipeline_context.pipeline_def.name

        instance = pipeline_context.instance

        with dask.distributed.Client(
                **dask_config.build_dict(pipeline_name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = []
                    for step_input in step.step_inputs:
                        for key in step_input.dependency_keys:
                            dependencies.append(execution_futures_dict[key])

                    environment_dict = dict(pipeline_context.environment_dict,
                                            execution={'in_process': {}})
                    variables = {
                        'executionParams': {
                            'selector': {
                                'name': pipeline_name
                            },
                            'environmentConfigData': environment_dict,
                            'mode': pipeline_context.mode_def.name,
                            'executionMetadata': {
                                'runId': pipeline_context.pipeline_run.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    dask_task_name = '%s.%s' % (pipeline_name, step.key)

                    future = client.submit(
                        query_on_dask_worker,
                        pipeline_context.pipeline.
                        get_reconstructable_repository(),
                        variables,
                        dependencies,
                        instance.get_ref(),
                        key=dask_task_name,
                        resources=get_dask_resource_requirements(step.tags),
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results to the
            # master
            for future in dask.distributed.as_completed(execution_futures):
                for step_event in future.result():
                    check.inst(step_event, DagsterEvent)

                    yield step_event
Example 15
 def _input_schema(_context, value):
     check.dict_param(value, "value")
     check.param_invariant(set(value.keys()) == field_names, "value")
     return value
Example 16
    def _t_fn(compute_context, inputs):
        check.inst_param(compute_context, "compute_context",
                         SolidExecutionContext)
        check.param_invariant(
            isinstance(compute_context.run_config, dict),
            "context",
            "SystemComputeExecutionContext must have valid run_config",
        )

        system_compute_context = compute_context.get_system_context()

        with seven.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                parameterized_notebook_path = os.path.join(
                    output_notebook_dir,
                    "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4())))

                executed_notebook_path = os.path.join(
                    output_notebook_dir,
                    "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4())))

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                nb_no_parameters = replace_parameters(
                    system_compute_context,
                    nb,
                    get_papermill_parameters(system_compute_context, inputs,
                                             output_log_path),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                with user_code_error_boundary(
                        DagstermillExecutionError,
                        lambda:
                    ("Error occurred during the execution of Dagstermill solid "
                     "{solid_name}: {notebook_path}".format(
                         solid_name=name, notebook_path=notebook_path)),
                ):
                    try:
                        papermill_engines.register("dagstermill",
                                                   DagstermillNBConvertEngine)
                        papermill.execute_notebook(
                            input_path=parameterized_notebook_path,
                            output_path=executed_notebook_path,
                            engine_name="dagstermill",
                            log_output=True,
                        )

                    except Exception as exc:  # pylint: disable=broad-except
                        try:
                            with open(executed_notebook_path, "rb") as fd:
                                executed_notebook_file_handle = compute_context.file_manager.write(
                                    fd, mode="wb", ext="ipynb")
                                executed_notebook_materialization_path = (
                                    executed_notebook_file_handle.path_desc)
                        except Exception as exc_inner:  # pylint: disable=broad-except
                            compute_context.log.warning(
                                "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}"
                                .format(exc=exc_inner))
                            executed_notebook_materialization_path = executed_notebook_path

                        yield AssetMaterialization(
                            asset_key=executed_notebook_materialization_path,
                            description=
                            "Location of output notebook in file manager",
                            metadata_entries=[
                                EventMetadataEntry.fspath(
                                    executed_notebook_materialization_path)
                            ],
                        )
                        raise exc

            system_compute_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}."
                .format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))

            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = compute_context.file_manager.write(
                        fd, mode="wb", ext="ipynb")
                    executed_notebook_materialization_path = executed_notebook_file_handle.path_desc
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}"
                    .format(exc=str(exc)))
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=executed_notebook_materialization_path,
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(
                        executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, output_def
                 ) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type,
                                       data_dict[output_name])

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        yield pickle.loads(fd.read())
Example 17
    def __init__(
        self,
        config,
        default_value=FIELD_NO_DEFAULT_PROVIDED,
        is_required=None,
        description=None,
    ):
        from .validate import validate_config
        from .post_process import resolve_defaults

        self.config_type = check.inst(self._resolve_config_arg(config),
                                      ConfigType)

        self.description = check.opt_str_param(description, "description")

        check.opt_bool_param(is_required, "is_required")

        if default_value != FIELD_NO_DEFAULT_PROVIDED:
            check.param_invariant(not (callable(default_value)),
                                  "default_value",
                                  "default_value cannot be a callable")

        if is_required is True:
            check.param_invariant(
                default_value == FIELD_NO_DEFAULT_PROVIDED,
                "default_value",
                "required arguments should not specify default values",
            )

        self._default_value = default_value

        # check explicit default value
        if self.default_provided:
            if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(
                    default_value):
                raise DagsterInvalidDefinitionError((
                    "You have passed into a python enum value as the default value "
                    "into of a config enum type {name}. You must pass in the underlying "
                    "string represention as the default value. One of {value_set}."
                ).format(
                    value_set=[
                        ev.config_value for ev in self.config_type.enum_values
                    ],
                    name=self.config_type.given_name,
                ))

            evr = validate_config(self.config_type, default_value)
            if not evr.success:
                raise DagsterInvalidConfigError(
                    "Invalid default_value for Field.",
                    evr.errors,
                    default_value,
                )

        if is_required is None:
            is_optional = has_implicit_default(
                self.config_type) or self.default_provided
            is_required = not is_optional

            # on implicitly optional - set the default value
            # by resolving the defaults of the type
            if is_optional and not self.default_provided:
                evr = resolve_defaults(self.config_type, None)
                if not evr.success:
                    raise DagsterInvalidConfigError(
                        "Unable to resolve implicit default_value for Field.",
                        evr.errors,
                        None,
                    )
                self._default_value = evr.value
        self._is_required = is_required
Example 18
    def __getitem__(self, *args):
        from .python_dict import create_typed_runtime_dict

        check.param_invariant(
            len(args[0]) == 2, 'args', 'Must be two parameters')
        return create_typed_runtime_dict(args[0][0], args[0][1])
Example 19
    def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.opt_list_param(step_keys_to_execute,
                             'step_keys_to_execute',
                             of_type=str)

        step_key_set = None if step_keys_to_execute is None else set(
            step_keys_to_execute)

        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config),
        )

        failed_or_skipped_steps = set()

        step_levels = execution_plan.topological_step_levels()

        intermediates_manager = pipeline_context.intermediates_manager

        # It would be good to implement a reference tracking algorithm here so we could
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for step_level in step_levels:
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                failed_inputs = [
                    step_input.prev_output_handle.step_key
                    for step_input in step.step_inputs
                    if step_input.prev_output_handle.step_key in
                    failed_or_skipped_steps
                ]
                if failed_inputs:
                    step_context.log.info((
                        'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                    ).format(step=step.key, failed_inputs=failed_inputs))
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                uncovered_inputs = intermediates_manager.uncovered_inputs(
                    step_context, step)
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having validated the
                    # missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs,
                                                    execution_plan, step.key)

                    step_context.log.info((
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}').format(
                            uncovered_inputs=uncovered_inputs, step=step.key))
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                input_values = _create_input_values(step_context,
                                                    intermediates_manager)

                for step_event in check.generator(
                        execute_step_in_memory(step_context, input_values,
                                               intermediates_manager)):
                    check.inst(step_event, DagsterEvent)
                    if step_event.is_step_failure:
                        failed_or_skipped_steps.add(step.key)

                    yield step_event
Example 20
def evaluate_composite_config(context):
    check.inst_param(context, 'context', TraversalContext)
    check.param_invariant(context.config_type.is_composite, 'composite_type')

    fields = context.config_type.fields

    if context.config_value and not isinstance(context.config_value, dict):
        return EvaluateValueResult.for_error(
            create_composite_type_mismatch_error(context))

    evaluate_value_result = _evaluate_composite_solid_config(context)
    if evaluate_value_result.errors or evaluate_value_result.value:
        return evaluate_value_result

    # ASK: this can crash on user error
    config_value = check.opt_dict_param(context.config_value,
                                        'incoming_value',
                                        key_type=str)

    defined_fields = set(fields.keys())
    incoming_fields = set(config_value.keys())
    extra_fields = list(incoming_fields - defined_fields)

    # We'll build up a dict of processed config values below
    errors = []
    output_config_value = {}

    # Here, we support permissive composites. In cases where we know the set of permissible keys a
    # priori, we validate against the config:
    if not context.config_type.is_permissive_composite:
        if extra_fields:
            if len(extra_fields) == 1:
                errors.append(
                    create_field_not_defined_error(context, extra_fields[0]))
            else:
                errors.append(
                    create_fields_not_defined_error(context, extra_fields))

    # And for permissive fields, we just pass along to the output without further validation
    else:
        for field_name in extra_fields:
            output_config_value[field_name] = config_value[field_name]

    # ...However, for any fields the user *has* told us about, we validate against their config
    # specifications
    missing_fields = []

    for key, field_def in fields.items():
        if key in incoming_fields:
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key, context.config_value[key]))
            if evaluate_value_result.errors:
                errors += evaluate_value_result.errors
            else:
                output_config_value[key] = evaluate_value_result.value

        elif field_def.is_optional:
            # Try to see if this is a composite solid
            speculative_composite_solid_result = _evaluate_composite_solid_config(
                context.for_field(
                    field_def, key, field_def.default_value
                    if field_def.default_provided else {}))
            if speculative_composite_solid_result.value is not None:
                output_config_value[
                    key] = speculative_composite_solid_result.value
            else:
                if field_def.default_provided:
                    output_config_value[key] = field_def.default_value

        else:
            check.invariant(not field_def.default_provided)
            missing_fields.append(key)

    if missing_fields:
        if len(missing_fields) == 1:
            errors.append(
                create_missing_required_field_error(context,
                                                    missing_fields[0]))
        else:
            errors.append(
                create_missing_required_fields_error(context, missing_fields))

    if errors:
        return EvaluateValueResult.for_errors(errors)
    else:
        return EvaluateValueResult.for_value(output_config_value)
Example 21
    def __init__(self, config_type):
        self._config_type = check.inst_param(config_type, 'config_type', ConfigType)
        check.param_invariant(config_type.kind == ConfigTypeKind.SCALAR_UNION, 'config_type')

        super(DauphinScalarUnionConfigType, self).__init__(**_ctor_kwargs(config_type))
Example 22
    def has_object(self, key):
        check.str_param(key, 'key')
        check.param_invariant(len(key) > 0, 'key')

        key_count = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=key)['KeyCount']
        return bool(key_count > 0)
Example 23
 def __init__(self, config_type):
     check.inst_param(config_type, 'config_type', ConfigType)
     check.param_invariant(ConfigTypeKind.has_fields(config_type.kind), 'config_type')
     self._config_type = config_type
     super(DauphinCompositeConfigType, self).__init__(**_ctor_kwargs(config_type))
Example 24
def ensure_single_item(ddict):
    check.dict_param(ddict, "ddict")
    check.param_invariant(
        len(ddict) == 1, "ddict", "Expected dict with single item")
    return list(ddict.items())[0]
Example 25
 def __init__(self, inner_dagster_type):
     self._inner_dagster_type = check.inst_param(inner_dagster_type,
                                                 "inner_dagster_type",
                                                 DagsterType)
     check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")
     self._schema_type = Array(inner_dagster_type.loader.schema_type)
Example 26
 def __new__(cls, config_type, list_index):
     check.int_param(list_index, 'list_index')
     check.param_invariant(list_index >= 0, 'list_index')
     return super(EvaluationStackListItemEntry, cls).__new__(
         cls, check.inst_param(config_type, 'config_type', ConfigType),
         list_index)
Example 27
def from_dagster_event_record(event_record, pipeline_name):
    from ..schema.errors import GraphenePythonError
    from ..schema.logs.events import (
        GrapheneEngineEvent,
        GrapheneExecutionStepFailureEvent,
        GrapheneExecutionStepInputEvent,
        GrapheneExecutionStepOutputEvent,
        GrapheneExecutionStepRestartEvent,
        GrapheneExecutionStepSkippedEvent,
        GrapheneExecutionStepStartEvent,
        GrapheneExecutionStepSuccessEvent,
        GrapheneExecutionStepUpForRetryEvent,
        GrapheneHandledOutputEvent,
        GrapheneHookCompletedEvent,
        GrapheneHookErroredEvent,
        GrapheneHookSkippedEvent,
        GrapheneLoadedInputEvent,
        GrapheneObjectStoreOperationEvent,
        GraphenePipelineCanceledEvent,
        GraphenePipelineCancelingEvent,
        GraphenePipelineDequeuedEvent,
        GraphenePipelineEnqueuedEvent,
        GraphenePipelineFailureEvent,
        GraphenePipelineInitFailureEvent,
        GraphenePipelineStartEvent,
        GraphenePipelineStartingEvent,
        GraphenePipelineSuccessEvent,
        GrapheneStepExpectationResultEvent,
        GrapheneStepMaterializationEvent,
    )

    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, "event_record", EventRecord)
    check.param_invariant(event_record.is_dagster_event, "event_record")
    check.str_param(pipeline_name, "pipeline_name")

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(event_record)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return GrapheneExecutionStepStartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return GrapheneExecutionStepSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        return GrapheneExecutionStepUpForRetryEvent(
            error=dagster_event.step_retry_data.error,
            secondsToWait=dagster_event.step_retry_data.seconds_to_wait,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_RESTARTED:
        return GrapheneExecutionStepRestartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return GrapheneExecutionStepSuccessEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return GrapheneExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return GrapheneExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return GrapheneStepMaterializationEvent(
            materialization=materialization, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return GrapheneStepExpectationResultEvent(
            expectation_result=expectation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return GrapheneExecutionStepFailureEvent(
            error=GraphenePythonError(dagster_event.step_failure_data.error),
            failureMetadata=dagster_event.step_failure_data.user_failure_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_ENQUEUED:
        return GraphenePipelineEnqueuedEvent(pipelineName=pipeline_name,
                                             **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_DEQUEUED:
        return GraphenePipelineDequeuedEvent(pipelineName=pipeline_name,
                                             **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_STARTING:
        return GraphenePipelineStartingEvent(pipelineName=pipeline_name,
                                             **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELING:
        return GraphenePipelineCancelingEvent(pipelineName=pipeline_name,
                                              **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELED:
        return GraphenePipelineCanceledEvent(pipelineName=pipeline_name,
                                             **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return GraphenePipelineStartEvent(pipelineName=pipeline_name,
                                          **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return GraphenePipelineSuccessEvent(pipelineName=pipeline_name,
                                            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return GraphenePipelineFailureEvent(
            pipelineName=pipeline_name,
            error=GraphenePythonError(
                dagster_event.pipeline_failure_data.error) if
            (dagster_event.pipeline_failure_data
             and dagster_event.pipeline_failure_data.error) else None,
            **basic_params,
        )

    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return GraphenePipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=GraphenePythonError(
                dagster_event.pipeline_init_failure_data.error),
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HANDLED_OUTPUT:
        return GrapheneHandledOutputEvent(
            output_name=dagster_event.event_specific_data.output_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.LOADED_INPUT:
        return GrapheneLoadedInputEvent(
            input_name=dagster_event.event_specific_data.input_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            upstream_output_name=dagster_event.event_specific_data.
            upstream_output_name,
            upstream_step_key=dagster_event.event_specific_data.
            upstream_step_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return GrapheneObjectStoreOperationEvent(
            operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return GrapheneEngineEvent(
            metadataEntries=_to_metadata_entries(
                dagster_event.engine_event_data.metadata_entries),
            error=GraphenePythonError(dagster_event.engine_event_data.error)
            if dagster_event.engine_event_data.error else None,
            marker_start=dagster_event.engine_event_data.marker_start,
            marker_end=dagster_event.engine_event_data.marker_end,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HOOK_COMPLETED:
        return GrapheneHookCompletedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_SKIPPED:
        return GrapheneHookSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_ERRORED:
        return GrapheneHookErroredEvent(error=GraphenePythonError(
            dagster_event.hook_errored_data.error),
                                        **basic_params)
    else:
        raise Exception(
            "Unknown DAGSTER_EVENT type {inner_type} found in logs".format(
                inner_type=dagster_event.event_type))
Example 28
def validate_selector_config_value(selector_type, config_value, stack):
    check.param_invariant(selector_type.is_selector, 'selector_type')
    check.inst_param(stack, 'stack', EvaluationStack)

    if config_value and not isinstance(config_value, dict):
        yield EvaluationError(
            stack=stack,
            reason=DagsterEvaluationErrorReason.RUNTIME_TYPE_MISMATCH,
            message=
            'Value for selector type {type_name} must be a dict got {value}'.
            format(type_name=selector_type.name, value=config_value),
            error_data=RuntimeMismatchErrorData(config_type=selector_type,
                                                value_rep=repr(config_value)),
        )
        return

    if config_value and len(config_value) > 1:
        incoming_fields = sorted(list(config_value.keys()))
        defined_fields = sorted(list(selector_type.fields.keys()))
        yield EvaluationError(
            stack=stack,
            reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
            message=
            ('You can only specify a single field. You specified {incoming_fields}. '
             'The available fields are {defined_fields}').format(
                 incoming_fields=incoming_fields,
                 defined_fields=defined_fields),
            error_data=SelectorTypeErrorData(dagster_type=selector_type,
                                             incoming_fields=incoming_fields),
        )
        return

    elif not config_value:
        defined_fields = sorted(list(selector_type.fields.keys()))
        if len(selector_type.fields) > 1:
            yield EvaluationError(
                stack=stack,
                reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
                message=(
                    'Must specify a field if more than one is defined. Defined fields: '
                    '{defined_fields}').format(defined_fields=defined_fields),
                error_data=SelectorTypeErrorData(dagster_type=selector_type,
                                                 incoming_fields=[]),
            )
            return

        field_name, field_def = single_item(selector_type.fields)

        if not field_def.is_optional:
            yield EvaluationError(
                stack=stack,
                reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
                message=('Must specify the required field. Defined fields: '
                         '{defined_fields}').format(
                             defined_fields=defined_fields),
                error_data=SelectorTypeErrorData(dagster_type=selector_type,
                                                 incoming_fields=[]),
            )
            return

        incoming_field_value = field_def.default_value if field_def.default_provided else None

    else:
        check.invariant(config_value and len(config_value) == 1)

        field_name, incoming_field_value = single_item(config_value)
        if field_name not in selector_type.fields:
            yield create_field_not_defined_error(selector_type, stack,
                                                 field_name)
            return

    parent_field = selector_type.fields[field_name]
    for error in _validate_config(
            parent_field.config_type,
            incoming_field_value,
            stack_with_field(stack, field_name, parent_field),
    ):
        yield error
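In short, the rule enforced above is that a selector config must be a dict naming exactly one of the selector's defined fields, with special handling when the config is omitted entirely. The standalone sketch below mimics that rule for a hypothetical selector with made-up 'filesystem' and 's3' fields; it is a simplification for illustration, not the Dagster API.

# Hypothetical, simplified illustration of the selector rule above: exactly one
# of the defined fields may be provided.
DEFINED_FIELDS = {"filesystem", "s3"}


def selector_errors(config_value):
    if config_value and not isinstance(config_value, dict):
        return ["must be a dict, got {!r}".format(config_value)]
    if config_value and len(config_value) > 1:
        return ["only one field allowed, got {}".format(sorted(config_value))]
    if config_value:
        (field_name,) = config_value
        if field_name not in DEFINED_FIELDS:
            return ["undefined field {!r}".format(field_name)]
    return []


print(selector_errors({"s3": {"bucket": "my-bucket"}}))    # [] -> valid
print(selector_errors({"s3": {}, "filesystem": {}}))       # one error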
Example No. 29
    def __init__(
        self,
        config,
        default_value=FIELD_NO_DEFAULT_PROVIDED,
        is_optional=None,
        is_required=None,
        description=None,
    ):
        from .validate import validate_config
        from .post_process import post_process_config

        self.config_type = check.inst(self._resolve_config_arg(config),
                                      ConfigType)

        self.description = check.opt_str_param(description, 'description')

        check.opt_bool_param(is_optional, 'is_optional')
        check.opt_bool_param(is_required, 'is_required')

        canonical_is_required = canonicalize_backcompat_args(
            new_val=is_required,
            new_arg='is_required',
            old_val=is_optional,
            old_arg='is_optional',
            coerce_old_to_new=lambda val: not val,
            additional_warn_txt='"is_optional" is deprecated in 0.7.0 and will be removed in 0.8.0. '
            'Users should use "is_required" instead.',
        )

        if canonical_is_required is True:
            check.param_invariant(
                default_value == FIELD_NO_DEFAULT_PROVIDED,
                'default_value',
                'required arguments should not specify default values',
            )
        self._default_value = default_value

        # check explicit default value
        if self.default_provided:
            # invoke through property in case it is callable
            value = self.default_value
            evr = validate_config(self.config_type, value)
            if not evr.success:
                raise DagsterInvalidConfigError(
                    'Invalid default_value for Field.',
                    evr.errors,
                    default_value,
                )

        if canonical_is_required is None:
            # neither is_required nor is_optional were specified
            canonical_is_required = not all_optional_type(self.config_type)

            # on implicitly optional - set the default value
            # by resolving the defaults of the type
            if not canonical_is_required and self._default_value == FIELD_NO_DEFAULT_PROVIDED:
                evr = post_process_config(self.config_type, None)
                if not evr.success:
                    raise DagsterInvalidConfigError(
                        'Unable to resolve implicit default_value for Field.',
                        evr.errors,
                        None,
                    )
                self._default_value = evr.value
        self._is_required = canonical_is_required
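This constructor folds the deprecated is_optional flag into is_required and validates any explicit default_value against the field's config type. A typical way to reach it, assuming a Dagster release of this era where the public Field API accepts is_required, is sketched below.

# Sketch of exercising the constructor above through the public Field API
# (assumes a Dagster version where `is_required` has replaced `is_optional`).
from dagster import Field

# Optional field with an explicit default; the default is validated against str
# by the constructor above.
timeout_field = Field(str, default_value="30s", is_required=False)

# Required field: supplying a default_value here would trip the param_invariant
# that required fields may not declare defaults.
bucket_field = Field(str, is_required=True)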
Example No. 30
def _core_celery_execution_loop(pipeline_context, execution_plan,
                                step_execution_fn):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config,
                   (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config, got {}'.format(
            pipeline_context.executor_config),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    # The sort key is applied in ascending order, so negate the step and run
    # priorities to make higher-priority steps sort first.
    def priority_for_step(step):
        return -1 * int(
            step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority)
        ) + -1 * _get_run_priority(pipeline_context)

    def priority_for_key(step_key):
        return priority_for_step(execution_plan.get_step_by_key(step_key))

    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[str, celery.AsyncResult], keyed by step_key
    step_errors = {}
    completed_steps = set()  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries,
        sort_key_fn=priority_for_step)
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(),
                                       key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here, e.g. subclass Task,
                    # and certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(
                        sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(
                pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG,
                                      task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'
                    .format(step_key=step.key, queue=queue),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority)

            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'
            .format(error_list='\n'.join([
                '[{step}]: {err}'.format(step=key, err=err.to_string())
                for key, err in step_errors.items()
            ])),
            subprocess_error_infos=list(step_errors.values()),
        )
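Stepping back, the loop above is a submit-and-poll pattern: submit Celery tasks in priority order, poll the outstanding AsyncResults once per tick, and keep draining results after a failure before raising. A generic skeleton of the polling half, written against plain Celery result objects rather than the dagster-celery internals, might look like the following.

# Generic submit-and-poll skeleton in the same shape as the loop above; not the
# dagster-celery code, just an illustration against celery.result.AsyncResult.
import time

TICK_SECONDS = 1


def poll_until_done(pending_results):
    """Poll a dict of {key: celery.result.AsyncResult} until every task settles.

    Returns a dict of {key: exception} for tasks that failed.
    """
    errors = {}
    while pending_results:
        for key, result in list(pending_results.items()):
            if not result.ready():
                continue
            try:
                result.get()
            except Exception as exc:  # pylint: disable=broad-except
                errors[key] = exc
            del pending_results[key]
        time.sleep(TICK_SECONDS)
    return errors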