Example #1
def _dagster_compute_log_manager(base_dir):
    config = dagster_instance_config(base_dir)
    compute_log_base = os.path.join(base_dir, 'storage')
    if config and config.get('compute_logs'):
        if 'module' in config['compute_logs'] and 'class' in config[
                'compute_logs']:
            from dagster.core.storage.compute_log_manager import ComputeLogManager

            try:
                module = __import__(config['compute_logs']['module'])
                klass = getattr(module, config['compute_logs']['class'])
                check.subclass_param(klass, 'compute_log_manager',
                                     ComputeLogManager)
                kwargs = config['compute_logs'].get('config', {})
                compute_log_manager = klass(compute_log_base, **kwargs)
                check.inst_param(compute_log_manager, 'compute_log_manager',
                                 ComputeLogManager)
                return compute_log_manager
            except Exception:
                raise DagsterInvariantViolationError(
                    'Invalid dagster config in `{config_yaml_filename}`. Expecting `module`, '
                    '`class`, and `config`, returning a valid instance of '
                    '`ComputeLogManager`'.format(
                        config_yaml_filename=DAGSTER_CONFIG_YAML_FILENAME))

    from dagster.core.storage.local_compute_log_manager import LocalComputeLogManager

    return LocalComputeLogManager(compute_log_base)
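For reference, a minimal sketch of the compute_logs entry this helper expects in the instance config. The module, class, and config values below are hypothetical, not real dagster classes; only the key names (module, class, config) come from the code above.

# Hypothetical dagster config, expressed as the parsed dict that
# dagster_instance_config(base_dir) would return.
config = {
    'compute_logs': {
        'module': 'my_project.log_managers',      # imported via __import__
        'class': 'MyComputeLogManager',           # must subclass ComputeLogManager
        'config': {'bucket': 'my-compute-logs'},  # passed to the class as **kwargs
    }
}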
Example #2
def test_subclass_param():
    class Super(object):
        pass

    class Sub(Super):
        pass

    class Alone(object):
        pass

    assert check.subclass_param(Sub, "foo", Super)

    with pytest.raises(CheckError):
        assert check.subclass_param(Alone, "foo", Super)

    with pytest.raises(CheckError):
        assert check.subclass_param("value", "foo", Super)

    assert check.opt_subclass_param(Sub, "foo", Super)
    assert check.opt_subclass_param(None, "foo", Super) is None

    with pytest.raises(CheckError):
        assert check.opt_subclass_param(Alone, "foo", Super)

    with pytest.raises(CheckError):
        assert check.opt_subclass_param("value", "foo", Super)
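The test above pins down the contract of check.subclass_param and check.opt_subclass_param. A minimal sketch of that behavior (not dagster's implementation) for readers who want the semantics in one place:

import inspect


class CheckError(Exception):
    """Stand-in for dagster's CheckError, raised when a check fails."""


def subclass_param(obj, param_name, superclass):
    # Must be a class and a subclass of superclass; anything else fails the check.
    if not inspect.isclass(obj) or not issubclass(obj, superclass):
        raise CheckError(
            "Param {name} must be a subclass of {sup}, got {obj!r}".format(
                name=param_name, sup=superclass.__name__, obj=obj))
    return obj


def opt_subclass_param(obj, param_name, superclass):
    # Optional variant: None passes through untouched.
    if obj is None:
        return None
    return subclass_param(obj, param_name, superclass)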
Example #3
def user_code_error_boundary(error_cls, msg, **kwargs):
    '''
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in DagsterUserCodeExecutionError, and that the original stack
    trace of the user error is preserved, so that it can be reported without confusing
    framework code in the stack trace, if a tool author wishes to do so. This has
    been especially helpful in a notebooking context.
    '''
    check.str_param(msg, 'msg')
    check.subclass_param(error_cls, 'error_cls', DagsterUserCodeExecutionError)

    try:
        yield
    except Exception as e:  # pylint: disable=W0703
        if isinstance(e, DagsterError):
            # The system has thrown an error that is part of the user-framework contract
            raise e
        else:
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise_from(
                error_cls(msg,
                          user_exception=e,
                          original_exc_info=sys.exc_info(),
                          **kwargs), e)
Example #4
def user_code_context_manager(user_fn, error_cls, msg):
    '''Wraps the output of a user provided function that may yield or return a value and
    returns a generator that asserts it only yields a single value.
    '''
    check.callable_param(user_fn, 'user_fn')
    check.subclass_param(error_cls, 'error_cls', DagsterUserCodeExecutionError)

    with user_code_error_boundary(error_cls, msg):
        thing_or_gen = user_fn()
        gen = _ensure_gen(thing_or_gen)

        try:
            thing = next(gen)
        except StopIteration:
            check.failed('Must yield one item. You did not yield anything.')

        yield thing

        stopped = False

        try:
            next(gen)
        except StopIteration:
            stopped = True

        check.invariant(stopped, 'Must yield one item. Yielded more than one item')
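A hedged usage sketch for the helper above, assuming it is exposed as a context manager (the yield-based body suggests a contextlib.contextmanager wrapper in the surrounding module). The user function, error class, and message are hypothetical:

def init_resource():
    # Yields exactly one value, which satisfies the single-yield invariant checked above.
    yield {'connection': 'open'}


# MyUserCodeError is a hypothetical subclass of DagsterUserCodeExecutionError,
# as required by the subclass_param check on error_cls.
with user_code_context_manager(init_resource, MyUserCodeError, 'resource init failed') as resource:
    print(resource)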
Example #5
def make_airflow_dag_for_operator(
    recon_repo,
    pipeline_name,
    operator,
    run_config=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
):
    """Construct an Airflow DAG corresponding to a given Dagster pipeline and custom operator.

    `Custom operator template <https://github.com/dagster-io/dagster/blob/master/python_modules/dagster-test/dagster_test/dagster_airflow/custom_operator.py>`_

    Tasks in the resulting DAG will execute the Dagster logic they encapsulate, run by the given
    :py:class:`BaseOperator <airflow.models.BaseOperator>` subclass. If you
    are looking for a containerized solution to provide better isolation, see instead
    :py:func:`make_airflow_dag_containerized`.

    This function should be invoked in an Airflow DAG definition file, such as that created by an
    invocation of the dagster-airflow scaffold CLI tool.

    Args:
        recon_repo (:class:`dagster.ReconstructableRepository`): reference to a Dagster RepositoryDefinition
            that can be reconstructed in another process
        pipeline_name (str): The name of the pipeline definition.
        operator (type): The operator to use. Must be a class that inherits from
            :py:class:`BaseOperator <airflow.models.BaseOperator>`
        run_config (Optional[dict]): The environment config, if any, with which to compile
            the pipeline to an execution plan, as a Python dict.
        mode (Optional[str]): The mode in which to execute the pipeline.
        instance (Optional[DagsterInstance]): The Dagster instance to use to execute the pipeline.
        dag_id (Optional[str]): The id to use for the compiled Airflow DAG (passed through to
            :py:class:`DAG <airflow:airflow.models.DAG>`).
        dag_description (Optional[str]): The description to use for the compiled Airflow DAG
            (passed through to :py:class:`DAG <airflow:airflow.models.DAG>`)
        dag_kwargs (Optional[dict]): Any additional kwargs to pass to the Airflow
            :py:class:`DAG <airflow:airflow.models.DAG>` constructor, including ``default_args``.
        op_kwargs (Optional[dict]): Any additional kwargs to pass to the underlying Airflow
            operator.

    Returns:
        (airflow.models.DAG, List[airflow.models.BaseOperator]): The generated Airflow DAG, and a
        list of its constituent tasks.
    """
    check.subclass_param(operator, "operator", BaseOperator)

    return _make_airflow_dag(
        recon_repo=recon_repo,
        pipeline_name=pipeline_name,
        run_config=run_config,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
        operator=operator,
    )
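A hedged sketch of calling this from an Airflow DAG definition file, as the docstring suggests. The import paths, repository module, and custom operator are assumptions for illustration, not taken from the dagster docs:

from airflow.models import BaseOperator
from dagster.core.definitions.reconstructable import ReconstructableRepository
from dagster_airflow.factory import make_airflow_dag_for_operator


class MyCustomOperator(BaseOperator):
    # Must inherit from BaseOperator to satisfy the subclass_param check above;
    # a real operator would implement execute() and accept DagsterOperatorParameters.
    pass


dag, tasks = make_airflow_dag_for_operator(
    recon_repo=ReconstructableRepository.for_module("my_project.repo", "my_repository"),
    pipeline_name="my_pipeline",
    operator=MyCustomOperator,
    dag_kwargs={"default_args": {"owner": "data-eng"}},
)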
Example #6
    def register_type(self, type_to_register, type_storage_plugin):
        check.inst_param(type_to_register, 'type_to_register', RuntimeType)
        check.subclass_param(type_storage_plugin, 'type_storage_plugin', TypeStoragePlugin)
        check.invariant(
            type_to_register.name is not None,
            'Cannot register a type storage plugin for an anonymous type',
        )
        self.TYPE_STORAGE_PLUGIN_REGISTRY[type_to_register.name] = type_storage_plugin
Example #7
    def register_type(self, type_to_register, type_storage_plugin):
        from dagster.core.types.dagster_type import DagsterType

        check.inst_param(type_to_register, 'type_to_register', DagsterType)
        check.subclass_param(type_storage_plugin, 'type_storage_plugin', TypeStoragePlugin)
        check.invariant(
            type_to_register.name is not None,
            'Cannot register a type storage plugin for an anonymous type',
        )
        self._registry[type_to_register.name] = type_storage_plugin
Example #8
def user_code_error_boundary(error_cls,
                             msg_fn,
                             control_flow_exceptions=None,
                             **kwargs):
    """
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in an exception derived from DagsterUserCodeExecutionError,
    and that the original stack trace of the user error is preserved, so that it
    can be reported without confusing framework code in the stack trace, if a
    tool author wishes to do so. This has been especially helpful in a notebooking
    context.


    Examples:

    .. code-block:: python

        with user_code_error_boundary(
            # Pass a class that inherits from DagsterUserCodeExecutionError
            DagstermillExecutionError,
            # Pass a function that produces a message
            lambda: 'Error occurred during the execution of Dagstermill solid '
            '{solid_name}: {notebook_path}'.format(
                solid_name=name, notebook_path=notebook_path
            ),
        ):
            call_user_provided_function()

    """
    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    control_flow_exceptions = tuple(
        check.opt_list_param(control_flow_exceptions,
                             "control_flow_exceptions"))

    with raise_execution_interrupts():
        try:
            yield
        except control_flow_exceptions as cf:
            # A control flow exception has occurred and should be propagated
            raise cf
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise_from(
                error_cls(msg_fn(),
                          user_exception=e,
                          original_exc_info=sys.exc_info(),
                          **kwargs), e)
Example #9
    def __init__(self, scheduler_type, schedule_defs, artifacts_dir,
                 repository_name):
        from .storage import FilesystemScheduleStorage

        check.subclass_param(scheduler_type, 'scheduler_type', Scheduler)
        check.list_param(schedule_defs, 'schedule_defs', ScheduleDefinition)
        check.str_param(artifacts_dir, 'artifacts_dir')
        check.str_param(repository_name, 'repository_name')

        self._Scheduler = scheduler_type
        self._artifacts_dir = artifacts_dir
        self._schedule_defs = schedule_defs

        self._schedule_storage = FilesystemScheduleStorage(
            artifacts_dir, repository_name=repository_name)
Example #10
def whitelist_for_serdes(serializer: Union[Type, Type["Serializer"]]):
    """
    Decorator to whitelist a named tuple or enum to be serializable.

    @whitelist_for_serdes
    class

    """

    if inspect.isclass(serializer) and not issubclass(serializer, Serializer):
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=None)(serializer)
    else:
        check.subclass_param(serializer, "serializer", Serializer)
        serializer = cast(Type[Serializer], serializer)
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP, serializer=serializer)
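Taken together with Example #14, the decorator supports two call shapes: applied bare to the class being whitelisted, or given a Serializer subclass (positionally in this version, keyword-only as serializer= in Example #14). A hedged sketch with hypothetical classes:

from typing import NamedTuple


@whitelist_for_serdes                      # bare form: the named tuple itself is registered
class RunRecord(NamedTuple):               # hypothetical serializable record
    run_id: str


@whitelist_for_serdes(MyRecordSerializer)  # serializer form: MyRecordSerializer is a
class LegacyRecord(NamedTuple):            # hypothetical Serializer subclass
    run_id: str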
Example #11
def test_subclass_param():
    class Super(object):
        pass

    class Sub(Super):
        pass

    class Alone(object):
        pass

    assert check.subclass_param(Sub, 'foo', Super)

    with pytest.raises(CheckError):
        assert check.subclass_param(Alone, 'foo', Super)

    with pytest.raises(CheckError):
        assert check.subclass_param('value', 'foo', Super)
Example #12
def user_code_error_boundary(error_cls, msg_fn, log_manager=None, **kwargs):
    """
    Wraps the execution of user-space code in an error boundary. This places a uniform
    policy around any user code invoked by the framework. This ensures that all user
    errors are wrapped in an exception derived from DagsterUserCodeExecutionError,
    and that the original stack trace of the user error is preserved, so that it
    can be reported without confusing framework code in the stack trace, if a
    tool author wishes to do so.

    Examples:

    .. code-block:: python

        with user_code_error_boundary(
            # Pass a class that inherits from DagsterUserCodeExecutionError
            DagsterExecutionStepExecutionError,
            # Pass a function that produces a message
            "Error occurred during step execution"
        ):
            call_user_provided_function()

    """
    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)

    with raise_execution_interrupts():
        if log_manager:
            log_manager.begin_python_log_capture()
        try:
            yield
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack
            raise error_cls(msg_fn(),
                            user_exception=e,
                            original_exc_info=sys.exc_info(),
                            **kwargs) from e
        finally:
            if log_manager:
                log_manager.end_python_log_capture()
Example #13
    def rehydrate(self, **constructor_kwargs):
        from dagster.core.errors import DagsterInvalidConfigError
        from dagster.core.types.evaluator import evaluate_config

        module = importlib.import_module(self.module_name)
        klass = getattr(module, self.class_name)
        check.subclass_param(
            klass,
            'class {class_name} in module {module_name}'.format(
                class_name=self.class_name, module_name=self.module_name),
            ConfigurableClass,
        )

        config_dict = yaml.load(self.config_yaml)
        result = evaluate_config(klass.config_type().inst(), config_dict)
        if not result.success:
            raise DagsterInvalidConfigError(None, result.errors, config_dict)
        constructor_kwargs['inst_data'] = self
        return klass.from_config_value(result.value, **constructor_kwargs)
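rehydrate() above assumes the dynamically imported class implements the ConfigurableClass contract: a config_type() classmethod describing its config schema and a from_config_value() constructor that also receives inst_data. A rough skeleton with hypothetical names; the config type construction is elided:

class MyRunStorage(ConfigurableClass):
    # Hypothetical class that would be referenced by module_name/class_name.

    def __init__(self, base_dir, inst_data=None):
        self._base_dir = base_dir
        self._inst_data = inst_data

    @classmethod
    def config_type(cls):
        # Returns the config schema that evaluate_config validates against;
        # the concrete type construction is elided in this sketch.
        raise NotImplementedError

    @classmethod
    def from_config_value(cls, config_value, **constructor_kwargs):
        # Receives the validated config dict plus the inst_data injected by rehydrate().
        return cls(base_dir=config_value['base_dir'], **constructor_kwargs)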
Example #14
def whitelist_for_serdes(__cls: Optional[Type] = None,
                         *,
                         serializer: Optional[Type["Serializer"]] = None):
    """
    Decorator to whitelist a named tuple or enum to be serializable.

    @whitelist_for_serdes
    class

    """

    if __cls is not None:  # decorator invoked directly on class
        check.class_param(__cls, "__cls")
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP,
                                     serializer=None)(__cls)
    else:  # decorator passed params
        check.subclass_param(serializer, "serializer", Serializer)
        serializer = cast(Type[Serializer], serializer)
        return _whitelist_for_serdes(whitelist_map=_WHITELIST_MAP,
                                     serializer=serializer)
Example #15
    def rehydrate(self, **constructor_kwargs):
        from dagster.core.errors import DagsterInvalidConfigError
        from dagster.core.types.evaluator import evaluate_config

        try:
            module = importlib.import_module(self.module_name)
        except seven.ModuleNotFoundError:
            check.invariant(
                False,
                'Couldn\'t import module {module_name} when attempting to rehydrate the '
                'configurable class {configurable_class}'.format(
                    module_name=self.module_name,
                    configurable_class=self.module_name + '.' +
                    self.class_name,
                ),
            )
        try:
            klass = getattr(module, self.class_name)
        except AttributeError:
            check.invariant(
                False,
                'Couldn\'t find class {class_name} in module when attempting to rehydrate the '
                'configurable class {configurable_class}'.format(
                    class_name=self.class_name,
                    configurable_class=self.module_name + '.' +
                    self.class_name,
                ),
            )
        check.subclass_param(
            klass,
            'class {class_name} in module {module_name}'.format(
                class_name=self.class_name, module_name=self.module_name),
            ConfigurableClass,
        )

        config_dict = yaml.load(self.config_yaml)
        result = evaluate_config(klass.config_type().inst(), config_dict)
        if not result.success:
            raise DagsterInvalidConfigError(None, result.errors, config_dict)
        constructor_kwargs['inst_data'] = self
        return klass.from_config_value(result.value, **constructor_kwargs)
Example #16
def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs):
    """
    A specialization of user_code_error_boundary for the steps involved in executing a solid.
    This variant supports the control flow exceptions RetryRequested and Failure as well
    as respecting the RetryPolicy if present.
    """
    from dagster.core.execution.context.system import StepExecutionContext

    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    check.inst_param(step_context, "step_context", StepExecutionContext)

    with raise_execution_interrupts():
        try:
            yield
        except (RetryRequested, Failure) as cf:
            # A control flow exception has occurred and should be propagated
            raise cf
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de
        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack

            policy = step_context.solid_retry_policy
            if policy:
                # could check exc against a whitelist of exceptions
                raise RetryRequested(
                    max_retries=policy.max_retries,
                    # could support an enum of "delay curves" which use delay and
                    # step_context.previous_attempt_count to calculate wait time
                    seconds_to_wait=policy.delay,
                ) from e

            raise error_cls(
                msg_fn(),
                user_exception=e,
                original_exc_info=sys.exc_info(),
                **kwargs,
            ) from e
Example #17
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):

    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode, 'mode')

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id',
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, 'dag_description',
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, 'operator', DagsterOperator)
    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()
    if mode is None:
        mode = pipeline.get_default_mode_name()
    execution_plan = create_execution_plan(pipeline,
                                           environment_dict,
                                           mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_name, solid_steps in coalesced_plan.items():

        step_keys = [step.key for step in solid_steps]

        task = operator.operator_for_solid(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            solid_name=solid_name,
            step_keys=step_keys,
            dag=dag,
            dag_id=dag_id,
            op_kwargs=op_kwargs,
        )

        tasks[solid_name] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                prev_solid_name = execution_plan.get_step_by_key(
                    step_input.prev_output_handle.step_key).solid_name
                if solid_name != prev_solid_name:
                    tasks[prev_solid_name].set_downstream(task)

    return (dag, [tasks[solid_name] for solid_name in coalesced_plan.keys()])
Example #18
    def __init__(self, scheduler_type):
        self.scheduler_type = check.subclass_param(scheduler_type, 'scheduler_type', Scheduler)
Example #19
def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs):
    """
    A specialization of user_code_error_boundary for the steps involved in executing a solid.
    This variant supports the control flow exceptions RetryRequested and Failure as well
    as respecting the RetryPolicy if present.
    """
    from dagster.core.execution.context.system import StepExecutionContext

    check.callable_param(msg_fn, "msg_fn")
    check.subclass_param(error_cls, "error_cls", DagsterUserCodeExecutionError)
    check.inst_param(step_context, "step_context", StepExecutionContext)

    with raise_execution_interrupts():

        step_context.log.begin_python_log_capture()
        retry_policy = step_context.solid_retry_policy

        try:
            yield
        except DagsterError as de:
            # The system has thrown an error that is part of the user-framework contract
            raise de

        except Exception as e:  # pylint: disable=W0703
            # An exception has been thrown by user code and computation should cease
            # with the error reported further up the stack

            # Directly thrown RetryRequested escalate before evaluating the retry policy.
            if isinstance(e, RetryRequested):
                raise e

            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1),
                ) from e

            # Failure exceptions get re-throw without wrapping
            if isinstance(e, Failure):
                raise e

            # Otherwise wrap the user exception with context
            raise error_cls(
                msg_fn(),
                user_exception=e,
                original_exc_info=sys.exc_info(),
                **kwargs,
            ) from e

        except (DagsterExecutionInterruptedError, KeyboardInterrupt) as ie:
            # respect retry policy when interrupts occur
            if retry_policy:
                raise RetryRequested(
                    max_retries=retry_policy.max_retries,
                    seconds_to_wait=retry_policy.calculate_delay(
                        step_context.previous_attempt_count + 1),
                ) from ie
            else:
                raise ie

        finally:
            step_context.log.end_python_log_capture()
Example #20
def _make_airflow_dag(
    recon_repo,
    pipeline_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.str_param(pipeline_name, "pipeline_name")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode, "mode")
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, "instance", DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, "dag_description", _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, "operator", BaseOperator)

    dag_kwargs = dict(
        {"default_args": DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str)
    )

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = recon_repo.get_definition().get_pipeline(pipeline_name)

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            recon_repo=recon_repo,
            pipeline_name=pipeline_name,
            run_config=run_config,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()
            ),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #21
    def __whitelist_for_persistence(klass):
        check.subclass_param(klass, "klass", Persistable)
        whitelist_map["persistence"][klass.__name__] = klass
        return klass
Example #22
    def get_object(cls, object_store, context, runtime_type, paths):
        check.subclass_param(object_store, 'object_store', ObjectStore)
        return object_store.get_object(context, runtime_type, paths)
Example #23
    def get_object(cls, intermediate_store, context, runtime_type, paths):
        from .intermediate_store import IntermediateStore

        check.subclass_param(intermediate_store, 'intermediate_store',
                             IntermediateStore)
        return intermediate_store.get_object(context, runtime_type, paths)
Example #24
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (check.inst_param(instance, 'instance', DagsterInstance)
                if instance else DagsterInstance.get(
                    fallback_storage=seven.get_system_temp_directory()))

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id',
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, 'dag_description',
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, 'operator', BaseOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict,
                                           run_config=RunConfig(mode=mode))

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():

        step_keys = [step.key for step in solid_steps]

        # We separately construct the Airflow operators here with the appropriate args, because if
        # Airflow gets extraneous args/kwargs it emits a warning every time it parses the DAG (and
        # future Airflow versions will mark this a failure).
        # see https://github.com/ambv/black/issues/768
        # fmt: off
        if operator == DagsterPythonOperator:
            task = operator(handle=handle,
                            pipeline_name=pipeline_name,
                            environment_dict=environment_dict,
                            mode=mode,
                            task_id=solid_handle,
                            step_keys=step_keys,
                            dag=dag,
                            instance_ref=instance.get_ref(),
                            **op_kwargs)
        else:
            task = operator(pipeline_name=pipeline_name,
                            environment_dict=environment_dict,
                            mode=mode,
                            task_id=solid_handle,
                            step_keys=step_keys,
                            dag=dag,
                            instance_ref=instance.get_ref(),
                            **op_kwargs)
        # fmt: on

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #25
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode, 'mode')
    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (check.inst_param(instance, 'instance', DagsterInstance)
                if instance else DagsterInstance.get(
                    fallback_storage=seven.get_system_temp_directory()))

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id',
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, 'dag_description',
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, 'operator', BaseOperator)

    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict,
                                           run_config=RunConfig(mode=mode))

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #26
def _make_airflow_dag(
    pipeline,
    env_config=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)
    pipeline_name = pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', pipeline.name)
    dag_description = check.opt_str_param(
        dag_description, 'dag_description',
        _make_dag_description(pipeline_name, env_config))
    check.subclass_param(operator, 'operator', DagsterOperator)
    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict({'default_args': DEFAULT_ARGS},
                      **check.opt_dict_param(dag_kwargs,
                                             'dag_kwargs',
                                             key_type=str))
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    execution_plan = create_execution_plan(pipeline, env_config)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_name, solid_steps in coalesced_plan.items():

        step_keys = [step.key for step in solid_steps]

        task = operator.operator_for_solid(
            pipeline=pipeline,
            env_config=env_config,
            solid_name=solid_name,
            step_keys=step_keys,
            dag=dag,
            dag_id=dag_id,
            op_kwargs=op_kwargs,
        )

        tasks[solid_name] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                prev_solid_name = execution_plan.get_step_by_key(
                    step_input.prev_output_handle.step_key).tags['solid']
                if solid_name != prev_solid_name:
                    tasks[prev_solid_name].set_downstream(task)

    return (dag, [tasks[solid_name] for solid_name in coalesced_plan.keys()])
Example #27
    def __whitelist_for_persistence(klass):
        check.subclass_param(klass, 'klass', Persistable)
        whitelist_map['persistence'][klass.__name__] = klass
        return klass