Example #1
0
def generate_solid(solid_id, num_inputs, num_outputs, num_cfg):
    """Build a synthetic SolidDefinition with the requested numbers of
    inputs, outputs, and optional config fields.

    Args:
        solid_id: Name for the generated solid.
        num_inputs: How many input definitions to declare.
        num_outputs: How many output definitions (and yielded Outputs) to declare.
        num_cfg: How many optional string config fields to declare.
    """

    def compute_fn(_context, **_kwargs):
        # Yield exactly one Output per declared output definition.
        for out_index in range(num_outputs):
            yield Output(out_index, f"out_{out_index}")

    config_schema = {f"field_{n}": Field(str, is_required=False) for n in range(num_cfg)}
    input_defs = [
        InputDefinition(name=f"in_{n}", default_value="default") for n in range(num_inputs)
    ]
    output_defs = [OutputDefinition(name=f"out_{n}") for n in range(num_outputs)]

    return SolidDefinition(
        name=solid_id,
        input_defs=input_defs,
        output_defs=output_defs,
        compute_fn=compute_fn,
        config_schema=config_schema,
    )
Example #2
0
def test_basic_solid_with_config():
    """A solid's config should be visible as context.solid_config in its transform fn."""
    captured = {}

    def _t_fn(context, _inputs):
        # Record what the framework hands us so we can assert on it afterwards.
        captured['yep'] = context.solid_config

    solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_field=Field(Dict({'some_config': Field(String)})),
        transform_fn=_t_fn,
    )

    environment = {'solids': {'solid_with_context': {'config': {'some_config': 'foo'}}}}
    execute_pipeline(PipelineDefinition(solids=[solid]), environment)

    assert 'yep' in captured
    assert 'some_config' in captured['yep']
Example #3
0
def test_wrong_solid_name():
    """Config keyed under a solid name that is not in the pipeline must fail validation."""
    only_solid = SolidDefinition(
        name='some_solid',
        input_defs=[],
        output_defs=[],
        config_field=Field(Int),
        compute_fn=lambda *_args: None,
    )
    pipeline_def = PipelineDefinition(
        name='pipeline_wrong_solid_name', solid_defs=[only_solid]
    )

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        # 'another_name' does not match any solid in the pipeline.
        execute_pipeline(pipeline_def, {'solids': {'another_name': {'config': {}}}})

    assert 'Undefined field "another_name" at path root:solids' in str(exc_info.value)
Example #4
0
def test_provided_default_on_resources_config():
    """A resource config default should surface on the environment type and in built config."""
    resource_def = ResourceDefinition(
        resource_fn=lambda: None,
        config_field=Field(
            Dict({'with_default_int': Field(Int, is_optional=True, default_value=23434)})
        ),
    )
    pipeline_def = PipelineDefinition(
        mode_definitions=[
            ModeDefinition(name='some_mode', resources={'some_resource': resource_def})
        ],
        solids=[
            SolidDefinition(name='some_solid', inputs=[], outputs=[], compute_fn=lambda *args: None)
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.type_attributes.is_system_config

    some_resource_field = env_type.fields['resources'].config_type.fields['some_resource']
    assert some_resource_field.is_optional

    # The inner field default flows up through the 'config' wrapper.
    some_resource_config_field = some_resource_field.config_type.fields['config']
    assert some_resource_config_field.is_optional
    assert some_resource_config_field.default_value == {'with_default_int': 23434}
    assert some_resource_field.default_value == {'config': {'with_default_int': 23434}}

    # Evaluating an empty environment should still pick up the resource default.
    value = EnvironmentConfig.from_dict(throwing_evaluate_config_value(env_type, {}))
    assert value.resources == {'some_resource': {'config': {'with_default_int': 23434}}}
Example #5
0
def test_solid_not_found():
    """Config addressed to a nonexistent solid raises DagsterInvariantViolationError."""

    def _t_fn(*_args):
        raise Exception('should not reach')

    pipeline = PipelineDefinition(
        solids=[
            SolidDefinition(
                name='find_me_solid',
                inputs=[],
                outputs=[],
                transform_fn=_t_fn,
            )
        ]
    )

    # 'not_found' does not name any solid in the pipeline.
    bad_env = config.Environment(solids={'not_found': config.Solid({'some_config': 1})})
    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline(pipeline, bad_env)
Example #6
0
def test_required_solid_with_required_subfield():
    """A solid with required config makes the whole solids branch of the env type required."""
    pipeline_def = PipelineDefinition(
        name="some_pipeline",
        solid_defs=[
            SolidDefinition(
                name="int_config_solid",
                config_schema={"required_field": String},
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)

    solids_field = env_type.fields["solids"]
    assert solids_field.is_required is True
    assert solids_field.config_type

    int_config_solid_field = solids_field.config_type.fields["int_config_solid"]
    assert int_config_solid_field.is_required is True
    assert int_config_solid_field.config_type.fields["config"].is_required is True

    # The execution section, by contrast, stays optional.
    assert env_type.fields["execution"].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def,
        {"solids": {"int_config_solid": {"config": {"required_field": "foobar"}}}},
    )
    assert env_obj.solids["int_config_solid"].config["required_field"] == "foobar"

    # Missing required config must fail whether the solids section is empty or absent.
    assert not process_config(env_type, {"solids": {}}).success
    assert not process_config(env_type, {}).success
Example #7
0
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
):
    '''Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[list[:class:`dagster.InputDefinition`]]): The solid's inputs.
        output_defs (Optional[list[:class:`dagster.OutputDefinition`]]): The solid's outputs.
        config (Optional[Any]): The config schema for the solid's config, made available on the
            execution context inside the notebook.
        required_resource_keys (Optional[set[str]]): The string names of any required resources.

    Returns:
        :class:`dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, 'output_defs', of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name, notebook_path),
        output_defs=output_defs,
        config=check_user_facing_opt_config_param(config, 'config'),
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        tags={'notebook_path': notebook_path, 'kind': 'ipynb'},
    )
Example #8
0
def test_config_arg_mismatch():
    """Passing an int where string config is expected raises DagsterTypeError."""

    def _t_fn(*_args):
        raise Exception('should not reach')

    pipeline = PipelineDefinition(
        solids=[
            SolidDefinition(
                name='solid_with_context',
                inputs=[],
                outputs=[],
                config_def=ConfigDefinition.config_dict({
                    'some_config': Field(types.String)
                }),
                transform_fn=_t_fn,
            )
        ]
    )

    # 1 is an int, but the schema declares 'some_config' as a String.
    bad_env = config.Environment(
        solids={'solid_with_context': config.Solid({'some_config': 1})}
    )
    with pytest.raises(DagsterTypeError):
        execute_pipeline(pipeline, bad_env)
Example #9
0
def define_more_complicated_config():
    """Pipeline fixture: one solid with required, optional, and defaulted config fields."""
    three_field_config = Field(
        Dict(
            {
                'field_one': Field(String),
                'field_two': Field(String, is_optional=True),
                'field_three': Field(String, is_optional=True, default_value='some_value'),
            }
        )
    )
    return PipelineDefinition(
        name='more_complicated_config',
        solids=[
            SolidDefinition(
                name='a_solid_with_three_field_config',
                inputs=[],
                outputs=[],
                transform_fn=lambda *_args: None,
                config_field=three_field_config,
            )
        ],
    )
Example #10
0
def create_templated_sql_transform_solid(name, sql, table_arguments, dependant_solids=None):
    """Build a solid that renders *sql* with table-name config and executes it.

    Each entry in ``table_arguments`` becomes a required String config field;
    each dependant solid becomes an input so ordering is enforced by the DAG.
    """
    check.str_param(name, 'name')
    check.str_param(sql, 'sql')
    check.list_param(table_arguments, 'table_arguments', of_type=str)
    dependant_solids = check.opt_list_param(
        dependant_solids, 'dependant_solids', of_type=SolidDefinition
    )

    field_dict = {table: Field(String) for table in table_arguments}

    return SolidDefinition(
        name=name,
        inputs=[InputDefinition(solid.name) for solid in dependant_solids],
        config_field=Field(Dict(field_dict)),
        transform_fn=_create_templated_sql_transform_with_output(sql),
        outputs=[
            OutputDefinition(name='result', dagster_type=Any),
            OutputDefinition(name='sql_text', dagster_type=SqlTextType),
        ],
    )
Example #11
0
def test_execution_plan_create_metadata():
    """step_metadata_fn output should land on the compute step of the execution plan."""
    solid_def = SolidDefinition(
        name='solid_metadata_creation',
        input_defs=[],
        output_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        config_field=Field(Dict({'str_value': Field(String)})),
        # Metadata is derived from the solid's own config at plan-creation time.
        step_metadata_fn=lambda env_config: {
            'computed': env_config.solids['solid_metadata_creation'].config['str_value'] + '1'
        },
    )
    p_def = PipelineDefinition(name='test_metadata', solid_defs=[solid_def])

    environment_dict = {
        'solids': {'solid_metadata_creation': {'config': {'str_value': 'foobar'}}}
    }
    execution_plan = create_execution_plan(p_def, environment_dict=environment_dict)

    compute_step = execution_plan.get_step_by_key('solid_metadata_creation.compute')
    assert compute_step
    assert compute_step.metadata == {'computed': 'foobar1'}
Example #12
0
def create_templated_sql_transform_solid(name, sql, table_arguments, dependant_solids=None):
    """Build a solid that renders *sql* using per-table config values.

    ``table_arguments`` entries become required String config fields; each
    dependant solid becomes an input so it runs first.
    """
    check.str_param(name, 'name')
    check.str_param(sql, 'sql')
    check.list_param(table_arguments, 'table_arguments', of_type=str)
    dependant_solids = check.opt_list_param(
        dependant_solids, 'dependant_solids', of_type=SolidDefinition
    )

    field_dict = {table: Field(types.String) for table in table_arguments}

    return SolidDefinition(
        name=name,
        inputs=[InputDefinition(solid.name) for solid in dependant_solids],
        config_def=ConfigDefinition.config_dict(field_dict),
        transform_fn=_create_templated_sql_transform_with_output(sql),
        outputs=[OutputDefinition()],
    )
Example #13
0
def test_required_solid_with_required_subfield():
    '''Required solid config makes the solids section of the environment type required.'''
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config_schema={'required_field': String},
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *_args: None,
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)

    solids_field = env_type.fields['solids']
    assert solids_field.is_required is True
    assert solids_field.config_type

    solid_field = solids_field.config_type.fields['int_config_solid']
    assert solid_field.is_required is True
    assert solid_field.config_type.fields['config'].is_required is True

    # Execution config stays optional even though solid config is required.
    assert env_type.fields['execution'].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def, {'solids': {'int_config_solid': {'config': {'required_field': 'foobar'}}}},
    )
    assert env_obj.solids['int_config_solid'].config['required_field'] == 'foobar'

    # Missing required config must fail, whether the solids section is empty or absent.
    assert not process_config(env_type, {'solids': {}}).success
    assert not process_config(env_type, {}).success
Example #14
0
def test_required_solid_with_required_subfield():
    '''Required solid config propagates into the environment type; missing config raises.'''
    solid_def = SolidDefinition(
        name='int_config_solid',
        config={'required_field': String},
        input_defs=[],
        output_defs=[],
        compute_fn=lambda *_args: None,
    )
    pipeline_def = PipelineDefinition(name='some_pipeline', solid_defs=[solid_def])

    env_type = create_environment_type(pipeline_def)

    solids_field = env_type.fields['solids']
    assert solids_field.is_required is True
    assert solids_field.config_type

    int_config_solid_field = solids_field.config_type.fields['int_config_solid']
    assert int_config_solid_field.is_required is True
    assert int_config_solid_field.config_type.fields['config'].is_required is True

    # The execution section remains optional.
    assert env_type.fields['execution'].is_required is False

    env_obj = EnvironmentConfig.build(
        pipeline_def, {'solids': {'int_config_solid': {'config': {'required_field': 'foobar'}}}},
    )
    assert env_obj.solids['int_config_solid'].config['required_field'] == 'foobar'

    # Both an empty solids section and a fully empty environment must raise.
    with pytest.raises(DagsterEvaluateConfigValueError):
        throwing_validate_config_value(env_type, {'solids': {}})

    with pytest.raises(DagsterEvaluateConfigValueError):
        throwing_validate_config_value(env_type, {})
Example #15
0
def test_config_arg_mismatch():
    """An int where the schema requires a string must fail config evaluation."""

    def _t_fn(*_args):
        raise Exception('should not reach')

    pipeline = PipelineDefinition(
        solids=[
            SolidDefinition(
                name='solid_with_context',
                inputs=[],
                outputs=[],
                config_field=Field(Dict({'some_config': Field(String)})),
                compute_fn=_t_fn,
            )
        ]
    )

    # 1 is an int, but 'some_config' is declared as String.
    bad_environment = {'solids': {'solid_with_context': {'config': {'some_config': 1}}}}
    with pytest.raises(PipelineConfigEvaluationError):
        execute_pipeline(pipeline, bad_environment)
Example #16
0
def test_provided_default_config():
    """Context config defaults propagate into the environment type and constructed config."""
    context_defs = {
        'some_context': PipelineContextDefinition(
            config_field=Field(
                Dict({'with_default_int': Field(Int, is_optional=True, default_value=23434)})
            ),
            context_fn=lambda *args: None,
        )
    }
    pipeline_def = PipelineDefinition(
        context_definitions=context_defs,
        solids=[
            SolidDefinition(
                name='some_solid', inputs=[], outputs=[], transform_fn=lambda *args: None
            )
        ],
    )

    env_type = pipeline_def.environment_type
    some_context_field = env_type.fields['context'].config_type.fields['some_context']
    assert some_context_field.is_optional

    some_context_config_field = some_context_field.config_type.fields['config']
    assert some_context_config_field.is_optional
    assert some_context_config_field.default_value == {'with_default_int': 23434}

    # The context-level default also carries empty resources/persistence defaults.
    assert some_context_field.default_value == {
        'config': {'with_default_int': 23434},
        'resources': {},
        'persistence': {'file': {}},
    }

    # Evaluating an empty environment should select the sole context by default.
    value = construct_environment_config(
        throwing_evaluate_config_value(pipeline_def.environment_type, {})
    )
    assert value.context.name == 'some_context'
    assert env_type.type_attributes.is_system_config
Example #17
0
# pylint: disable=unused-argument

from dagster import Int, Output, OutputDefinition, SolidDefinition, solid


# start_solid_definition_marker_0
@solid
def my_solid(context):
    """Minimal decorator-based example solid that returns the constant 1."""
    return 1


# end_solid_definition_marker_0

# start_solid_definition_marker_1
def _return_one(_context, inputs):
    # Compute fn for the example SolidDefinition below: always yields the integer 1.
    yield Output(1)


# NOTE(review): this rebinds the name `solid`, shadowing the imported @solid
# decorator for any code later in this snippet — intentional for the docs example.
solid = SolidDefinition(
    name="my_solid",
    input_defs=[],
    output_defs=[OutputDefinition(Int)],
    compute_fn=_return_one,
)
# end_solid_definition_marker_1
Example #18
0
def get_duplicate_solids():
    """Return two distinct SolidDefinitions that deliberately share the name 'a_solid'."""
    first = SolidDefinition("a_solid", [], lambda: None, [])
    second = SolidDefinition("a_solid", [], lambda: None, [])
    return (first, second)
Example #19
0
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        config_schema (Optional[Any]): The config schema for the solid's config.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str
    )

    # The executed notebook is delivered through the file manager resource.
    if output_notebook is not None:
        required_resource_keys.add("file_manager")

    # Accept either a single string or a list of strings as the prefix.
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]
    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    extra_output_defs = (
        [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
        if output_notebook
        else []
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(
            name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix
        ),
        output_defs=output_defs + extra_output_defs,
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description="This solid is backed by the notebook at {path}".format(path=notebook_path),
        tags={"notebook_path": notebook_path, "kind": "ipynb"},
    )
Example #20
0
    def get_context(self,
                    solid_config=None,
                    mode_def=None,
                    environment_dict=None):
        '''Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            environment_dict(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :class:`dagstermill.DagstermillExecutionContext`
        '''
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict,
                                                'environment_dict',
                                                key_type=str)

        # Throwaway solid purely to host the context; its compute fn is never run.
        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description=
            'Ephemeral solid constructed by dagstermill.get_context()',
        )

        # Default to an ephemeral mode with a console logger when no mode is given.
        # NOTE(review): this writes a 'loggers' entry into environment_dict, which may
        # be the caller's own dict — confirm opt_dict_param copies before relying on it.
        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        # Single-solid ephemeral pipeline wrapping the solid above.
        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name='ephemeral_dagstermill_pipeline')

        run_id = str(uuid.uuid4())

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode_def.name,
            reexecution_config=None,
            selector=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        # Record that we are outside a real pipeline execution before building the context.
        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        # Enter a scoped pipeline context to construct the interactive execution context.
        with scoped_pipeline_context(
                self.pipeline_def,
                environment_dict,
                pipeline_run,
                instance=DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context,
                                                       solid_config)

        return self.context
Example #21
0
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config_schema=None,
    required_resource_keys=None,
    output_notebook=None,
    asset_key_prefix=None,
    description=None,
    tags=None,
):
    """Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        config_schema (Optional[Any]): The config schema for the solid's config.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.AssetMaterialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline resources via the "file_manager" resource key, so, e.g.,
            if :py:class:`~dagster_aws.s3.s3_file_manager` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        asset_key_prefix (Optional[Union[List[str], str]]): If set, will be used to prefix the
            asset keys for materialized notebooks.
        description (Optional[str]): If set, description used for solid.
        tags (Optional[Dict[str, str]]): If set, additional tags used to annotate solid.
            Dagster uses the tag keys `notebook_path` and `kind`, which cannot be
            overwritten by the user.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    """
    check.str_param(name, "name")
    check.str_param(notebook_path, "notebook_path")
    input_defs = check.opt_list_param(input_defs, "input_defs", of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, "output_defs", of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str
    )

    # The executed notebook is handed back through the file manager resource.
    if output_notebook is not None:
        required_resource_keys.add("file_manager")

    # Accept either a single string or a list of strings as the prefix.
    if isinstance(asset_key_prefix, str):
        asset_key_prefix = [asset_key_prefix]
    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    description = check.opt_str_param(
        description,
        "description",
        default=f"This solid is backed by the notebook at {notebook_path}",
    )

    user_tags = validate_tags(tags)
    if tags is not None:
        # `notebook_path` and `kind` are reserved by Dagster; reject user attempts to set them.
        check.invariant(
            "notebook_path" not in tags,
            "user-defined solid tags contains the `notebook_path` key, but the `notebook_path` key is reserved for use by Dagster",
        )
        check.invariant(
            "kind" not in tags,
            "user-defined solid tags contains the `kind` key, but the `kind` key is reserved for use by Dagster",
        )
    default_tags = {"notebook_path": notebook_path, "kind": "ipynb"}

    extra_output_defs = (
        [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
        if output_notebook
        else []
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(
            name, notebook_path, output_notebook, asset_key_prefix=asset_key_prefix
        ),
        output_defs=output_defs + extra_output_defs,
        config_schema=config_schema,
        required_resource_keys=required_resource_keys,
        description=description,
        tags={**user_tags, **default_tags},
    )
def test_basic_solids_config():
    """Required solid config shows up as required on the env type; scaffold fills in stubs."""
    pipeline_def = PipelineDefinition(
        name='BasicSolidsConfigPipeline',
        solids=[
            SolidDefinition(
                name='required_field_solid',
                inputs=[],
                outputs=[],
                config_field=Field(Dict(fields={'required_int': Field(Int)})),
                compute_fn=lambda *_args: fail_me(),
            )
        ],
    )

    env_config_type = create_environment_type(pipeline_def)

    # The whole solids branch is required because the solid's config is required.
    assert env_config_type.fields['solids'].is_optional is False
    solids_config_type = env_config_type.fields['solids'].config_type
    required_solid_field = solids_config_type.fields['required_field_solid']
    assert required_solid_field.is_optional is False
    assert required_solid_field.config_type.fields['config'].is_optional is False

    loggers_config_type = env_config_type.fields['loggers'].config_type
    assert set(loggers_config_type.fields.keys()) == {'console'}

    console_logger_config_type = loggers_config_type.fields['console']
    assert set(console_logger_config_type.config_type.fields.keys()) == {'config'}
    assert console_logger_config_type.config_type.fields['config'].is_optional

    console_logger_config_config_type = (
        console_logger_config_type.config_type.fields['config'].config_type
    )
    assert set(console_logger_config_config_type.fields.keys()) == {'log_level', 'name'}

    # Scaffolding with skip_optional=False stubs every field, optional or not.
    assert scaffold_pipeline_config(pipeline_def, skip_optional=False) == {
        'loggers': {'console': {'config': {'log_level': '', 'name': ''}}},
        'solids': {'required_field_solid': {'config': {'required_int': 0}}},
        'expectations': {'evaluate': True},
        'execution': {},
        'resources': {},
        'storage': {
            'filesystem': {'base_dir': ''},
            'in_memory': {},
            's3': {'s3_bucket': ''},
        },
    }
Example #23
0
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
    output_notebook=None,
    config_schema=None,
):
    '''Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        config (Optional[Any]): Deprecated alias of ``config_schema`` (removed in 0.9.0).
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.Materialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline system storage, so, e.g., if :py:class:`~dagster_aws.s3.s3_system_storage`
            is configured, the output will be a :py:class:`~dagster_aws.s3.S3FileHandle`.
        config_schema (Optional[Any]): The config schema for the solid's config.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, 'output_defs', of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    # Reconcile the deprecated `config` spelling with `config_schema`.
    resolved_config_schema = canonicalize_backcompat_args(
        check_user_facing_opt_config_param(config_schema, 'config_schema'),
        'config_schema',
        check_user_facing_opt_config_param(config, 'config'),
        'config',
        '0.9.0',
    )

    extra_output_defs = (
        [OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
        if output_notebook
        else []
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name, notebook_path, output_notebook),
        output_defs=output_defs + extra_output_defs,
        config_schema=resolved_config_schema,
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        tags={'notebook_path': notebook_path, 'kind': 'ipynb'},
    )
Example #24
0
def sql_solid(name,
              select_statement,
              materialization_strategy,
              table_name=None,
              input_defs=None):
    '''Return a new solid that executes and materializes a SQL select statement.

    Args:
        name (str): The name of the new solid.
        select_statement (str): The select statement to execute.
        materialization_strategy (str): Must be 'table', the only currently supported
            materialization strategy. If 'table', the kwarg `table_name` must also be passed.
    Kwargs:
        table_name (str): The name of the new table to create, if the materialization strategy
            is 'table'. Default: None.
        input_defs (list[InputDefinition]): Inputs, if any, for the new solid. Default: None.

    Returns:
        SolidDefinition: The new SQL solid.

    Raises:
        Exception: If ``materialization_strategy`` is unsupported, or if it is
            'table' and ``table_name`` was not provided.
    '''
    input_defs = check.opt_list_param(input_defs, 'input_defs',
                                      InputDefinition)

    # Maps each supported materialization strategy to the dagster type of the
    # solid's single output. Only 'table' is implemented today; the commented
    # entries sketch strategies that may be supported later.
    materialization_strategy_output_types = {  # pylint:disable=C0103
        'table': SqlTableName,
        # 'view': String,
        # 'query': SqlAlchemyQueryType,
        # 'subquery': SqlAlchemySubqueryType,
        # 'result_proxy': SqlAlchemyResultProxyType,
        # could also materialize as a Pandas table, as a Spark table, as an intermediate file, etc.
    }

    if materialization_strategy not in materialization_strategy_output_types:
        raise Exception(
            'Invalid materialization strategy {materialization_strategy}, must '
            'be one of {materialization_strategies}'.format(
                materialization_strategy=materialization_strategy,
                materialization_strategies=str(
                    list(materialization_strategy_output_types.keys())),
            ))

    if materialization_strategy == 'table':
        if table_name is None:
            raise Exception(
                'Missing table_name: required for materialization strategy \'table\''
            )

    output_description = (
        'The string name of the new table created by the solid'
        if materialization_strategy == 'table' else
        'The materialized SQL statement. If the materialization_strategy is '
        '\'table\', this is the string name of the new table created by the solid.'
    )

    description = '''This solid executes the following SQL statement:
    {select_statement}'''.format(select_statement=select_statement)

    # n.b., we will eventually want to make this resources key configurable
    sql_statement = (
        'drop table if exists {table_name};\n'
        'create table {table_name} as {select_statement};').format(
            table_name=table_name, select_statement=select_statement)

    def compute_fn(context, _inputs):
        '''Execute the materializing SQL and yield the new table's name.

        Args:
            context (ComputeExecutionContext): Must expose a `db_info` resource whose
                `engine` can execute raw SQL against a database (e.g. a SQLAlchemy
                engine, per the `context.resources.db_info.engine.execute` call below).

        Yields:
            Output: The table name of the newly materialized SQL select statement,
                emitted on the default 'result' output.
        '''

        context.log.info('Executing sql statement:\n{sql_statement}'.format(
            sql_statement=sql_statement))
        context.resources.db_info.engine.execute(text(sql_statement))
        yield Output(value=table_name, output_name='result')

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        output_defs=[
            OutputDefinition(
                materialization_strategy_output_types[
                    materialization_strategy],
                description=output_description,
            )
        ],
        compute_fn=compute_fn,
        description=description,
        metadata={
            'kind': 'sql',
            'sql': sql_statement
        },
    )
Example #25
0
def test_whole_environment():
    '''Build the environment type for a pipeline with one mode, one resource,
    and two solids, then check both the generated config type names and the
    values produced by evaluating a complete environment dict.
    '''
    pipeline = PipelineDefinition(
        name='some_pipeline',
        mode_defs=[
            ModeDefinition(
                name='test_mode',
                resource_defs={
                    'test_resource': ResourceDefinition(
                        resource_fn=lambda: None,
                        config_field=Field(Any),
                    )
                },
            )
        ],
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config_field=Field(Int),
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
            SolidDefinition(
                name='no_config_solid',
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
        ],
    )

    env_type = create_environment_type(pipeline)

    # Generated config type names are derived from the pipeline/mode/solid names.
    resources_type = env_type.fields['resources'].config_type
    assert resources_type.name == 'SomePipeline.Mode.TestMode.Resources'
    solids_type = env_type.fields['solids'].config_type
    assert solids_type.name == 'SomePipeline.SolidsConfigDictionary'
    int_solid_type = solids_type.fields['int_config_solid'].config_type
    assert int_solid_type.name == 'SomePipeline.SolidConfig.IntConfigSolid'

    def make_env_dict():
        # Fresh dict per call so the evaluated and raw copies stay independent.
        return {
            'resources': {'test_resource': {'config': 1}},
            'solids': {'int_config_solid': {'config': 123}},
        }

    env = EnvironmentConfig.from_config_value(
        throwing_evaluate_config_value(env_type, make_env_dict()),
        make_env_dict(),
    )

    assert isinstance(env, EnvironmentConfig)
    assert env.solids == {'int_config_solid': SolidConfig(123)}
    assert env.resources == {'test_resource': {'config': 1}}
Example #26
0
def test_multiple_outputs_only_emit_one():
    '''A solid declaring two outputs but yielding only one should succeed,
    execute only the downstream solid wired to the emitted output, and raise
    when results are requested for outputs/solids that never produced one.
    '''

    def _emit_only_output_one(*_args):
        yield Result(output_name='output_one', value='foo')

    multiple_outputs = SolidDefinition(
        name='multiple_outputs',
        inputs=[],
        outputs=[
            OutputDefinition(name='output_one'),
            OutputDefinition(name='output_two'),
        ],
        transform_fn=_emit_only_output_one,
    )

    called = {}

    def _record_call(*_args, **_kwargs):
        called['one'] = True

    downstream_one = SolidDefinition(
        name='downstream_one',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        transform_fn=_record_call,
    )

    def _must_not_run(*_args, **_kwargs):
        raise Exception('do not call me')

    downstream_two = SolidDefinition(
        name='downstream_two',
        inputs=[InputDefinition('some_input')],
        outputs=[],
        transform_fn=_must_not_run,
    )

    pipeline = PipelineDefinition(
        solids=[multiple_outputs, downstream_one, downstream_two],
        dependencies={
            'downstream_one': {
                'some_input': DependencyDefinition(
                    multiple_outputs.name, output='output_one')
            },
            'downstream_two': {
                'some_input': DependencyDefinition(
                    multiple_outputs.name, output='output_two')
            },
        },
    )

    result = execute_pipeline(pipeline)
    assert result.success

    # Only the solid fed by the emitted output ran.
    assert called['one']
    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.transformed_values.keys()) == {'output_one'}

    with pytest.raises(
            DagsterInvariantViolationError,
            match='not_defined not defined in solid multiple_outputs'):
        solid_result.transformed_value('not_defined')

    with pytest.raises(DagsterInvariantViolationError,
                       match='Did not find result output_two'):
        solid_result.transformed_value('output_two')

    with pytest.raises(
            DagsterInvariantViolationError,
            match=
            'Try to get result for solid not_present in <<unnamed>>. No such solid.',
    ):
        result.result_for_solid('not_present')

    with pytest.raises(
            DagsterInvariantViolationError,
            match=
            'Did not find result for solid downstream_two in pipeline execution result',
    ):
        result.result_for_solid('downstream_two')
def create_solid_with_deps(name, *solid_deps):
    '''Build a solid named ``name`` with one input per upstream dependency.

    Each entry of ``solid_deps`` contributes an InputDefinition named after that
    upstream solid; the solid uses the shared ``_transform_fn`` and one default
    output.
    '''
    return SolidDefinition(
        name=name,
        inputs=[InputDefinition(dep.name) for dep in solid_deps],
        transform_fn=_transform_fn,
        outputs=[OutputDefinition()],
    )
Example #28
0
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
                self.context, DagstermillRuntimeExecutionContext):
            return self.context

        # Default to an ephemeral mode with a console logger, and wire that
        # logger into the run config so it is actually enabled.
        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        # Ephemeral no-op solid: exists only so a pipeline (and thus a context)
        # can be constructed around it.
        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description=
            "Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name="ephemeral_dagstermill_pipeline")

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        # Record on the instance that this context was built interactively,
        # not reconstituted inside a real pipeline run.
        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        environment_config = EnvironmentConfig.build(pipeline_def,
                                                     run_config,
                                                     mode=mode_def.name)

        pipeline = InMemoryPipeline(pipeline_def)
        execution_plan = ExecutionPlan.build(pipeline, environment_config)

        # NOTE(review): self.context is assigned inside the scoped pipeline
        # context but returned after the `with` block exits; presumably the
        # resources supplied via self._setup_resources outlive the scope for
        # interactive use — confirm before relying on live resources here.
        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
def test_basic_solids_config():
    '''A solid with required config makes the 'solids' section (and its own
    entry) required in the environment type, and scaffolding with
    skip_optional=False emits defaults for every field.
    '''
    pipeline = PipelineDefinition(
        name='BasicSolidsConfigPipeline',
        solids=[
            SolidDefinition(
                name='required_field_solid',
                inputs=[],
                outputs=[],
                config_field=Field(Dict(fields={'required_int': Field(Int)})),
                transform_fn=lambda *_args: fail_me(),
            )
        ],
    )

    env_type = pipeline.environment_type

    # Required solid config propagates "required-ness" up the config tree.
    assert env_type.fields['solids'].is_optional is False
    solids_type = env_type.fields['solids'].config_type
    assert solids_type.fields['required_field_solid'].is_optional is False
    required_solid_type = solids_type.fields['required_field_solid'].config_type
    assert required_solid_type.fields['config'].is_optional is False

    context_type = env_type.fields['context'].config_type
    assert 'default' in context_type.fields
    assert context_type.fields['default'].is_optional

    default_context_type = context_type.fields['default'].config_type
    assert set(default_context_type.fields.keys()) == {
        'config', 'resources', 'persistence'
    }

    default_context_user_type = default_context_type.fields['config'].config_type
    assert set(default_context_user_type.fields.keys()) == {'log_level'}

    expected_scaffold = {
        'context': {
            'default': {
                'config': {'log_level': ''},
                'persistence': {'file': {}},
                'resources': {},
            }
        },
        'solids': {
            'required_field_solid': {'config': {'required_int': 0}}
        },
        'expectations': {'evaluate': True},
        'execution': {},
    }
    assert scaffold_pipeline_config(pipeline, skip_optional=False) == expected_scaffold
Example #30
0
def define_dagstermill_solid(
    name,
    notebook_path,
    inputs=None,
    outputs=None,
    config_def=None,
):
    '''Wrap the notebook at ``notebook_path`` in a solid definition.

    The solid executes the notebook via papermill, passing serialized inputs and
    config as notebook parameters, then yields a ``Result`` for each declared
    output whose name appears in the executed notebook's data.

    Args:
        name (str): The name of the new solid.
        notebook_path (str): Path to the notebook backing the solid.
        inputs (list[InputDefinition], optional): Inputs for the solid. Default: None.
        outputs (list[OutputDefinition], optional): Outputs for the solid; only outputs
            whose names appear in the notebook's data are emitted. Default: None.
        config_def (optional): Config definition passed through to the SolidDefinition.

    Returns:
        SolidDefinition
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    # Fix: the param-name strings must match the actual parameter names
    # ('inputs'/'outputs', not 'input_defs'/'output_defs') so that check
    # failures report the argument the caller actually passed.
    inputs = check.opt_list_param(inputs, 'inputs', of_type=InputDefinition)
    outputs = check.opt_list_param(outputs, 'outputs', of_type=OutputDefinition)

    do_cleanup = False  # for now

    def _t_fn(info, inputs):
        # makedirs(exist_ok=True) avoids the check-then-create race of the
        # previous os.path.exists + os.mkdir pair when solids run concurrently.
        os.makedirs('/tmp/dagstermill/', exist_ok=True)

        # Unique scratch path so concurrent executions don't collide.
        temp_path = '/tmp/dagstermill/{prefix}-out.ipynb'.format(
            prefix=str(uuid.uuid4()))

        try:
            _source_nb = pm.execute_notebook(
                notebook_path,
                temp_path,
                parameters=dict(
                    inputs=serialize_dm_object(inputs),
                    config=serialize_dm_object(info.config),
                ),
            )

            output_nb = pm.read_notebook(temp_path)

            info.context.debug(
                'Notebook execution complete for {name}. Data is {data}'.
                format(
                    name=name,
                    data=output_nb.data,
                ))

            # Emit a Result only for declared outputs the notebook produced.
            for output_def in info.solid_def.output_defs:
                if output_def.name in output_nb.data:
                    yield Result(
                        deserialize_dm_object(output_nb.data[output_def.name]),
                        output_def.name,
                    )

        finally:
            if do_cleanup and os.path.exists(temp_path):
                os.remove(temp_path)

    return SolidDefinition(
        name=name,
        inputs=inputs,
        transform_fn=_t_fn,
        outputs=outputs,
        config_def=config_def,
        description='This solid is backed by the notebook at {path}'.format(
            path=notebook_path),
        metadata={
            'notebook_path': notebook_path,
        })