def test_output_definition_single_partition_materialization():

    entry1 = MetadataEntry("nrows", value=123)
    entry2 = MetadataEntry("some value", value=3.21)

    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(output_defs=[OutputDefinition(name="output2", asset_key=lambda _: AssetKey("table2"))])
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    check_materialization(materializations[0], AssetKey(["table1"]), metadata_entries=[entry1])
    check_materialization(
        materializations[1],
        AssetKey(["table2"]),
        metadata_entries=[entry2],
        parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
    )
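

# The tests in this listing rely on a `check_materialization` helper that the
# excerpt does not show. A minimal sketch of what it presumably asserts, judging
# from its call sites (asset key, metadata entries, optional parent lineage):
def check_materialization(materialization, asset_key, parent_assets=None, metadata_entries=None):
    event_data = materialization.event_specific_data
    assert event_data.materialization.asset_key == asset_key
    assert event_data.materialization.metadata_entries == (metadata_entries or [])
    assert event_data.asset_lineage == (parent_assets or [])
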
def test_io_manager_single_partition_materialization():

    entry1 = MetadataEntry.int(123, "nrows")
    entry2 = MetadataEntry.float(3.21, "some value")

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            yield entry1

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
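            # Dagster consults this hook to decide which asset key the handled
            # output maps to; here each step's output becomes its own asset,
            # keyed by the step name.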
            return AssetKey([context.step_key])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(output_defs=[OutputDefinition(name="output1")])
    def solid1(_):
        return Output(None, "output1")

    @solid(output_defs=[OutputDefinition(name="output2")])
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={"io_manager": my_io_manager})
    ])
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    check_materialization(materializations[0],
                          AssetKey(["solid1"]),
                          metadata_entries=[entry1])
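    # entry1 appears on both materializations because MyIOManager.handle_output
    # yields it for every output it handles; solid2's Output adds entry2.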
    check_materialization(
        materializations[1],
        AssetKey(["solid2"]),
        metadata_entries=[entry1, entry2],
        parent_assets=[AssetLineageInfo(AssetKey(["solid1"]))],
    )
def test_dynamic_output_definition_single_partition_materialization():

    entry1 = MetadataEntry("nrows", value=123)
    entry2 = MetadataEntry("some value", value=3.21)

    @solid(output_defs=[
        OutputDefinition(name="output1", asset_key=AssetKey("table1"))
    ])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(output_defs=[
        DynamicOutputDefinition(
            name="output2",
            asset_key=lambda context: AssetKey(context.mapping_key))
    ])
    def solid2(_, _input1):
        for i in range(4):
            yield DynamicOutput(
                7,
                mapping_key=str(i),
                output_name="output2",
                metadata_entries=[entry2],
            )

    @solid
    def do_nothing(_, _input1):
        pass
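
    # solid2 fans out four dynamic outputs; .map wires each one into do_nothing,
    # and each dynamic output is materialized under its own mapping_key asset.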

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    check_materialization(materializations[0],
                          AssetKey(["table1"]),
                          metadata_entries=[entry1])
    seen_paths = set()
    for i in range(1, 5):
        path = materializations[i].asset_key.path
        seen_paths.add(tuple(path))
        check_materialization(
            materializations[i],
            AssetKey(path),
            metadata_entries=[entry2],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    assert len(seen_paths) == 4
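
# Fragment: a solid that attaches a PartitionMetadataEntry for partition "3" to
# an output; the surrounding test (not shown) presumably asserts that emitting a
# partition entry for an undeclared partition fails.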
def fail_solid(_):
    yield Output(
        None,
        metadata_entries=[
            PartitionMetadataEntry("3", MetadataEntry.int(1, "x"))
        ],
    )
Example #5
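# Method excerpted from a custom pickling IO manager; `self._get_path`,
# `self.write_mode`, `PICKLE_PROTOCOL`, and `mkdir_p` come from the enclosing
# class and module, which this excerpt omits.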
    def handle_output(self, context, obj):
        """Pickle the data and store the object to a custom file path.

        This method emits an AssetMaterialization event so the assets will be tracked by the
        Asset Catalog.
        """
        check.inst_param(context, "context", OutputContext)
        metadata = context.metadata
        path = check.str_param(metadata.get("path"), "metadata.path")

        filepath = self._get_path(path)

        # Ensure path exists
        mkdir_p(os.path.dirname(filepath))
        context.log.debug(f"Writing file at: {filepath}")

        with open(filepath, self.write_mode) as write_obj:
            pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

        return AssetMaterialization(
            asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),
            metadata_entries=[
                MetadataEntry("path", value=MetadataValue.path(os.path.abspath(filepath)))
            ],
        )
Example #6
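# Method in the style of dagstermill's local output-notebook IO manager; the
# enclosing class (omitted here) supplies `self._get_path` and `self.write_mode`.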
    def handle_output(self, context: OutputContext, obj: bytes):
        """obj: bytes"""
        check.inst_param(context, "context", OutputContext)

        # the output notebook itself is stored at output_file_path
        output_notebook_path = self._get_path(context)
        mkdir_p(os.path.dirname(output_notebook_path))
        with open(output_notebook_path, self.write_mode) as dest_file_obj:
            dest_file_obj.write(obj)
        yield MetadataEntry.fspath(path=output_notebook_path, label="path")
def test_source_asset_metadata():
    sa = SourceAsset(key=AssetKey("foo"),
                     metadata={
                         "foo": "bar",
                         "baz": object()
                     })
    assert sa.metadata_entries == [
        MetadataEntry(label="foo",
                      description=None,
                      entry_data=MetadataValue.text("bar")),
        MetadataEntry(
            label="baz",
            description=None,
            entry_data=MetadataValue.text("[object] (unserializable)"),
        ),
    ]
    assert sa.metadata == {
        "foo": MetadataValue.text("bar"),
        "baz": MetadataValue.text("[object] (unserializable)"),
    }
Example #8
def test_metadata_entries():

    metadata_entry = MetadataEntry("foo", None, MetadataValue.text("bar"))

    # We use `Output` as a stand-in for all events here; they all follow the same pattern of calling
    # `normalize_metadata`.
    with pytest.warns(DeprecationWarning,
                      match=re.escape('"metadata_entries" is deprecated')):
        Output("foo", "bar", metadata_entries=[metadata_entry])

    with pytest.warns(DeprecationWarning,
                      match=re.escape('"metadata_entries" is deprecated')):
        DagsterType(lambda _, __: True,
                    "foo",
                    metadata_entries=[metadata_entry])
Example #9
def test_table_metadata_value_schema_inference():

    table_metadata_value = MetadataEntry.table(
        records=[
            TableRecord(name="foo", status=False),
            TableRecord(name="bar", status=True),
        ],
        label="foo",
    )

    schema = table_metadata_value.entry_data.schema
    assert isinstance(schema, TableSchema)
    assert schema.columns == [
        TableColumn(name="name", type="string"),
        TableColumn(name="status", type="bool"),
    ]
Example #10
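# _t_fn is the compute function that dagstermill's solid/op factory wraps around
# a notebook: it injects parameters, executes the notebook via papermill, then
# replays outputs and events scraped back out of the executed notebook.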
    def _t_fn(step_context, inputs):
        check.inst_param(step_context, "step_context", SolidExecutionContext)
        check.param_invariant(
            isinstance(step_context.run_config, dict),
            "context",
            "StepExecutionContext must have valid run_config",
        )

        step_execution_context = step_context.get_step_execution_context()

        with tempfile.TemporaryDirectory() as output_notebook_dir:
            with safe_tempfile_path() as output_log_path:

                prefix = str(uuid.uuid4())
                parameterized_notebook_path = os.path.join(
                    output_notebook_dir, f"{prefix}-inter.ipynb")

                executed_notebook_path = os.path.join(output_notebook_dir,
                                                      f"{prefix}-out.ipynb")

                # Scaffold the registration here
                nb = load_notebook_node(notebook_path)
                compute_descriptor = ("solid" if dagster_factory_name
                                      == "define_dagstermill_solid" else "op")
                nb_no_parameters = replace_parameters(
                    step_execution_context,
                    nb,
                    get_papermill_parameters(step_execution_context, inputs,
                                             output_log_path,
                                             compute_descriptor),
                )
                write_ipynb(nb_no_parameters, parameterized_notebook_path)

                try:
                    papermill_engines.register("dagstermill",
                                               DagstermillEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name="dagstermill",
                        log_output=True,
                    )

                except Exception as ex:
                    step_execution_context.log.warn(
                        "Error when attempting to materialize executed notebook: {exc}"
                        .format(exc=str(
                            serializable_error_info_from_exc_info(
                                sys.exc_info()))))
                    # pylint: disable=no-member
                    # compat:
                    if isinstance(
                            ex,
                            ExecutionError) and (ex.ename == "RetryRequested"
                                                 or ex.ename == "Failure"):
                        step_execution_context.log.warn(
                            f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                            "with RetryRequested or Failure to trigger their behavior."
                        )

                    raise

            step_execution_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}."
                .format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                ))
            if output_notebook_name is not None:
                # yield output notebook binary stream as a solid output
                with open(executed_notebook_path, "rb") as fd:
                    yield Output(fd.read(), output_notebook_name)

            else:
                # backcompat
                executed_notebook_file_handle = None
                try:
                    # use binary mode when moving the file since certain file_managers such as S3
                    # may try to hash the contents
                    with open(executed_notebook_path, "rb") as fd:
                        executed_notebook_file_handle = step_context.resources.file_manager.write(
                            fd, mode="wb", ext="ipynb")
                        executed_notebook_materialization_path = (
                            executed_notebook_file_handle.path_desc)

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix +
                                   [f"{name}_output_notebook"]),
                        description=
                        "Location of output notebook in file manager",
                        metadata_entries=[
                            MetadataEntry.fspath(
                                executed_notebook_materialization_path)
                        ],
                    )

                except Exception:
                    # if file manager writing errors, e.g. file manager is not provided, we throw a warning
                    # and fall back to the previously stored temp executed notebook.
                    step_context.log.warning(
                        "Error when attempting to materialize executed notebook using file manager: "
                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"
                        f"\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"
                        "\nIf you have supplied a file manager and expect to use it for materializing the "
                        'notebook, please include "file_manager" in the `required_resource_keys` argument '
                        f"to `{dagster_factory_name}`")

                if output_notebook is not None:
                    yield Output(executed_notebook_file_handle,
                                 output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name,
                 _) in step_execution_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    # read outputs that were passed out of process via io manager from `yield_result`
                    step_output_handle = StepOutputHandle(
                        step_key=step_execution_context.step.key,
                        output_name=output_name)
                    output_context = step_execution_context.get_output_context(
                        step_output_handle)
                    io_manager = step_execution_context.get_io_manager(
                        step_output_handle)
                    value = io_manager.load_input(
                        build_input_context(upstream_output=output_context))

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event
def test_output_definition_multiple_partition_materialization():

    entry1 = MetadataEntry.int(123, "nrows")
    entry2 = MetadataEntry.float(3.21, "some value")

    partition_entries = [
        MetadataEntry.int(123 * i * i, "partition count") for i in range(3)
    ]

    @solid(output_defs=[
        OutputDefinition(name="output1",
                         asset_key=AssetKey("table1"),
                         asset_partitions=set(["0", "1", "2"]))
    ])
    def solid1(_):
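        # Each PartitionMetadataEntry scopes its wrapped entry to a single
        # partition, so every partition's materialization carries entry1 plus
        # its own "partition count" entry.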
        return Output(
            None,
            "output1",
            metadata_entries=[
                entry1,
                *[
                    PartitionMetadataEntry(str(i), entry)
                    for i, entry in enumerate(partition_entries)
                ],
            ],
        )

    @solid(output_defs=[
        OutputDefinition(name="output2", asset_key=AssetKey("table2"))
    ])
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4

    seen_partitions = set()
    for i in range(3):
        partition = materializations[i].partition
        seen_partitions.add(partition)
        check_materialization(
            materializations[i],
            AssetKey(["table1"]),
            metadata_entries=[entry1, partition_entries[int(partition)]],
        )

    assert len(seen_partitions) == 3

    check_materialization(
        materializations[-1],
        AssetKey(["table2"]),
        metadata_entries=[entry2],
        parent_assets=[n_asset_keys("table1", 3)],
    )
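
# The `n_asset_keys` helper used above is also not shown in this excerpt. Judging
# from its call sites, it presumably builds an AssetLineageInfo covering the first
# n partitions of an asset; a minimal sketch:
def n_asset_keys(path, n):
    return AssetLineageInfo(AssetKey(path), set(str(i) for i in range(n)))
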
Example #12
def test_inter_op_dependency():
    @asset
    def in1():
        pass

    @asset
    def in2():
        pass

    @asset
    def downstream(only_in, mixed, only_out):  # pylint: disable=unused-argument
        pass
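
    # internal_asset_deps declares dependencies among assets produced by the same
    # op, beyond what the op's inputs imply: "mixed" also depends on the sibling
    # output "only_in", and "only_out" depends on both sibling outputs.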

    @multi_asset(
        outs={"only_in": Out(), "mixed": Out(), "only_out": Out()},
        internal_asset_deps={
            "mixed": {AssetKey("in1"), AssetKey("only_in")},
            "only_out": {AssetKey("only_in"), AssetKey("mixed")},
        },
    )
    def assets(in1, in2):  # pylint: disable=unused-argument
        pass

    assets_job = build_assets_job("assets_job", [in1, in2, assets, downstream])

    external_asset_nodes = external_asset_graph_from_defs([assets_job], source_assets_by_key={})
    # sort so that test is deterministic
    sorted_nodes = sorted(
        [
            node._replace(
                dependencies=sorted(node.dependencies, key=lambda d: d.upstream_asset_key),
                depended_by=sorted(node.depended_by, key=lambda d: d.downstream_asset_key),
            )
            for node in external_asset_nodes
        ],
        key=lambda n: n.asset_key,
    )

    assert sorted_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey(["downstream"]),
            dependencies=[
                ExternalAssetDependency(upstream_asset_key=AssetKey(["mixed"]), input_name="mixed"),
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["only_in"]), input_name="only_in"
                ),
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["only_out"]), input_name="only_out"
                ),
            ],
            depended_by=[],
            op_name="downstream",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            metadata_entries=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey(["in1"]),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(downstream_asset_key=AssetKey(["mixed"]), input_name="in1"),
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["only_in"]), input_name="in1"
                ),
            ],
            op_name="in1",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            metadata_entries=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey(["in2"]),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["only_in"]), input_name="in2"
                )
            ],
            op_name="in2",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            metadata_entries=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey(["mixed"]),
            dependencies=[
                ExternalAssetDependency(upstream_asset_key=AssetKey(["in1"]), input_name="in1"),
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["only_in"]), output_name="only_in"
                ),
            ],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["downstream"]), input_name="mixed"
                ),
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["only_out"]), output_name="mixed"
                ),
            ],
            op_name="assets",
            op_description=None,
            job_names=["assets_job"],
            output_name="mixed",
            metadata_entries=[
                MetadataEntry(
                    label=".dagster/asset_deps",
                    description=None,
                    entry_data=MetadataValue.text("[set] (unserializable)"),
                )
            ],
        ),
        ExternalAssetNode(
            asset_key=AssetKey(["only_in"]),
            dependencies=[
                ExternalAssetDependency(upstream_asset_key=AssetKey(["in1"]), input_name="in1"),
                ExternalAssetDependency(upstream_asset_key=AssetKey(["in2"]), input_name="in2"),
            ],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["downstream"]), input_name="only_in"
                ),
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["mixed"]), output_name="only_in"
                ),
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["only_out"]), output_name="only_in"
                ),
            ],
            op_name="assets",
            op_description=None,
            job_names=["assets_job"],
            output_name="only_in",
            metadata_entries=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey(["only_out"]),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["mixed"]), output_name="mixed"
                ),
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["only_in"]), output_name="only_in"
                ),
            ],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["downstream"]), input_name="only_out"
                ),
            ],
            op_name="assets",
            op_description=None,
            job_names=["assets_job"],
            output_name="only_out",
            metadata_entries=[
                MetadataEntry(
                    label=".dagster/asset_deps",
                    description=None,
                    entry_data=MetadataValue.text("[set] (unserializable)"),
                )
            ],
        ),
    ]
Example #13
def fail_solid(_):
    yield Output(
        None,
        metadata_entries=[PartitionMetadataEntry("3", MetadataEntry("x", value=1))],
    )
Example #14
def test_metadata_entry_description():

    with pytest.warns(DeprecationWarning,
                      match=re.escape('"description" attribute')):
        MetadataEntry("foo", "bar", MetadataValue.text("baz"))
Example #15
def test_metadata_entry_construction():
    entry_1 = MetadataEntry("foo", value=MetadataValue.text("bar"))
    entry_2 = MetadataEntry("foo", entry_data=MetadataValue.text("bar"))
    assert entry_1.value == MetadataValue.text("bar")
    assert entry_2.value == MetadataValue.text("bar")
    assert entry_1 == entry_2
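
# The examples in this listing mix two spellings for building entries: keyword
# construction against a MetadataValue, and the older static constructors such as
# MetadataEntry.int. Assuming the usual equivalence, the two forms below compare
# equal (the static form may emit a DeprecationWarning on newer versions):
assert MetadataEntry("nrows", value=MetadataValue.int(123)) == MetadataEntry.int(123, "nrows")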
def test_input_definition_multiple_partition_lineage():

    entry1 = MetadataEntry("nrows", value=123)
    entry2 = MetadataEntry("some value", value=3.21)

    partition_entries = [
        MetadataEntry("partition count", value=123 * i * i) for i in range(3)
    ]

    @solid(
        output_defs=[
            OutputDefinition(
                name="output1",
                asset_key=AssetKey("table1"),
                asset_partitions=set([str(i) for i in range(3)]),
            )
        ],
    )
    def solid1(_):
        return Output(
            None,
            "output1",
            metadata_entries=[
                entry1,
                *[
                    PartitionMetadataEntry(str(i), entry)
                    for i, entry in enumerate(partition_entries)
                ],
            ],
        )

    @solid(
        input_defs=[
            # here, only take 1 of the asset keys specified by the output
            InputDefinition(name="_input1",
                            asset_key=AssetKey("table1"),
                            asset_partitions=set(["0"]))
        ],
        output_defs=[
            OutputDefinition(name="output2",
                             asset_key=lambda _: AssetKey("table2"))
        ],
    )
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4

    seen_partitions = set()
    for i in range(3):
        partition = materializations[i].partition
        seen_partitions.add(partition)
        check_materialization(
            materializations[i],
            AssetKey(["table1"]),
            metadata_entries=[entry1, partition_entries[int(partition)]],
        )

    assert len(seen_partitions) == 3

    check_materialization(
        materializations[-1],
        AssetKey(["table2"]),
        parent_assets=[n_asset_keys("table1", 1)],
        metadata_entries=[entry2],
    )