Example #1
0
def test_io_manager_diamond_lineage():
    """Check lineage through a diamond: one producer, two aliased transforms, one combine.

    Every output is routed through an IO manager that builds its asset key from
    ``[context.step_key, context.name]``. Five materializations are expected, and the last
    one (``solid_combine``) must list both transform outputs as parent assets.
    """
    io_key = "asset_io_manager"

    class StepOutputKeyedIOManager(IOManager):
        def handle_output(self, context, obj):
            # Nothing is persisted; only the emitted events are inspected below.
            return None

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key, context.name])

    @io_manager
    def my_io_manager(_):
        return StepOutputKeyedIOManager()

    @solid(
        output_defs=[
            OutputDefinition(name=output_name, io_manager_key=io_key)
            for output_name in ("outputA", "outputB")
        ]
    )
    def solid_produce(_):
        yield Output(None, "outputA")
        yield Output(None, "outputB")

    @solid(output_defs=[OutputDefinition(name="outputT", io_manager_key=io_key)])
    def solid_transform(_, _input):
        return None

    @solid(output_defs=[OutputDefinition(name="outputC", io_manager_key=io_key)])
    def solid_combine(_, _inputA, _inputB):
        return Output(None, "outputC")

    @pipeline(mode_defs=[ModeDefinition(resource_defs={io_key: my_io_manager})])
    def my_pipeline():
        produced_a, produced_b = solid_produce()
        transformed_a = solid_transform.alias("a_transform")(produced_a)
        transformed_b = solid_transform.alias("b_transform")(produced_b)
        solid_combine(transformed_a, transformed_b)

    run_result = execute_pipeline(my_pipeline)

    # Keep only the asset materialization events, preserving emission order.
    materialization_events = []
    for step_event in run_result.step_event_list:
        if step_event.event_type_value == "ASSET_MATERIALIZATION":
            materialization_events.append(step_event)
    assert len(materialization_events) == 5

    check_materialization(materialization_events[0], AssetKey(["solid_produce", "outputA"]))
    check_materialization(materialization_events[1], AssetKey(["solid_produce", "outputB"]))

    expected_parents = [
        AssetLineageInfo(AssetKey(["a_transform", "outputT"])),
        AssetLineageInfo(AssetKey(["b_transform", "outputT"])),
    ]
    check_materialization(
        materialization_events[-1],
        AssetKey(["solid_combine", "outputC"]),
        parent_assets=expected_parents,
    )
Example #2
0
def test_output_definition_single_partition_materialization():
    """Asset keys declared on OutputDefinitions produce materializations with lineage.

    ``solid1`` declares its asset key as an ``AssetKey`` and ``solid2`` as a callable.
    ``solid2`` consumes ``solid1``'s output, so its materialization is expected to list
    ``table1`` as a parent asset and to carry the metadata attached to its own Output.
    """
    nrows_entry = MetadataEntry("nrows", value=123)
    some_value_entry = MetadataEntry("some value", value=3.21)

    # Asset key supplied directly.
    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[nrows_entry])

    # Asset key supplied as a callable.
    @solid(output_defs=[OutputDefinition(name="output2", asset_key=lambda _: AssetKey("table2"))])
    def solid2(_, _input1):
        yield Output(7, "output2", metadata_entries=[some_value_entry])

    @pipeline
    def my_pipeline():
        solid2(solid1())

    run_result = execute_pipeline(my_pipeline)
    materialization_events = list(
        filter(
            lambda step_event: step_event.event_type_value == "ASSET_MATERIALIZATION",
            run_result.step_event_list,
        )
    )
    assert len(materialization_events) == 2

    upstream_event, downstream_event = materialization_events
    check_materialization(upstream_event, AssetKey(["table1"]), metadata_entries=[nrows_entry])
    check_materialization(
        downstream_event,
        AssetKey(["table2"]),
        metadata_entries=[some_value_entry],
        parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
    )
Example #3
0
def _get_asset_lineage_from_fns(
        context, asset_key_fn,
        asset_partitions_fn) -> Optional[AssetLineageInfo]:
    """Build lineage info for ``context`` from a key callable and a partitions callable.

    Args:
        context: Value passed unchanged to both callables.
        asset_key_fn: Called with ``context``; a falsy result means no asset is attached.
        asset_partitions_fn: Called with ``context`` only when an asset key was produced.

    Returns:
        An ``AssetLineageInfo`` holding the key and its partitions, or ``None`` when
        ``asset_key_fn`` returned a falsy value.
    """
    key = asset_key_fn(context)
    if key:
        return AssetLineageInfo(asset_key=key, partitions=asset_partitions_fn(context))
    return None
Example #4
0
def test_dynamic_output_definition_single_partition_materialization():
    """Dynamic outputs keyed by mapping key each produce their own materialization.

    ``solid2`` yields four dynamic outputs whose asset key is derived from the mapping key.
    Together with ``solid1``'s single asset, five materializations are expected; the four
    dynamic ones must have distinct asset paths and list ``table1`` as a parent asset.
    """
    nrows_entry = EventMetadataEntry.int(123, "nrows")
    some_value_entry = EventMetadataEntry.float(3.21, "some value")

    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[nrows_entry])

    @solid(
        output_defs=[
            DynamicOutputDefinition(
                name="output2",
                asset_key=lambda context: AssetKey(context.mapping_key),
            )
        ]
    )
    def solid2(_, _input1):
        # Mapping keys are "0" through "3".
        for mapping_key in map(str, range(4)):
            yield DynamicOutput(
                7,
                mapping_key=mapping_key,
                output_name="output2",
                metadata_entries=[some_value_entry],
            )

    @solid
    def do_nothing(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    run_result = execute_pipeline(my_pipeline)
    materialization_events = [
        step_event
        for step_event in run_result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materialization_events) == 5

    static_event, *dynamic_events = materialization_events
    check_materialization(static_event, AssetKey(["table1"]), metadata_entries=[nrows_entry])

    distinct_paths = set()
    for dynamic_event in dynamic_events:
        asset_path = dynamic_event.asset_key.path
        distinct_paths.add(tuple(asset_path))
        check_materialization(
            dynamic_event,
            AssetKey(asset_path),
            metadata_entries=[some_value_entry],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    # Each dynamic output must have landed on its own asset path.
    assert len(distinct_paths) == 4
def test_io_manager_single_partition_materialization():
    """An IO manager that yields metadata from ``handle_output`` contributes to materializations.

    The default ``io_manager`` resource is replaced by one keyed on the step key. The second
    materialization is expected to carry the IO manager's entry followed by the entry attached
    to ``solid2``'s Output, and to list ``solid1`` as a parent asset.
    """
    nrows_entry = MetadataEntry.int(123, "nrows")
    some_value_entry = MetadataEntry.float(3.21, "some value")

    class StepKeyedIOManager(IOManager):
        def handle_output(self, context, obj):
            # Nothing is stored; the yielded entry is reported as metadata.
            yield nrows_entry

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key])

    @io_manager
    def my_io_manager(_):
        return StepKeyedIOManager()

    @solid(output_defs=[OutputDefinition(name="output1")])
    def solid1(_):
        return Output(None, "output1")

    @solid(output_defs=[OutputDefinition(name="output2")])
    def solid2(_, _input1):
        yield Output(7, "output2", metadata_entries=[some_value_entry])

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": my_io_manager})])
    def my_pipeline():
        solid2(solid1())

    run_result = execute_pipeline(my_pipeline)
    materialization_events = []
    for step_event in run_result.step_event_list:
        if step_event.event_type_value == "ASSET_MATERIALIZATION":
            materialization_events.append(step_event)
    assert len(materialization_events) == 2

    upstream_event, downstream_event = materialization_events
    check_materialization(upstream_event, AssetKey(["solid1"]), metadata_entries=[nrows_entry])
    check_materialization(
        downstream_event,
        AssetKey(["solid2"]),
        metadata_entries=[nrows_entry, some_value_entry],
        parent_assets=[AssetLineageInfo(AssetKey(["solid1"]))],
    )
Example #6
0
def _dedup_asset_lineage(asset_lineage: List[AssetLineageInfo]) -> List[AssetLineageInfo]:
    """Collapse repeated Asset/Partition pairs in a list of lineage entries.

    Repeats arise naturally when transitive dependencies are computed for solids that have
    several Outputs and several Inputs, because every Output inherits every dependency of
    every Input. Entries sharing an asset key are merged into one entry whose partitions are
    the union of the partitions seen for that key; keys keep their first-seen order.
    """
    partitions_by_key: Dict[AssetKey, Set[str]] = defaultdict(set)

    for info in asset_lineage:
        # Indexing the defaultdict registers the key even when there is nothing to merge,
        # so assets without partitions still appear in the result.
        partitions_by_key[info.asset_key].update(info.partitions)

    deduplicated = []
    for key, merged_partitions in partitions_by_key.items():
        deduplicated.append(AssetLineageInfo(asset_key=key, partitions=merged_partitions))
    return deduplicated
Example #7
0
def test_mixed_asset_definition_lineage():
    """Lineage is tracked across assets declared via IO manager and via OutputDefinition.

    ``combine_solid`` consumes one IO-manager-keyed asset and one OutputDefinition-keyed
    asset, and emits one output of each kind. Both of its materializations are expected to
    list the two upstream assets as parents.
    """
    io_key = "asset_io_manager"

    class TableIOManager(IOManager):
        def handle_output(self, context, obj):
            # Nothing is persisted; only the emitted events are inspected below.
            return None

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey(["io_manager_table", context.step_key])

    @io_manager
    def my_io_manager(_):
        return TableIOManager()

    @solid(output_defs=[OutputDefinition(io_manager_key=io_key)])
    def io_manager_solid(_):
        return 1

    @solid(
        output_defs=[OutputDefinition(asset_key=AssetKey(["output_def_table", "output_def_solid"]))]
    )
    def output_def_solid(_):
        return 1

    @solid(
        output_defs=[
            OutputDefinition(name="a", asset_key=AssetKey(["output_def_table", "combine_solid"])),
            OutputDefinition(name="b", io_manager_key=io_key),
        ]
    )
    def combine_solid(_, _a, _b):
        yield Output(None, "a")
        yield Output(None, "b")

    @pipeline(mode_defs=[ModeDefinition(resource_defs={io_key: my_io_manager})])
    def my_pipeline():
        from_io_manager = io_manager_solid()
        from_output_def = output_def_solid()
        combine_solid(from_io_manager, from_output_def)

    run_result = execute_pipeline(my_pipeline)
    materialization_events = list(
        filter(
            lambda step_event: step_event.event_type_value == "ASSET_MATERIALIZATION",
            run_result.step_event_list,
        )
    )
    assert len(materialization_events) == 4

    def upstream_lineage():
        # A fresh list per call so the two checks below never share one object.
        return [
            AssetLineageInfo(AssetKey(["io_manager_table", "io_manager_solid"])),
            AssetLineageInfo(AssetKey(["output_def_table", "output_def_solid"])),
        ]

    check_materialization(
        materialization_events[0], AssetKey(["io_manager_table", "io_manager_solid"])
    )
    check_materialization(
        materialization_events[1], AssetKey(["output_def_table", "output_def_solid"])
    )

    # Output "a" is keyed by its OutputDefinition, output "b" by the IO manager.
    combine_tables = ("output_def_table", "io_manager_table")
    for combine_event, table_name in zip(materialization_events[2:], combine_tables):
        check_materialization(
            combine_event,
            AssetKey([table_name, "combine_solid"]),
            parent_assets=upstream_lineage(),
        )
Example #8
0
def n_asset_keys(path, n):
    """Return one ``AssetLineageInfo`` for ``AssetKey(path)`` with ``n`` string-numbered entries.

    Despite the name, a single lineage entry is returned. ``n`` sets the size of the set
    passed as the second positional argument (presumably ``partitions`` -- confirm against
    the ``AssetLineageInfo`` definition), which contains ``"0"`` through ``str(n - 1)``.

    Args:
        path: Value forwarded to ``AssetKey``.
        n: Number of consecutive integers, starting at 0, to stringify into the set.
    """
    # A set comprehension avoids building a throwaway list first.
    return AssetLineageInfo(AssetKey(path), {str(i) for i in range(n)})