def test_io_manager_diamond_lineage():
    """Check lineage reported for a diamond-shaped pipeline.

    ``solid_produce`` emits two outputs, each is passed through an aliased
    ``solid_transform`` and both results are fed into ``solid_combine``.
    Every output is handled by an IO manager whose asset key is
    ``[step_key, output_name]``.  The test asserts that five materializations
    are emitted and that the final one (``solid_combine``) lists both
    transform outputs as parent assets.
    """

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            return

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key, context.name])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(
        output_defs=[
            OutputDefinition(name="outputA", io_manager_key="asset_io_manager"),
            OutputDefinition(name="outputB", io_manager_key="asset_io_manager"),
        ]
    )
    def solid_produce(_):
        yield Output(None, "outputA")
        yield Output(None, "outputB")

    @solid(
        output_defs=[
            OutputDefinition(name="outputT", io_manager_key="asset_io_manager"),
        ]
    )
    def solid_transform(_, _input):
        return None

    @solid(
        output_defs=[
            OutputDefinition(name="outputC", io_manager_key="asset_io_manager"),
        ]
    )
    def solid_combine(_, _inputA, _inputB):
        return Output(None, "outputC")

    @pipeline(
        mode_defs=[
            ModeDefinition(resource_defs={"asset_io_manager": my_io_manager}),
        ]
    )
    def my_pipeline():
        produced_a, produced_b = solid_produce()
        transformed_a = solid_transform.alias("a_transform")(produced_a)
        transformed_b = solid_transform.alias("b_transform")(produced_b)
        solid_combine(transformed_a, transformed_b)

    result = execute_pipeline(my_pipeline)
    materializations = [
        step_event
        for step_event in result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    # The producer's two outputs are expected first, in declaration order.
    check_materialization(materializations[0], AssetKey(["solid_produce", "outputA"]))
    check_materialization(materializations[1], AssetKey(["solid_produce", "outputB"]))

    # The combine step is expected last, with both transform outputs as parents.
    combine_event = materializations[-1]
    check_materialization(
        combine_event,
        AssetKey(["solid_combine", "outputC"]),
        parent_assets=[
            AssetLineageInfo(AssetKey(["a_transform", "outputT"])),
            AssetLineageInfo(AssetKey(["b_transform", "outputT"])),
        ],
    )
def test_output_definition_single_partition_materialization():
    """Check materializations driven by ``asset_key`` on output definitions.

    ``solid1`` declares a fixed asset key and ``solid2`` declares one through
    a callable.  The test asserts that each output yields one materialization
    carrying the metadata attached to its ``Output``, and that ``table2``
    reports ``table1`` as its parent asset.
    """
    nrows_entry = MetadataEntry("nrows", value=123)
    value_entry = MetadataEntry("some value", value=3.21)

    @solid(
        output_defs=[
            OutputDefinition(name="output1", asset_key=AssetKey("table1")),
        ]
    )
    def solid1(_):
        return Output(None, "output1", metadata_entries=[nrows_entry])

    @solid(
        output_defs=[
            OutputDefinition(name="output2", asset_key=lambda _: AssetKey("table2")),
        ]
    )
    def solid2(_, _input1):
        yield Output(7, "output2", metadata_entries=[value_entry])

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    materializations = [
        step_event
        for step_event in result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    upstream_event, downstream_event = materializations[0], materializations[1]
    check_materialization(
        upstream_event,
        AssetKey(["table1"]),
        metadata_entries=[nrows_entry],
    )
    check_materialization(
        downstream_event,
        AssetKey(["table2"]),
        metadata_entries=[value_entry],
        parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
    )
def _get_asset_lineage_from_fns(
    context, asset_key_fn, asset_partitions_fn
) -> Optional[AssetLineageInfo]:
    """Build lineage info for ``context`` from a key and a partitions callable.

    Args:
        context: Object passed unchanged to both callables.
        asset_key_fn: Called with ``context``; its result is the asset key.
        asset_partitions_fn: Called with ``context`` to obtain the partitions.
            It is only invoked when the asset key is truthy.

    Returns:
        An ``AssetLineageInfo`` for the resolved key and partitions, or
        ``None`` when ``asset_key_fn`` returns a falsy value.
    """
    resolved_key = asset_key_fn(context)
    if resolved_key:
        resolved_partitions = asset_partitions_fn(context)
        return AssetLineageInfo(
            asset_key=resolved_key,
            partitions=resolved_partitions,
        )
    return None
def test_dynamic_output_definition_single_partition_materialization():
    """Check materializations produced by a dynamic output definition.

    ``solid2`` yields four ``DynamicOutput`` values whose asset key is built
    from the mapping key.  The test asserts one materialization for
    ``table1`` followed by four more, each with a distinct asset key path,
    the metadata attached to the dynamic output, and ``table1`` as parent.
    """
    nrows_entry = EventMetadataEntry.int(123, "nrows")
    value_entry = EventMetadataEntry.float(3.21, "some value")

    @solid(
        output_defs=[
            OutputDefinition(name="output1", asset_key=AssetKey("table1")),
        ]
    )
    def solid1(_):
        return Output(None, "output1", metadata_entries=[nrows_entry])

    @solid(
        output_defs=[
            DynamicOutputDefinition(
                name="output2",
                asset_key=lambda context: AssetKey(context.mapping_key),
            ),
        ]
    )
    def solid2(_, _input1):
        for index in range(4):
            yield DynamicOutput(
                7,
                mapping_key=str(index),
                output_name="output2",
                metadata_entries=[value_entry],
            )

    @solid
    def do_nothing(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    result = execute_pipeline(my_pipeline)
    materializations = [
        step_event
        for step_event in result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    check_materialization(
        materializations[0],
        AssetKey(["table1"]),
        metadata_entries=[nrows_entry],
    )

    # The remaining four events come from the dynamic outputs; collect their
    # key paths so that uniqueness can be asserted afterwards.
    seen_paths = set()
    for dynamic_event in materializations[1:5]:
        key_path = dynamic_event.asset_key.path
        seen_paths.add(tuple(key_path))
        check_materialization(
            dynamic_event,
            AssetKey(key_path),
            metadata_entries=[value_entry],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    assert len(seen_paths) == 4
def test_io_manager_single_partition_materialization():
    """Check materializations driven by an IO manager's asset key.

    The IO manager keys each asset by step key and yields ``nrows`` metadata
    from ``handle_output``.  The test asserts that ``solid1`` materializes
    with that entry only, and that ``solid2`` materializes with the IO
    manager entry followed by the entry attached to its ``Output``, with
    ``solid1`` as its parent asset.
    """
    manager_entry = MetadataEntry.int(123, "nrows")
    output_entry = MetadataEntry.float(3.21, "some value")

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            yield manager_entry

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(output_defs=[OutputDefinition(name="output1")])
    def solid1(_):
        return Output(None, "output1")

    @solid(output_defs=[OutputDefinition(name="output2")])
    def solid2(_, _input1):
        yield Output(7, "output2", metadata_entries=[output_entry])

    @pipeline(
        mode_defs=[
            ModeDefinition(resource_defs={"io_manager": my_io_manager}),
        ]
    )
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    materializations = [
        step_event
        for step_event in result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    upstream_event, downstream_event = materializations[0], materializations[1]
    check_materialization(
        upstream_event,
        AssetKey(["solid1"]),
        metadata_entries=[manager_entry],
    )
    check_materialization(
        downstream_event,
        AssetKey(["solid2"]),
        metadata_entries=[manager_entry, output_entry],
        parent_assets=[AssetLineageInfo(AssetKey(["solid1"]))],
    )
def _dedup_asset_lineage(asset_lineage: List[AssetLineageInfo]) -> List[AssetLineageInfo]:
    """Merge lineage entries that refer to the same asset key.

    Repeated asset/partition pairs arise naturally when transitive
    dependencies are computed for solids that have several outputs and
    several inputs, because every output inherits the dependencies of every
    input.

    Args:
        asset_lineage: Lineage entries, possibly with repeated asset keys.

    Returns:
        One ``AssetLineageInfo`` per distinct asset key, in order of first
        appearance, whose partitions are the union of the partitions of all
        entries with that key.
    """
    partitions_by_key: Dict[AssetKey, Set[str]] = {}
    for info in asset_lineage:
        # setdefault registers the key even when the entry has no partitions,
        # so unpartitioned assets still appear in the result.
        partitions_by_key.setdefault(info.asset_key, set()).update(info.partitions)

    deduplicated = []
    for key, merged_partitions in partitions_by_key.items():
        deduplicated.append(AssetLineageInfo(asset_key=key, partitions=merged_partitions))
    return deduplicated
def test_mixed_asset_definition_lineage():
    """Check lineage when asset keys come from mixed sources.

    ``io_manager_solid`` gets its asset key from the IO manager and
    ``output_def_solid`` from its output definition.  ``combine_solid``
    consumes both and has one output of each kind.  The test asserts four
    materializations and that both ``combine_solid`` outputs list the two
    upstream assets as parents.
    """

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            return

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey(["io_manager_table", context.step_key])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(output_defs=[OutputDefinition(io_manager_key="asset_io_manager")])
    def io_manager_solid(_):
        return 1

    @solid(
        output_defs=[
            OutputDefinition(asset_key=AssetKey(["output_def_table", "output_def_solid"])),
        ]
    )
    def output_def_solid(_):
        return 1

    @solid(
        output_defs=[
            OutputDefinition(
                name="a",
                asset_key=AssetKey(["output_def_table", "combine_solid"]),
            ),
            OutputDefinition(name="b", io_manager_key="asset_io_manager"),
        ]
    )
    def combine_solid(_, _a, _b):
        yield Output(None, "a")
        yield Output(None, "b")

    @pipeline(
        mode_defs=[
            ModeDefinition(resource_defs={"asset_io_manager": my_io_manager}),
        ]
    )
    def my_pipeline():
        from_io_manager = io_manager_solid()
        from_output_def = output_def_solid()
        combine_solid(from_io_manager, from_output_def)

    result = execute_pipeline(my_pipeline)
    materializations = [
        step_event
        for step_event in result.step_event_list
        if step_event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4

    check_materialization(
        materializations[0],
        AssetKey(["io_manager_table", "io_manager_solid"]),
    )
    check_materialization(
        materializations[1],
        AssetKey(["output_def_table", "output_def_solid"]),
    )
    # Output "a" of combine_solid: keyed by its output definition.
    check_materialization(
        materializations[2],
        AssetKey(["output_def_table", "combine_solid"]),
        parent_assets=[
            AssetLineageInfo(AssetKey(["io_manager_table", "io_manager_solid"])),
            AssetLineageInfo(AssetKey(["output_def_table", "output_def_solid"])),
        ],
    )
    # Output "b" of combine_solid: keyed by the IO manager.
    check_materialization(
        materializations[3],
        AssetKey(["io_manager_table", "combine_solid"]),
        parent_assets=[
            AssetLineageInfo(AssetKey(["io_manager_table", "io_manager_solid"])),
            AssetLineageInfo(AssetKey(["output_def_table", "output_def_solid"])),
        ],
    )
def n_asset_keys(path, n):
    """Return lineage info for ``AssetKey(path)`` with ``n`` partitions.

    The partitions are the decimal strings ``"0"`` through ``str(n - 1)``.
    """
    key = AssetKey(path)
    partition_names = {str(index) for index in range(n)}
    return AssetLineageInfo(key, partition_names)