def test_input_context_asset_partitions_time_window():
    """Both the output and input IO-manager contexts expose the daily time
    window corresponding to the run's partition key."""
    partitions_def = DailyPartitionsDefinition(start_date="2021-05-05")
    expected_window = TimeWindow(
        pendulum.parse("2021-06-06"), pendulum.parse("2021-06-07")
    )

    class MyIOManager(IOManager):
        def handle_output(self, context, _obj):
            assert context.asset_partitions_time_window == expected_window

        def load_input(self, context):
            assert context.asset_partitions_time_window == expected_window

    @asset(partitions_def=partitions_def)
    def upstream_asset():
        pass

    @asset(partitions_def=partitions_def)
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[downstream_asset, upstream_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    my_job.execute_in_process(partition_key="2021-06-06")
def test_single_partitioned_asset_job():
    """Materializing one partition of a statically partitioned asset records
    that partition both on the output context and the materialization."""
    partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            assert context.asset_partition_key == "b"

        def load_input(self, context):
            assert False, "shouldn't get here"

    @asset(partitions_def=partitions_def)
    def my_asset():
        pass

    my_job = build_assets_job(
        "my_job",
        assets=[my_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )

    result = my_job.execute_in_process(partition_key="b")
    assert result.asset_materializations_for_node("my_asset") == [
        AssetMaterialization(asset_key=AssetKey(["my_asset"]), partition="b")
    ]
def test_bad_version_str(graph_for_test, strategy):
    """An invalid version string produced by the version strategy raises
    DagsterInvariantViolationError at plan-creation time."""

    @resource
    def my_resource():
        pass

    @root_input_manager
    def my_manager():
        pass

    with instance_for_test() as instance:
        my_job = graph_for_test.to_job(
            version_strategy=strategy,
            resource_defs={
                "io_manager": IOManagerDefinition.hardcoded_io_manager(
                    VersionedInMemoryIOManager()
                ),
                "my_key": my_manager,
                "foo": my_resource,
            },
        )

        # NOTE(review): `bad_str` is not a parameter of this test --
        # presumably supplied by a pytest parametrize/fixture defined
        # elsewhere in the file; confirm.
        with pytest.raises(
            DagsterInvariantViolationError,
            match=f"'{bad_str}' is not a valid version string.",
        ):
            create_execution_plan(my_job, instance_ref=instance.get_ref())
def test_version_strategy_no_resource_version():
    """A strategy that versions solids but not resources still memoizes:
    after one execution the next plan has no steps left to run."""

    @solid(required_resource_keys={"foo"})
    def my_solid(context):
        return context.resources.foo

    @resource
    def foo_resource():
        return "bar"

    class MyVersionStrategy(VersionStrategy):
        def get_solid_version(self, _):
            return "foo"

    @pipeline(
        version_strategy=MyVersionStrategy(),
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(
                        VersionedInMemoryIOManager()
                    ),
                    "foo": foo_resource,
                }
            )
        ],
    )
    def my_pipeline():
        my_solid()

    with instance_for_test() as instance:
        execute_pipeline(my_pipeline, instance=instance)

        memoized_plan = create_execution_plan(
            my_pipeline, instance_ref=instance.get_ref()
        )
        assert len(memoized_plan.step_keys_to_execute) == 0
def test_memoized_plan_disable_memoization():
    """Passing MEMOIZED_RUN_TAG="false" at plan time overrides the
    pipeline-level "true" tag and disables memoization."""

    @solid(version="hello")
    def my_solid():
        return 5

    mgr = VersionedInMemoryIOManager()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(mgr),
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def my_pipeline():
        my_solid()

    with instance_for_test() as instance:
        unmemoized_plan = create_execution_plan(
            my_pipeline, instance_ref=instance.get_ref()
        )
        assert len(unmemoized_plan.step_keys_to_execute) == 1

        # Pretend the step already ran by storing a value under its version.
        handle = StepOutputHandle("my_solid", "result")
        version = unmemoized_plan.get_version_for_step_output_handle(handle)
        mgr.values[(handle.step_key, handle.output_name, version)] = 5

        memoized_plan = create_execution_plan(
            my_pipeline, instance_ref=instance.get_ref()
        )
        assert len(memoized_plan.step_keys_to_execute) == 0

        unmemoized_again = create_execution_plan(
            my_pipeline,
            instance_ref=instance.get_ref(),
            tags={MEMOIZED_RUN_TAG: "false"},
        )
        assert len(unmemoized_again.step_keys_to_execute) == 1
def test_memoized_plan_root_input_manager():
    """A versioned root input manager yields a non-None version for the
    consuming solid's step output."""

    @root_input_manager(version="foo")
    def my_input_manager():
        return 5

    @solid(
        input_defs=[InputDefinition("x", root_manager_key="my_input_manager")],
        version="foo",
    )
    def my_solid_takes_input(x):
        return x

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(
                        VersionedInMemoryIOManager()
                    ),
                    "my_input_manager": my_input_manager,
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def my_pipeline():
        my_solid_takes_input()

    with instance_for_test() as instance:
        plan = create_execution_plan(my_pipeline, instance_ref=instance.get_ref())
        handle = StepOutputHandle("my_solid_takes_input", "result")
        assert plan.get_version_for_step_output_handle(handle) is not None
def test_memoized_plan_custom_io_manager_key():
    """Memoization also works when the output is routed to a non-default
    IO manager resource key."""
    manager = VersionedInMemoryIOManager()
    mgr_def = IOManagerDefinition.hardcoded_io_manager(manager)

    @solid(version="39", output_defs=[OutputDefinition(io_manager_key="my_key")])
    def solid_requires_io_manager():
        return Output(5)

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"my_key": mgr_def},
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def io_mgr_pipeline():
        solid_requires_io_manager()

    with instance_for_test() as instance:
        unmemoized_plan = create_execution_plan(
            io_mgr_pipeline, instance_ref=instance.get_ref()
        )
        assert unmemoized_plan.step_keys_to_execute == ["solid_requires_io_manager"]

        # Seed the IO manager with a stored value for the computed version.
        handle = StepOutputHandle("solid_requires_io_manager", "result")
        version = unmemoized_plan.get_version_for_step_output_handle(handle)
        manager.values[(handle.step_key, handle.output_name, version)] = 5

        memoized_plan = create_execution_plan(
            io_mgr_pipeline, instance_ref=instance.get_ref()
        )
        assert len(memoized_plan.step_keys_to_execute) == 0
def test_unmemoized_inner_solid():
    """An unversioned solid nested inside a composite fails plan creation
    when memoization is enabled."""

    @solid
    def solid_no_version():
        pass

    @composite_solid
    def wrap():
        return solid_no_version()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                name="fakemode",
                resource_defs={
                    "fake": IOManagerDefinition.hardcoded_io_manager(
                        VersionedInMemoryIOManager()
                    ),
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def wrap_pipeline():
        wrap()

    expected_message = (
        "While using memoization, version for solid 'solid_no_version' was None. Please "
        "either provide a versioning strategy for your job, or provide a version using the "
        "solid decorator."
    )
    with instance_for_test() as instance:
        with pytest.raises(DagsterInvariantViolationError, match=expected_message):
            create_execution_plan(wrap_pipeline, instance_ref=instance.get_ref())
def test_source_asset_partitions():
    """Loading an hourly-partitioned source asset from a daily-partitioned
    asset maps the daily key to the full range of hourly partition keys."""
    hourly_asset = SourceAsset(
        AssetKey("hourly_asset"),
        partitions_def=HourlyPartitionsDefinition(start_date="2021-05-05-00:00"),
    )

    @asset(partitions_def=DailyPartitionsDefinition(start_date="2021-05-05"))
    def daily_asset(hourly_asset):
        assert hourly_asset is None

    class CustomIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            key_range = context.asset_partition_key_range
            assert key_range.start == "2021-06-06-00:00"
            assert key_range.end == "2021-06-06-23:00"

    daily_job = build_assets_job(
        name="daily_job",
        assets=[daily_asset],
        source_assets=[hourly_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(CustomIOManager())
        },
    )
    assert daily_job.execute_in_process(partition_key="2021-06-06").success
def test_memoized_plan_default_input_val():
    """A memoized plan can be built when a solid input is satisfied by a
    default value rather than run config."""

    @solid(
        version="42",
        input_defs=[InputDefinition("_my_input", String, default_value="DEFAULTVAL")],
    )
    def solid_default_input(_my_input):
        pass

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(
                        VersionedInMemoryIOManager()
                    ),
                },
            )
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def pipeline_default_value():
        solid_default_input()

    # Ensure that we can build a valid plan with a default input value.
    with instance_for_test() as instance:
        plan = create_execution_plan(
            pipeline_default_value, instance_ref=instance.get_ref()
        )
        assert plan.step_keys_to_execute == ["solid_default_input"]
def test_memoized_plan_root_input_manager_input_config():
    """Changing the root input manager's input config changes the computed
    step-output version."""

    @root_input_manager(version="foo", input_config_schema={"my_str": str})
    def my_input_manager():
        return 5

    @solid(
        input_defs=[InputDefinition("x", root_manager_key="my_input_manager")],
        version="foo",
    )
    def my_solid_takes_input(x):
        return x

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(
                        VersionedInMemoryIOManager()
                    ),
                    "my_input_manager": my_input_manager,
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def my_pipeline():
        my_solid_takes_input()

    input_config = {"my_str": "foo"}
    run_config = {"solids": {"my_solid_takes_input": {"inputs": {"x": input_config}}}}

    with instance_for_test() as instance:
        handle = StepOutputHandle("my_solid_takes_input", "result")

        plan = create_execution_plan(
            my_pipeline,
            instance_ref=instance.get_ref(),
            run_config=run_config,
        )
        output_version = plan.get_version_for_step_output_handle(handle)
        assert output_version is not None

        input_config["my_str"] = "bar"
        plan = create_execution_plan(
            my_pipeline,
            instance_ref=instance.get_ref(),
            run_config=run_config,
        )
        new_output_version = plan.get_version_for_step_output_handle(handle)

        # Ensure that after changing input config, the version changes.
        assert not new_output_version == output_version
def test_partition_key():
    """Both load_input and handle_output contexts report the run's
    partition key when the job is partitioned."""

    @op
    def my_op():
        pass

    @op
    def my_op2(_input1):
        pass

    class MyIOManager(IOManager):
        def load_input(self, context):
            assert context.has_partition_key
            assert context.partition_key == "2020-01-01"

        def handle_output(self, context, _obj):
            assert context.has_partition_key
            assert context.partition_key == "2020-01-01"

    @job(
        partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"),
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    def my_job():
        my_op2(my_op())

    assert my_job.execute_in_process(partition_key="2020-01-01").success
def test_asset_key():
    """Input/output contexts carry the asset keys declared on the ops'
    In/Out definitions, including the upstream output's key."""
    in_asset_key = AssetKey(["a", "b"])
    out_asset_key = AssetKey(["c", "d"])

    @op(out=Out(asset_key=out_asset_key))
    def before():
        pass

    @op(ins={"a": In(asset_key=in_asset_key)}, out={})
    def after(a):
        assert a

    class MyIOManager(IOManager):
        def load_input(self, context):
            assert context.asset_key == in_asset_key
            assert context.upstream_output.asset_key == out_asset_key
            return 1

        def handle_output(self, context, obj):
            assert context.asset_key == out_asset_key

    @graph
    def my_graph():
        after(before())

    job_def = my_graph.to_job(
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        }
    )
    assert job_def.execute_in_process().success
def run_test_with_builtin_type(type_to_test, type_values):
    """Exercise memoization with a builtin-typed external input: the plan is
    memoized after seeding the stored value, and invalidated again once the
    input value changes."""
    first_type_val, second_type_val = type_values
    manager = VersionedInMemoryIOManager()

    @solid(version="42", input_defs=[InputDefinition("_builtin_type", type_to_test)])
    def solid_ext_input(_builtin_type):
        pass

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(manager),
                },
            )
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def my_pipeline():
        # `versioned_solid_takes_input` is defined elsewhere in this module.
        versioned_solid_takes_input(solid_ext_input())

    input_config = {"_builtin_type": first_type_val}
    run_config = {"solids": {"solid_ext_input": {"inputs": input_config}}}

    with instance_for_test() as instance:
        unmemoized_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert len(unmemoized_plan.step_keys_to_execute) == 2

        handle = StepOutputHandle("solid_ext_input", "result")
        version = unmemoized_plan.get_version_for_step_output_handle(handle)
        manager.values[handle.step_key, handle.output_name, version] = 5

        memoized_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert memoized_plan.step_keys_to_execute == ["versioned_solid_takes_input"]

        # A different input value invalidates the memoized result.
        input_config["_builtin_type"] = second_type_val
        changed_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert len(changed_plan.step_keys_to_execute) == 2
def test_memoized_plan_affected_by_resource_config():
    """Resource config participates in versioning: changing it forces a
    previously memoized step to re-execute."""

    @solid(required_resource_keys={"my_resource"}, version="39")
    def solid_reqs_resource():
        pass

    @resource(version="42", config_schema={"foo": str})
    def basic():
        pass

    manager = VersionedInMemoryIOManager()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "my_resource": basic,
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(manager),
                },
            )
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def my_pipeline():
        solid_reqs_resource()

    with instance_for_test() as instance:
        my_resource_config = {"foo": "bar"}
        run_config = {"resources": {"my_resource": {"config": my_resource_config}}}

        unmemoized_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert unmemoized_plan.step_keys_to_execute == ["solid_reqs_resource"]

        handle = StepOutputHandle("solid_reqs_resource", "result")
        version = unmemoized_plan.get_version_for_step_output_handle(handle)
        manager.values[handle.step_key, handle.output_name, version] = 5

        memoized_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert len(memoized_plan.step_keys_to_execute) == 0

        my_resource_config["foo"] = "baz"
        changed_config_plan = create_execution_plan(
            my_pipeline, run_config=run_config, instance_ref=instance.get_ref()
        )
        assert changed_config_plan.step_keys_to_execute == ["solid_reqs_resource"]
def versioned_pipeline_factory(manager=None):
    """Build a memoized pipeline wired to a versioned in-memory IO manager.

    Args:
        manager: Optional VersionedInMemoryIOManager to back the pipeline's
            ``io_manager`` resource. When omitted, a fresh manager is created
            per call.

    Returns:
        The decorated ``versioned_pipeline`` pipeline definition.

    BUG FIX: the previous signature used a mutable default argument
    (``manager=VersionedInMemoryIOManager()``), so every no-arg call shared a
    single stateful IO manager — stored values leaked between otherwise
    independent pipelines/tests. A fresh instance is now created per call.
    """
    if manager is None:
        manager = VersionedInMemoryIOManager()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                name="main",
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(manager)
                },
            )
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def versioned_pipeline():
        versioned_solid_takes_input(versioned_solid_no_input())

    return versioned_pipeline
def test_memoized_inner_solid():
    """A versioned solid inside a composite is memoizable; its step key is
    prefixed with the composite's name."""

    @solid(version="versioned")
    def solid_versioned():
        pass

    @composite_solid
    def wrap():
        return solid_versioned()

    mgr = VersionedInMemoryIOManager()

    @pipeline(
        mode_defs=[
            ModeDefinition(
                name="fakemode",
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(mgr),
                },
            ),
        ],
        tags={MEMOIZED_RUN_TAG: "true"},
    )
    def wrap_pipeline():
        wrap()

    with instance_for_test() as instance:
        unmemoized_plan = create_execution_plan(
            wrap_pipeline, instance_ref=instance.get_ref()
        )
        handle = StepOutputHandle("wrap.solid_versioned", "result")
        assert unmemoized_plan.step_keys_to_execute == [handle.step_key]

        # Affix value to expected version for step output.
        version = unmemoized_plan.get_version_for_step_output_handle(handle)
        mgr.values[(handle.step_key, handle.output_name, version)] = 4

        memoized_plan = unmemoized_plan.build_memoized_plan(
            wrap_pipeline,
            ResolvedRunConfig.build(wrap_pipeline),
            instance=None,
            selected_step_keys=None,
        )
        assert len(memoized_plan.step_keys_to_execute) == 0
def test_hardcoded_io_manager():
    """IOManagerDefinition.hardcoded_io_manager wraps a pre-built IO manager
    instance and can serve as the default ``io_manager`` resource."""

    @solid
    def basic_solid(_):
        return 5

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "io_manager": IOManagerDefinition.hardcoded_io_manager(
                        InMemoryIOManager()
                    )
                }
            )
        ]
    )
    def basic_pipeline():
        basic_solid()

    result = execute_pipeline(basic_pipeline)
    assert result.success
    assert result.output_for_solid("basic_solid") == 5
def test_access_partition_keys_from_context_only_one_asset_partitioned():
    """When only the upstream asset is partitioned, downstream output
    contexts report no asset partitions and accessing the key range raises."""
    upstream_partitions_def = StaticPartitionsDefinition(["a", "b", "c"])

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            if context.op_def.name == "upstream_asset":
                assert context.asset_partition_key == "b"
            elif context.op_def.name in ["downstream_asset", "double_downstream_asset"]:
                assert not context.has_asset_partitions
                with pytest.raises(Exception):  # TODO: better error message
                    assert context.asset_partition_key_range
            else:
                assert False

        def load_input(self, context):
            assert not context.has_asset_partitions

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset(context):
        assert context.output_asset_partition_key() == "b"

    @asset
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    @asset
    def double_downstream_asset(downstream_asset):
        assert downstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset, double_downstream_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )

    result = my_job.execute_in_process(partition_key="b")
    assert result.asset_materializations_for_node("upstream_asset") == [
        AssetMaterialization(asset_key=AssetKey(["upstream_asset"]), partition="b")
    ]
def test_output_context_asset_partitions_time_window():
    """The output context exposes the daily time window matching the run's
    partition key; load_input is never reached."""

    class MyIOManager(IOManager):
        def handle_output(self, context, _obj):
            assert context.asset_partitions_time_window == TimeWindow(
                pendulum.parse("2021-06-06"), pendulum.parse("2021-06-07")
            )

        def load_input(self, context):
            raise NotImplementedError()

    @asset(partitions_def=DailyPartitionsDefinition(start_date="2021-05-05"))
    def my_asset():
        pass

    my_job = build_assets_job(
        "my_job",
        assets=[my_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    my_job.execute_in_process(partition_key="2021-06-06")
while True: time.sleep(0.1) @asset def never_runs_asset(hanging_asset): # pylint: disable=redefined-outer-name,unused-argument pass hanging_job = build_assets_job( name="hanging_job", source_assets=[dummy_source_asset], assets=[first_asset, hanging_asset, never_runs_asset], resource_defs={ "io_manager": IOManagerDefinition.hardcoded_io_manager(DummyIOManager()), "hanging_asset_resource": hanging_asset_resource, }, ) @asset def asset_one(): return 1 @asset def asset_two(asset_one): # pylint: disable=redefined-outer-name,unused-argument return first_asset + 1
def test_asset_partitions_time_window_non_identity_partition_mapping():
    """With a trailing-window partition mapping, the input context's time
    window spans both mapped upstream daily partitions."""
    upstream_partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")
    downstream_partitions_def = DailyPartitionsDefinition(start_date="2020-01-01")

    class TrailingWindowPartitionMapping(PartitionMapping):
        """Maps each downstream partition to two upstream partitions:
        itself and the preceding partition."""

        def get_upstream_partitions_for_partition_range(
            self,
            downstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            del downstream_partitions_def, upstream_partitions_def
            start, end = downstream_partition_key_range
            assert start == "2020-01-02"
            assert end == "2020-01-02"
            return PartitionKeyRange("2020-01-01", "2020-01-02")

        def get_downstream_partitions_for_partition_range(
            self,
            upstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            raise NotImplementedError()

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            assert context.asset_partitions_time_window == TimeWindow(
                pendulum.parse("2020-01-02"), pendulum.parse("2020-01-03")
            )

        def load_input(self, context):
            # Input window covers both mapped upstream partitions.
            assert context.asset_partitions_time_window == TimeWindow(
                pendulum.parse("2020-01-01"), pendulum.parse("2020-01-03")
            )

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset():
        pass

    @asset(
        partitions_def=downstream_partitions_def,
        partition_mappings={"upstream_asset": TrailingWindowPartitionMapping()},
    )
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    my_job.execute_in_process(partition_key="2020-01-02")
def test_access_partition_keys_from_context_non_identity_partition_mapping():
    """With a trailing-window partition mapping, the input context's key
    range covers both mapped upstream partitions, not just the run's key."""
    upstream_partitions_def = StaticPartitionsDefinition(["1", "2", "3"])
    downstream_partitions_def = StaticPartitionsDefinition(["1", "2", "3"])

    class TrailingWindowPartitionMapping(PartitionMapping):
        """Maps each downstream partition to two upstream partitions:
        itself and the preceding partition."""

        def get_upstream_partitions_for_partition_range(
            self,
            downstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            assert downstream_partitions_def
            assert upstream_partitions_def
            start, end = downstream_partition_key_range
            return PartitionKeyRange(str(max(1, int(start) - 1)), end)

        def get_downstream_partitions_for_partition_range(
            self,
            upstream_partition_key_range: PartitionKeyRange,
            downstream_partitions_def: PartitionsDefinition,
            upstream_partitions_def: PartitionsDefinition,
        ) -> PartitionKeyRange:
            raise NotImplementedError()

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            assert context.asset_partition_key == "2"

        def load_input(self, context):
            start, end = context.asset_partition_key_range
            # BUG FIX: previously `assert start, end == ("1", "2")`, which
            # only asserted the truthiness of `start` and used the comparison
            # as the assertion message. Compare the full range instead.
            assert (start, end) == ("1", "2")

    @asset(partitions_def=upstream_partitions_def)
    def upstream_asset(context):
        assert context.output_asset_partition_key() == "2"

    @asset(
        partitions_def=downstream_partitions_def,
        partition_mappings={"upstream_asset": TrailingWindowPartitionMapping()},
    )
    def downstream_asset(context, upstream_asset):
        assert context.output_asset_partition_key() == "2"
        assert upstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[upstream_asset, downstream_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )

    result = my_job.execute_in_process(partition_key="2")
    assert result.asset_materializations_for_node("upstream_asset") == [
        AssetMaterialization(AssetKey(["upstream_asset"]), partition="2")
    ]
    assert result.asset_materializations_for_node("downstream_asset") == [
        AssetMaterialization(AssetKey(["downstream_asset"]), partition="2")
    ]
if context.dagster_type.typing_type == PandasDF: fs_path = os.path.abspath(self._get_fs_path(context.asset_key)) paths = glob.glob(os.path.join(fs_path, "*.csv")) check.invariant(len(paths) > 0, f"No csv files found under {fs_path}") return pd.concat(map(pd.read_csv, paths)) elif context.dagster_type.typing_type == SparkDF: return ( SparkSession.builder.getOrCreate() .read.format("csv") .options(header="true") .load(self._get_fs_path(context.asset_key)) ) else: raise ValueError("Unexpected input type") # io_manager_end # build_assets_job_start spark_weather_job = build_assets_job( "spark_weather", assets=[daily_temperature_highs, hottest_dates, daily_temperature_high_diffs], source_assets=[sfo_q2_weather_sample], resource_defs={ "io_manager": IOManagerDefinition.hardcoded_io_manager(LocalFileSystemIOManager()) }, ) # build_assets_job_end