def test_materialize_with_selection():
    """Materializing with ``selection="*follows_o2"`` runs the upstream chain
    needed for ``follows_o2`` and leaves their outputs retrievable.

    NOTE(review): ``follows_o1`` is defined but not asserted on — presumably
    it is excluded by the selection; confirm.
    """

    @asset
    def start_asset():
        return "foo"

    @multi_asset(outs={"o1": Out(asset_key=AssetKey("o1")), "o2": Out(asset_key=AssetKey("o2"))})
    def middle_asset(start_asset):
        return (start_asset, start_asset)

    @asset
    def follows_o1(o1):
        return o1

    @asset
    def follows_o2(o2):
        return o2

    _, io_manager_def = asset_aware_io_manager()
    group = AssetGroup(
        [start_asset, middle_asset, follows_o1, follows_o2],
        resource_defs={"io_manager": io_manager_def},
    )

    # "*follows_o2" selects follows_o2 plus all of its upstream dependencies.
    result = group.materialize(selection="*follows_o2")

    assert result.success
    assert result.output_for_node("middle_asset", "o1") == "foo"
    assert result.output_for_node("follows_o2") == "foo"
    assert result.output_for_node("start_asset") == "foo"
def test_multi_asset_internal_asset_deps_metadata():
    """``internal_asset_deps`` declared on ``@multi_asset`` are folded into the
    corresponding output def's metadata under ``ASSET_DEPENDENCY_METADATA_KEY``,
    merged with any user-supplied metadata; outputs with no declared internal
    deps keep only their own metadata."""

    @multi_asset(
        outs={
            "my_out_name": Out(metadata={"foo": "bar"}),
            "my_other_out_name": Out(metadata={"bar": "foo"}),
        },
        internal_asset_deps={
            "my_out_name": {AssetKey("my_other_out_name"), AssetKey("my_in_name")}
        },
    )
    def my_asset(my_in_name):  # pylint: disable=unused-argument
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    # Asset keys are inferred from output names (no explicit asset_key given).
    assert my_asset.asset_keys == {
        AssetKey("my_out_name"),
        AssetKey("my_other_out_name")
    }
    # Declared internal deps appear alongside the user metadata...
    assert my_asset.op.output_def_named("my_out_name").metadata == {
        "foo": "bar",
        ASSET_DEPENDENCY_METADATA_KEY: {AssetKey("my_other_out_name"), AssetKey("my_in_name")},
    }
    # ...while the output without declared deps is untouched.
    assert my_asset.op.output_def_named("my_other_out_name").metadata == {
        "bar": "foo"
    }
def test_multi_asset_asset_materialization_planned_events():
    """Executing a job built from a multi-asset records an
    ASSET_MATERIALIZATION_PLANNED event for the asset key, and the run id is
    associated with every asset key the multi-asset produces."""

    @multi_asset(
        outs={
            "my_out_name": Out(asset_key=AssetKey("my_asset_name")),
            "my_other_out_name": Out(asset_key=AssetKey("my_other_asset")),
        }
    )
    def my_asset():
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    assets_job = build_assets_job("assets_job", [my_asset])

    with instance_for_test() as instance:
        result = assets_job.execute_in_process(instance=instance)
        records = instance.get_event_records(
            EventRecordsFilter(
                DagsterEventType.ASSET_MATERIALIZATION_PLANNED, AssetKey("my_asset_name")
            )
        )
        # The planned event belongs to the run we just executed.
        assert result.run_id == records[0].event_log_entry.run_id
        run_id = result.run_id
        # Both asset keys of the multi-asset map back to the same run.
        assert instance.run_ids_for_asset_key(AssetKey("my_asset_name")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("my_other_asset")) == [run_id]
def test_multi_out():
    """A dict-valued ``out=`` yields one output def per entry; ``outs``
    reflects user metadata plus types inferred from the Tuple annotation and
    filled-in defaults.

    NOTE(review): a second ``test_multi_out`` appears later in this chunk —
    if both live in the same module, the later definition shadows this one
    under pytest; confirm and rename one of them.
    """

    @op(out={"a": Out(metadata={"x": 1}), "b": Out(metadata={"y": 2})})
    def my_op() -> Tuple[int, str]:
        return 1, "q"

    assert len(my_op.output_defs) == 2

    # Int/String come from the return annotation; is_required/io_manager_key
    # are the framework defaults.
    assert my_op.outs == {
        "a": Out(metadata={"x": 1}, dagster_type=Int, is_required=True, io_manager_key="io_manager"),
        "b": Out(metadata={"y": 2}, dagster_type=String, is_required=True, io_manager_key="io_manager"),
    }

    assert my_op.output_defs[0].metadata == {"x": 1}
    assert my_op.output_defs[0].name == "a"
    assert my_op.output_defs[1].metadata == {"y": 2}
    assert my_op.output_defs[1].name == "b"

    # Direct invocation returns the raw tuple.
    assert my_op() == (1, "q")
def test_op_multiout_base():
    """A two-output op returns the raw tuple on direct call, and each named
    output is individually addressable after graph execution."""

    @op(out={"a": Out(), "b": Out()})
    def basic_multiout() -> Tuple[int, str]:
        return (5, "foo")

    expected = (5, "foo")

    # Direct invocation bypasses the framework and yields the plain tuple.
    assert basic_multiout() == expected

    result = execute_op_in_graph(basic_multiout)
    # Each declared output maps positionally onto the returned tuple.
    for out_name, value in zip(("a", "b"), expected):
        assert result.output_for_node("basic_multiout", out_name) == value
def test_multiout_dagster_type():
    """Both outputs of a multi-output op may carry the same custom dagster
    type without affecting direct invocation."""

    @op(out={"a": Out(dagster_type=even_type), "b": Out(dagster_type=even_type)})
    def basic_multi() -> Tuple[int, int]:
        return 6, 6

    returned = basic_multi()
    assert returned == (6, 6)
def test_multi_asset_infer_from_empty_asset_key():
    """When an ``Out`` carries no explicit asset_key, the asset key is
    inferred from the output name."""

    @multi_asset(outs={"my_out_name": Out(), "my_other_out_name": Out()})
    def my_asset():
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    inferred_keys = {AssetKey("my_out_name"), AssetKey("my_other_out_name")}
    assert my_asset.asset_keys == inferred_keys
def test_op_multiout_incorrect_annotation():
    """Declaring multiple outputs while annotating a non-tuple return type
    raises DagsterInvariantViolationError at definition time."""

    with pytest.raises(
        DagsterInvariantViolationError,
        match=
        "Expected Tuple annotation for multiple outputs, but received non-tuple annotation.",
    ):

        # The error fires when the decorator runs, not when the op executes.
        @op(out={"a": Out(), "b": Out()})
        def _incorrect_annotation_op() -> int:
            pass
def test_op_multiout_size_mismatch():
    """A Tuple annotation whose arity disagrees with the number of declared
    outputs raises DagsterInvariantViolationError at definition time."""

    with pytest.raises(
        DagsterInvariantViolationError,
        match=
        "Expected Tuple annotation to have number of entries matching the number of outputs "
        "for more than one output. Expected 2 outputs but annotation has 3.",
    ):

        # Two outputs declared, three-element Tuple annotated.
        @op(out={"a": Out(), "b": Out()})
        def _basic_multiout_wrong_annotation() -> Tuple[int, int, int]:
            pass
def test_log_event_multi_output():
    """``log_event`` may be called before, between, and after yielded outputs;
    all three materializations are captured for the op."""

    @op(out={"out1": Out(), "out2": Out()})
    def the_op(context):
        context.log_event(AssetMaterialization("foo"))
        yield Output(value=1, output_name="out1")
        context.log_event(AssetMaterialization("bar"))
        yield Output(value=2, output_name="out2")
        context.log_event(AssetMaterialization("baz"))

    result = execute_op_in_graph(the_op)
    assert result.success
    # One materialization per log_event call, regardless of its position
    # relative to the yielded outputs.
    assert len(result.asset_materializations_for_node("the_op")) == 3
def test_multi_out():
    """Per-output metadata and names are preserved on ``output_defs`` in
    declaration order; a direct call returns the raw tuple.

    NOTE(review): an earlier ``test_multi_out`` appears in this chunk — if
    both live in the same module, this definition shadows it under pytest;
    confirm and rename one of them.
    """

    @op(out={"a": Out(metadata={"x": 1}), "b": Out(metadata={"y": 2})})
    def my_op() -> Tuple[int, str]:
        return 1, "q"

    assert len(my_op.output_defs) == 2
    assert my_op.output_defs[0].metadata == {"x": 1}
    assert my_op.output_defs[0].name == "a"
    assert my_op.output_defs[1].metadata == {"y": 2}
    assert my_op.output_defs[1].name == "b"

    # Direct invocation returns the raw tuple.
    assert my_op() == (1, "q")
def test_multi_out_optional():
    """An output marked ``is_required=False`` may be skipped without error,
    both under graph execution and on direct invocation."""

    @op(
        out={
            "a": Out(metadata={"x": 1}, is_required=False),
            "b": Out(metadata={"y": 2}),
        }
    )
    def my_op():
        # Only the required output is emitted; "a" is deliberately skipped.
        yield Output(output_name="b", value=2)

    result = execute_op_in_graph(my_op)
    assert result.output_for_node("my_op", "b") == 2

    emitted_values = [output.value for output in my_op()]
    assert emitted_values == [2]
def test_add_output_metadata():
    """``add_output_metadata`` attaches per-output metadata that is
    retrievable from the invocation context after the op body runs."""

    @op(out={"out1": Out(), "out2": Out()})
    def the_op(context):
        context.add_output_metadata({"foo": "bar"}, output_name="out1")
        yield Output(value=1, output_name="out1")
        context.add_output_metadata({"bar": "baz"}, output_name="out2")
        yield Output(value=2, output_name="out2")

    context = build_op_context()
    # Direct invocation with a built context; consuming the generator runs
    # the body and records the metadata on the context.
    events = list(the_op(context))
    assert len(events) == 2
    assert context.get_output_metadata("out1") == {"foo": "bar"}
    assert context.get_output_metadata("out2") == {"bar": "baz"}
def test_output_values():
    """Values of a tuple-returning multi-output op are retrievable per output
    name after executing the enclosing graph."""

    @op(out={"a": Out(), "b": Out()})
    def two_outs():
        return 1, 2

    @graph
    def a():
        two_outs()

    result = a.execute_in_process()
    assert result.success
    # The returned tuple maps positionally onto the declared outputs.
    for out_name, expected in (("a", 1), ("b", 2)):
        assert result.output_for_node("two_outs", out_name) == expected
def test_multi_asset_out_name_diff_from_asset_key():
    """Output names and asset keys are independent: ``asset_keys`` reflects
    the keys declared on each ``Out``, not the output names."""

    @multi_asset(
        outs={
            "my_out_name": Out(asset_key=AssetKey("my_asset_name")),
            "my_other_out_name": Out(asset_key=AssetKey("my_other_asset")),
        }
    )
    def my_asset():
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    declared_keys = {AssetKey("my_asset_name"), AssetKey("my_other_asset")}
    assert my_asset.asset_keys == declared_keys
def test_multi_out_yields():
    """Yielding named Outputs behaves like returning a tuple; metadata and
    names on the output defs are preserved in declaration order."""

    @op(out={"a": Out(metadata={"x": 1}), "b": Out(metadata={"y": 2})})
    def my_op():
        yield Output(output_name="a", value=1)
        yield Output(output_name="b", value=2)

    # Output definitions keep their declared names and metadata.
    expected_defs = (("a", {"x": 1}), ("b", {"y": 2}))
    for output_def, (name, metadata) in zip(my_op.output_defs, expected_defs):
        assert output_def.name == name
        assert output_def.metadata == metadata

    result = execute_op_in_graph(my_op)
    assert result.output_for_node("my_op", "a") == 1
    assert result.output_for_node("my_op", "b") == 2

    # Direct invocation yields the Output events in emission order.
    emitted_values = [output.value for output in my_op()]
    assert emitted_values == [1, 2]
def test_out():
    """A single un-named ``Out`` becomes the default "result" output, with the
    dagster type inferred from the return annotation and defaults filled in."""

    @op(out=Out(metadata={"x": 1}))
    def my_op() -> int:
        return 1

    # Int comes from the annotation; is_required/io_manager_key are defaults.
    assert my_op.outs == {
        "result": Out(metadata={"x": 1}, dagster_type=Int, is_required=True, io_manager_key="io_manager")
    }
    assert my_op.output_defs[0].metadata == {"x": 1}
    assert my_op.output_defs[0].name == "result"
    assert my_op() == 1
def define_multiple_output_job():
    """Build a job whose single op yields two Int outputs (emitted in reverse
    declaration order) and persists them via the S3 pickle IO manager.

    Returns the job definition (not an execution result).
    """

    @op(
        out={
            "foo": Out(Int),
            "foobar": Out(Int),
        }
    )
    def return_two_outputs():
        # Deliberately yielded out of declaration order.
        yield Output(10, "foobar")
        yield Output(5, "foo")

    @job(resource_defs={"io_manager": s3_pickle_io_manager, "s3": s3_test_resource})
    def output_prefix_execution_plan():
        return_two_outputs()

    return output_prefix_execution_plan
def test_multi_asset_with_compute_kind():
    """``compute_kind`` on ``@multi_asset`` surfaces as a "kind" tag on the
    underlying op."""

    @multi_asset(outs={"o1": Out(asset_key=AssetKey("o1"))}, compute_kind="sql")
    def my_asset(arg1):
        return arg1

    op_tags = my_asset.op.tags
    assert op_tags == {"kind": "sql"}
def test_out_dagster_type():
    """A custom ``dagster_type`` on ``Out`` is recorded on the output def and
    does not interfere with direct invocation."""

    @op(out=Out(dagster_type=even_type))
    def basic() -> int:
        return 6

    # Exactly one output def, carrying the custom type.
    (only_output,) = basic.output_defs
    assert only_output.dagster_type == even_type
    assert basic() == 6
def build_for_materialization(materialization):
    """Execute a one-op graph under an IO manager whose ``handle_output`` is a
    generator: it stores the value, interleaves ``add_output_metadata`` calls
    with a yielded MetadataEntry, and finally yields the supplied
    materialization event.

    Returns the in-process execution result.
    """

    class DummyIOManager(IOManager):
        def __init__(self):
            # Stored values keyed by the output identifier tuple.
            self.values = {}

        def handle_output(self, context, obj):
            keys = tuple(context.get_output_identifier())
            self.values[keys] = obj

            # Context-added metadata and yielded metadata entries are
            # deliberately interleaved to exercise both reporting paths.
            context.add_output_metadata({"foo": "bar"})
            yield MetadataEntry("baz", value="baz")
            context.add_output_metadata({"bar": "bar"})

            # The caller-supplied event (e.g. an AssetMaterialization).
            yield materialization

        def load_input(self, context):
            keys = tuple(context.upstream_output.get_output_identifier())
            return self.values[keys]

    @op(out=Out(asset_key=AssetKey("key_on_out")))
    def the_op():
        return 5

    @graph
    def the_graph():
        the_op()

    return the_graph.execute_in_process(
        resources={"io_manager": DummyIOManager()})
def test_dataframe_csv_missing_input_collision():
    """Providing run_config inputs for an op whose input is already satisfied
    by an upstream dependency is rejected as unexpected config, and the op
    body never runs."""

    called = {}

    @op(out=Out(DataFrame))
    def df_as_output(_context):
        return pd.DataFrame()

    @op(ins={"df": In(DataFrame)})
    def df_as_input(_context, df):  # pylint: disable=W0613
        # Side-effect marker: lets us assert the body never executed.
        called["yup"] = True

    @graph
    def overlapping():
        # "df" is wired from df_as_output, so config for it collides.
        return df_as_input(df_as_output())

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        overlapping.execute_in_process(
            run_config={
                "ops": {
                    "df_as_input": {
                        "inputs": {"df": {"csv": {"path": file_relative_path(__file__, "num.csv")}}}
                    }
                }
            },
        )

    assert (
        'Error 1: Received unexpected config entry "inputs" at path root:ops:df_as_input.'
        in str(exc_info.value)
    )
    # Config validation failed before execution, so the op body never ran.
    assert "yup" not in called
def test_basic_multi_asset():
    """``external_asset_graph_from_defs`` produces one ExternalAssetNode per
    declared output, carrying through the asset key, output name, and the
    per-Out description (while ``op_description`` stays None)."""

    @multi_asset(
        outs={
            f"out{i}": Out(description=f"foo: {i}", asset_key=AssetKey(f"asset{i}"))
            for i in range(10)
        }
    )
    def assets():
        pass

    assets_job = build_assets_job("assets_job", [assets])

    external_asset_nodes = external_asset_graph_from_defs([assets_job], source_assets_by_key={})

    # One node per output, with no inter-asset dependencies declared.
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey(f"asset{i}"),
            dependencies=[],
            depended_by=[],
            op_name="assets",
            op_description=None,
            job_names=["assets_job"],
            output_name=f"out{i}",
            output_description=f"foo: {i}",
        )
        for i in range(10)
    ]
def nonce_op(name, n_inputs, n_outputs):
    """Creates an op with the given number of (meaningless) inputs and outputs.

    Config controls the behavior of the nonce op. The op body burns a few
    seconds of wall time while emitting a spread of log levels, then yields a
    placeholder "foo" value for every declared output.
    """

    @op(
        name=name,
        ins={"input_{}".format(i): In() for i in range(n_inputs)},
        out={"output_{}".format(i): Out() for i in range(n_outputs)},
    )
    def op_fn(context, **_kwargs):
        for i in range(200):
            time.sleep(0.02)
            # NOTE(review): with i < 200, ``i % 1000 == 420`` can never be
            # true, so the error branch is unreachable — confirm whether the
            # loop bound or the modulus condition is stale.
            if i % 1000 == 420:
                context.log.error(
                    "Error message seq={i} from op {name}".format(i=i, name=name))
            elif i % 100 == 0:
                context.log.warning(
                    "Warning message seq={i} from op {name}".format(i=i, name=name))
            elif i % 10 == 0:
                context.log.info("Info message seq={i} from op {name}".format(
                    i=i, name=name))
            else:
                context.log.debug(
                    "Debug message seq={i} from op {name}".format(i=i, name=name))
        # One placeholder value per declared output.
        for i in range(n_outputs):
            yield Output(value="foo", output_name="output_{}".format(i))

    return op_fn
def test_asset_key():
    """``asset_key`` set on ``Out``/``In`` is visible to the IO manager via
    ``context.asset_key`` and ``context.upstream_output.asset_key``."""

    in_asset_key = AssetKey(["a", "b"])
    out_asset_key = AssetKey(["c", "d"])

    @op(out=Out(asset_key=out_asset_key))
    def before():
        pass

    @op(ins={"a": In(asset_key=in_asset_key)}, out={})
    def after(a):
        assert a

    class MyIOManager(IOManager):
        def load_input(self, context):
            # The input's own key and the upstream output's key both surface.
            assert context.asset_key == in_asset_key
            assert context.upstream_output.asset_key == out_asset_key
            return 1

        def handle_output(self, context, obj):
            assert context.asset_key == out_asset_key

    @graph
    def my_graph():
        after(before())

    result = my_graph.to_job(resource_defs={
        "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
    }).execute_in_process()

    assert result.success
def test_hello_world():
    """A DataFrame-typed input can be loaded from CSV via run_config, and the
    op's computed column is visible in the graph result."""

    @op(ins={"num_csv": In(DataFrame)}, out=Out(DataFrame))
    def hello_world_op(num_csv):
        # Add a derived column from the two CSV-loaded columns.
        num_csv["sum"] = num_csv["num1"] + num_csv["num2"]
        return num_csv

    @graph
    def hello_world():
        hello_world_op()

    result = hello_world.execute_in_process(
        run_config={
            "ops": {
                "hello_world_op": {
                    "inputs": {
                        "num_csv": {"csv": {"path": file_relative_path(__file__, "num.csv")}}
                    }
                }
            }
        }
    )

    assert result.success
    # num.csv presumably holds num1=[1,3], num2=[2,4] — the sums follow.
    assert result.output_for_node("hello_world_op").to_dict("list") == {
        "num1": [1, 3],
        "num2": [2, 4],
        "sum": [3, 7],
    }
def test_date_column():
    """A DataFrame-typed op may return a frame keyed by a ``datetime.date``
    column and still produce a real pandas DataFrame on direct call."""

    @op(out=Out(DataFrame))
    def dataframe_constant():
        return pd.DataFrame([{datetime.date(2019, 1, 1): 0}])

    constructed = dataframe_constant()
    assert isinstance(constructed, pd.DataFrame)
def test_dataframe_pickle_materialization():
    """Materializing an op's DataFrame output via pickle run_config writes a
    pickle file that round-trips to the original data.

    Fix: previously the name produced by ``get_temp_file_name()`` was
    immediately clobbered with a hard-coded ``"/tmp/num.pickle"``, defeating
    the temp-file context manager — the file leaked outside its cleanup and
    could race with concurrent test runs. The managed temp name is now used,
    and the file is read back while the context manager is still open.
    """

    @op(out=Out(DataFrame))
    def return_df(_context):
        return pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    @graph
    def return_df_graph():
        return_df()

    with get_temp_file_name() as filename:
        result = return_df_graph.execute_in_process(
            run_config={
                "ops": {
                    "return_df": {
                        "outputs": [{"result": {"pickle": {"path": filename}}}]
                    }
                }
            },
        )
        assert result.success

        # Round-trip: the pickled output must equal the op's return value.
        df = pd.read_pickle(filename)
        assert df.to_dict("list") == {"num1": [1, 3], "num2": [2, 4]}
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """A dataframe type built with ``create_dagster_pandas_dataframe_type``
    runs its ``event_metadata_fn`` during the type check, and the computed
    metadata entry appears on the STEP_OUTPUT event."""

    def compute_event_metadata(dataframe):
        # Single metadata entry derived from the frame's contents.
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success

    # Inspect the type-check metadata attached to the step's output event.
    for event in result.all_node_events:
        if event.event_type_value == "STEP_OUTPUT":
            mock_df_output_event_metadata = (
                event.event_specific_data.type_check_data.metadata_entries
            )
            assert len(mock_df_output_event_metadata) == 1
            assert any([entry.label == "max_pid" for entry in mock_df_output_event_metadata])
def test_op_typing_annotations():
    """Dict and nested-tuple typing annotations are honored for single- and
    multi-output ops, both on direct call and under graph execution."""

    @op
    def my_dict_op() -> Dict[str, int]:
        return {"foo": 5}

    assert my_dict_op() == {"foo": 5}

    # Expected tuple shared between the direct-call and graph assertions.
    my_output = {"foo": 5}, ("foo", )

    @op(out={"a": Out(), "b": Out()})
    def my_dict_multiout() -> Tuple[Dict[str, int], Tuple[str]]:
        return {"foo": 5}, ("foo", )

    assert my_dict_multiout() == my_output

    result = execute_op_in_graph(my_dict_multiout)
    assert result.output_for_node("my_dict_multiout", "a") == my_output[0]
    assert result.output_for_node("my_dict_multiout", "b") == my_output[1]