def test_io_manager_single_partition_materialization():
    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            yield entry1

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(output_defs=[OutputDefinition(name="output1")])
    def solid1(_):
        return Output(None, "output1")

    @solid(output_defs=[OutputDefinition(name="output2")])
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": my_io_manager})])
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    check_materialization(materializations[0], AssetKey(["solid1"]), metadata_entries=[entry1])
    check_materialization(
        materializations[1],
        AssetKey(["solid2"]),
        metadata_entries=[entry1, entry2],
        parent_assets=[AssetLineageInfo(AssetKey(["solid1"]))],
    )
def test_dynamic_output_definition_single_partition_materialization():
    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(
        output_defs=[
            DynamicOutputDefinition(
                name="output2", asset_key=lambda context: AssetKey(context.mapping_key)
            )
        ]
    )
    def solid2(_, _input1):
        for i in range(4):
            yield DynamicOutput(
                7,
                mapping_key=str(i),
                output_name="output2",
                metadata_entries=[entry2],
            )

    @solid
    def do_nothing(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1()).map(do_nothing)

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 5

    check_materialization(materializations[0], AssetKey(["table1"]), metadata_entries=[entry1])
    seen_paths = set()
    for i in range(1, 5):
        path = materializations[i].asset_key.path
        seen_paths.add(tuple(path))
        check_materialization(
            materializations[i],
            AssetKey(path),
            metadata_entries=[entry2],
            parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
        )
    assert len(seen_paths) == 4
def handle_output(self, context, obj):
    """Pickle the data and store the object to a custom file path.

    This method emits an AssetMaterialization event so the assets will be tracked by the
    Asset Catalog.
    """
    check.inst_param(context, "context", OutputContext)
    metadata = context.metadata
    path = check.str_param(metadata.get("path"), "metadata.path")

    filepath = self._get_path(path)

    # Ensure path exists
    mkdir_p(os.path.dirname(filepath))
    context.log.debug(f"Writing file at: {filepath}")
    with open(filepath, self.write_mode) as write_obj:
        pickle.dump(obj, write_obj, PICKLE_PROTOCOL)

    return AssetMaterialization(
        asset_key=AssetKey([context.pipeline_name, context.step_key, context.name]),
        metadata_entries=[EventMetadataEntry.fspath(os.path.abspath(filepath))],
    )
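# Hedged usage sketch (not from the source): it shows how the "path" read from context.metadata
# above is typically supplied through OutputDefinition metadata, and how an io_manager resource
# built around this handle_output could be wired into a pipeline. MyPathIOManager,
# my_path_io_manager, produce_result, and path_io_pipeline are hypothetical names standing in
# for the class this method belongs to and its consumers.
from dagster import ModeDefinition, Output, OutputDefinition, io_manager, pipeline, solid


@io_manager
def my_path_io_manager(_):
    # assumed concrete IOManager exposing the handle_output above and a _get_path helper
    return MyPathIOManager()


@solid(output_defs=[OutputDefinition(name="result", metadata={"path": "out/result.pkl"})])
def produce_result(_):
    # handle_output pickles this value at the path taken from the output metadata
    return Output({"nrows": 123}, "result")


@pipeline(mode_defs=[ModeDefinition(resource_defs={"io_manager": my_path_io_manager})])
def path_io_pipeline():
    produce_result()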
def fail_solid(_):
    yield Output(
        None,
        metadata_entries=[PartitionMetadataEntry("3", EventMetadataEntry.int(1, "x"))],
    )
def handle_output(self, context: OutputContext, obj: bytes):
    """Write the executed notebook (raw bytes) to the resolved output notebook path."""
    check.inst_param(context, "context", OutputContext)

    # the output notebook itself is stored at output_file_path
    output_notebook_path = self._get_path(context)
    mkdir_p(os.path.dirname(output_notebook_path))
    with open(output_notebook_path, self.write_mode) as dest_file_obj:
        dest_file_obj.write(obj)

    yield EventMetadataEntry.fspath(path=output_notebook_path, label="path")
def test_output_definition_single_partition_materialization():
    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    @solid(output_defs=[OutputDefinition(name="output1", asset_key=AssetKey("table1"))])
    def solid1(_):
        return Output(None, "output1", metadata_entries=[entry1])

    @solid(
        output_defs=[OutputDefinition(name="output2", asset_key=lambda _: AssetKey("table2"))]
    )
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 2

    check_materialization(materializations[0], AssetKey(["table1"]), metadata_entries=[entry1])
    check_materialization(
        materializations[1],
        AssetKey(["table2"]),
        metadata_entries=[entry2],
        parent_assets=[AssetLineageInfo(AssetKey(["table1"]))],
    )
def test_table_metadata_value_schema_inference():
    table_metadata_entry = EventMetadataEntry.table(
        records=[
            TableRecord(name="foo", status=False),
            TableRecord(name="bar", status=True),
        ],
        label="foo",
    )

    schema = table_metadata_entry.entry_data.schema
    assert isinstance(schema, TableSchema)
    assert schema.columns == [
        TableColumn(name="name", type="string"),
        TableColumn(name="status", type="bool"),
    ]
def test_input_definition_multiple_partition_lineage():
    entry1 = EventMetadataEntry.int(123, "nrows")
    entry2 = EventMetadataEntry.float(3.21, "some value")

    partition_entries = [
        EventMetadataEntry.int(123 * i * i, "partition count") for i in range(3)
    ]

    @solid(
        output_defs=[
            OutputDefinition(
                name="output1",
                asset_key=AssetKey("table1"),
                asset_partitions=set([str(i) for i in range(3)]),
            )
        ],
    )
    def solid1(_):
        return Output(
            None,
            "output1",
            metadata_entries=[
                entry1,
                *[
                    PartitionMetadataEntry(str(i), entry)
                    for i, entry in enumerate(partition_entries)
                ],
            ],
        )

    @solid(
        input_defs=[
            # here, only take 1 of the asset keys specified by the output
            InputDefinition(
                name="_input1", asset_key=AssetKey("table1"), asset_partitions=set(["0"])
            )
        ],
        output_defs=[
            OutputDefinition(name="output2", asset_key=lambda _: AssetKey("table2"))
        ],
    )
    def solid2(_, _input1):
        yield Output(
            7,
            "output2",
            metadata_entries=[entry2],
        )

    @pipeline
    def my_pipeline():
        solid2(solid1())

    result = execute_pipeline(my_pipeline)
    events = result.step_event_list
    materializations = [
        event for event in events if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4

    seen_partitions = set()
    for i in range(3):
        partition = materializations[i].partition
        seen_partitions.add(partition)
        check_materialization(
            materializations[i],
            AssetKey(["table1"]),
            metadata_entries=[entry1, partition_entries[int(partition)]],
        )
    assert len(seen_partitions) == 3

    check_materialization(
        materializations[-1],
        AssetKey(["table2"]),
        parent_assets=[n_asset_keys("table1", 1)],
        metadata_entries=[entry2],
    )
def _t_fn(step_context, inputs):
    check.inst_param(step_context, "step_context", SolidExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "context",
        "StepExecutionContext must have valid run_config",
    )

    step_execution_context = step_context.get_step_execution_context()

    with tempfile.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:
            prefix = str(uuid.uuid4())
            parameterized_notebook_path = os.path.join(
                output_notebook_dir, f"{prefix}-inter.ipynb"
            )
            executed_notebook_path = os.path.join(output_notebook_dir, f"{prefix}-out.ipynb")

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            compute_descriptor = (
                "solid" if dagster_factory_name == "define_dagstermill_solid" else "op"
            )
            nb_no_parameters = replace_parameters(
                step_execution_context,
                nb,
                get_papermill_parameters(
                    step_execution_context, inputs, output_log_path, compute_descriptor
                ),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            try:
                papermill_engines.register("dagstermill", DagstermillEngine)
                papermill.execute_notebook(
                    input_path=parameterized_notebook_path,
                    output_path=executed_notebook_path,
                    engine_name="dagstermill",
                    log_output=True,
                )
            except Exception as ex:
                step_execution_context.log.warn(
                    "Error when attempting to materialize executed notebook: {exc}".format(
                        exc=str(serializable_error_info_from_exc_info(sys.exc_info()))
                    )
                )

                # pylint: disable=no-member
                # compat:
                if isinstance(ex, ExecutionError) and (
                    ex.ename == "RetryRequested" or ex.ename == "Failure"
                ):
                    step_execution_context.log.warn(
                        f"Encountered raised {ex.ename} in notebook. Use dagstermill.yield_event "
                        "with RetryRequested or Failure to trigger their behavior."
                    )

                raise

            step_execution_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}.".format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                )
            )

            if output_notebook_name is not None:
                # yield output notebook binary stream as a solid output
                with open(executed_notebook_path, "rb") as fd:
                    yield Output(fd.read(), output_notebook_name)

            else:
                # backcompat
                executed_notebook_file_handle = None
                try:
                    # use binary mode when moving the file since certain file_managers such as S3
                    # may try to hash the contents
                    with open(executed_notebook_path, "rb") as fd:
                        executed_notebook_file_handle = step_context.resources.file_manager.write(
                            fd, mode="wb", ext="ipynb"
                        )
                        executed_notebook_materialization_path = (
                            executed_notebook_file_handle.path_desc
                        )

                    yield AssetMaterialization(
                        asset_key=(asset_key_prefix + [f"{name}_output_notebook"]),
                        description="Location of output notebook in file manager",
                        metadata_entries=[
                            EventMetadataEntry.fspath(executed_notebook_materialization_path)
                        ],
                    )
                except Exception:
                    # if file manager writing errors, e.g. file manager is not provided, we throw
                    # a warning and fall back to the previously stored temp executed notebook.
                    step_context.log.warning(
                        "Error when attempting to materialize executed notebook using file manager: "
                        f"{str(serializable_error_info_from_exc_info(sys.exc_info()))}"
                        f"\nNow falling back to local: notebook execution was temporarily materialized at {executed_notebook_path}"
                        "\nIf you have supplied a file manager and expect to use it for materializing the "
                        'notebook, please include "file_manager" in the `required_resource_keys` argument '
                        f"to `{dagster_factory_name}`"
                    )

                if output_notebook is not None:
                    yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for output_name, _ in step_execution_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    # read outputs that were passed out of process via io manager from `yield_result`
                    step_output_handle = StepOutputHandle(
                        step_key=step_execution_context.step.key, output_name=output_name
                    )
                    output_context = step_execution_context.get_output_context(step_output_handle)
                    io_manager = step_execution_context.get_io_manager(step_output_handle)
                    value = io_manager.load_input(
                        build_input_context(upstream_output=output_context)
                    )

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        event = pickle.loads(fd.read())
                        if isinstance(event, (Failure, RetryRequested)):
                            raise event
                        else:
                            yield event
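# Hedged sketch (not from the source) of the notebook-side calls that the scrap-reading loops
# above consume: dagstermill.yield_result records the output under its output name (the value
# itself travels back through the step's io_manager, which _t_fn loads above), and
# dagstermill.yield_event writes pickled "event-" scraps that _t_fn re-yields, or raises when
# the event is a Failure or RetryRequested. The asset key and output name are illustrative.
import dagstermill

from dagster import AssetMaterialization

# in a notebook cell executed by papermill via the dagstermill engine:
dagstermill.yield_event(AssetMaterialization(asset_key="my_notebook_table"))
dagstermill.yield_result(42, output_name="answer")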