import csv
import os

from dagster import AssetMaterialization, MetadataValue, get_dagster_logger


def sort_by_calories(context, cereals):
    # Sort the cereal rows by calorie count and log the extremes.
    sorted_cereals = sorted(cereals, key=lambda cereal: int(cereal["calories"]))
    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]
    logger = get_dagster_logger()
    logger.info(f"Least caloric cereal: {least_caloric}")
    logger.info(f"Most caloric cereal: {most_caloric}")

    # Write the sorted rows to a run-scoped CSV file.
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        f"output/calories_sorted_{context.run_id}.csv"
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    with open(sorted_cereals_csv_path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)

    # Record the CSV as a materialized asset, with its path as metadata.
    context.log_event(
        AssetMaterialization(
            asset_key="sorted_cereals_csv",
            description="Cereals data frame sorted by caloric content",
            metadata={
                "sorted_cereals_csv_path": MetadataValue.path(sorted_cereals_csv_path)
            },
        )
    )
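
# A minimal usage sketch. The sample rows are hypothetical stand-ins for the
# tutorial's cereal.csv loaded via csv.DictReader, and it assumes a Dagster
# version whose direct-invocation context supplies an ephemeral run_id and
# supports context.log_event.
from dagster import build_op_context

cereals = [
    {"name": "Corn Flakes", "calories": "100"},
    {"name": "Mueslix Crispy Blend", "calories": "160"},
]
sort_by_calories(build_op_context(), cereals)
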

def raw_file_op(_context):
    # `name` and `do_expectation` are assumed to come from an enclosing
    # factory scope; see the sketch below.
    yield AssetMaterialization(
        asset_key="table_info",
        metadata={
            "table_path": MetadataValue.path("/path/to/{}.raw".format(name))
        },
    )
    yield do_expectation(_context, name)
    yield Output(name)
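
# A minimal sketch of the enclosing factory this op body implies. The factory
# name `make_raw_file_op` and the stub `do_expectation` are assumptions for
# illustration, not the source's actual helpers.
from dagster import AssetMaterialization, ExpectationResult, MetadataValue, Output, op


def make_raw_file_op(name):
    def do_expectation(_context, table_name):
        # Stub expectation; the real check presumably inspects the raw file.
        return ExpectationResult(success=True, label=f"{table_name}_exists")

    @op(name=f"raw_file_{name}")
    def raw_file_op(_context):
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={"table_path": MetadataValue.path(f"/path/to/{name}.raw")},
        )
        yield do_expectation(_context, name)
        yield Output(name)

    return raw_file_op
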

import os
import pickle
import tempfile

# Uses the legacy (pre-1.0) pipeline/solid API.
from dagster import MetadataValue, execute_pipeline, fs_io_manager


def test_fs_io_manager():
    with tempfile.TemporaryDirectory() as tmpdir_path:
        io_manager = fs_io_manager.configured({"base_dir": tmpdir_path})
        pipeline_def = define_pipeline(io_manager)  # helper sketched below

        result = execute_pipeline(pipeline_def)
        assert result.success

        handled_output_events = list(
            filter(lambda evt: evt.is_handled_output, result.event_list)
        )
        assert len(handled_output_events) == 2

        # solid_a's output is pickled to <base_dir>/<run_id>/solid_a/result.
        filepath_a = os.path.join(tmpdir_path, result.run_id, "solid_a", "result")
        result_metadata_entry_a = handled_output_events[
            0
        ].event_specific_data.metadata_entries[0]
        assert result_metadata_entry_a.label == "path"
        assert result_metadata_entry_a.value == MetadataValue.path(filepath_a)
        assert os.path.isfile(filepath_a)
        with open(filepath_a, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3]

        # solid_b loads solid_a's output from the same path.
        loaded_input_events = list(
            filter(lambda evt: evt.is_loaded_input, result.event_list)
        )
        assert len(loaded_input_events) == 1
        input_metadata_entry_a = loaded_input_events[
            0
        ].event_specific_data.metadata_entries[0]
        assert input_metadata_entry_a.label == "path"
        assert input_metadata_entry_a.value == MetadataValue.path(filepath_a)
        assert loaded_input_events[0].event_specific_data.upstream_step_key == "solid_a"

        filepath_b = os.path.join(tmpdir_path, result.run_id, "solid_b", "result")
        result_metadata_entry_b = handled_output_events[
            1
        ].event_specific_data.metadata_entries[0]
        assert result_metadata_entry_b.label == "path"
        assert result_metadata_entry_b.value == MetadataValue.path(filepath_b)
        assert os.path.isfile(filepath_b)
        with open(filepath_b, "rb") as read_obj:
            assert pickle.load(read_obj) == 1
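
# A plausible sketch of the `define_pipeline` helper the test assumes, using
# the same legacy API; it is inferred from the assertions above (solid_a
# returns [1, 2, 3], solid_b consumes it and returns 1), so the exact
# original may differ.
from dagster import ModeDefinition, pipeline, solid


def define_pipeline(io_manager):
    @solid
    def solid_a(_context):
        return [1, 2, 3]

    @solid
    def solid_b(_context, _df):
        return 1

    @pipeline(
        mode_defs=[ModeDefinition("default", resource_defs={"io_manager": io_manager})]
    )
    def asset_pipeline():
        solid_b(solid_a())

    return asset_pipeline
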

from pathlib import Path

from dagster import AssetMaterialization, MetadataValue


def the_solid(_context):
    # Exercises one metadata entry of each common type on a materialization.
    yield AssetMaterialization(
        asset_key="foo",
        metadata={
            "text": "FOO",
            "int": 22,
            "url": MetadataValue.url("http://fake.com"),
            "float": 0.1,
            "path": MetadataValue.path(Path("/a/b.csv")),
            "python": MetadataValue.python_artifact(MetadataValue),
        },
    )

from dagster import Failure, MetadataValue


def my_failure_metadata_op():
    path = "/path/to/files"
    my_files = get_files(path)  # helpers stubbed below
    if len(my_files) == 0:
        # Fail the step with structured metadata attached to the Failure event.
        raise Failure(
            description="No files to process",
            metadata={
                "filepath": MetadataValue.path(path),
                "dashboard_url": MetadataValue.url("http://mycoolsite.com/failures"),
            },
        )
    return some_calculation(my_files)
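
# Hypothetical stand-ins for the helpers the op references; the names are
# taken from the snippet, but the behavior here is an assumption for
# illustration only.
import os


def get_files(path):
    # Return the regular files directly under `path` (empty if it doesn't exist).
    if not os.path.isdir(path):
        return []
    return [
        os.path.join(path, name)
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    ]


def some_calculation(files):
    # Placeholder processing step.
    return len(files)
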

from dagster import AssetMaterialization, MetadataValue, Output
from dagster_aws.s3 import S3FileHandle


def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    # Stream the file from the file manager into S3.
    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)

    s3_file_handle = S3FileHandle(bucket, key)
    yield AssetMaterialization(
        asset_key=s3_file_handle.s3_path,
        metadata={last_key(key): MetadataValue.path(s3_file_handle.s3_path)},
    )
    yield Output(value=s3_file_handle, output_name="s3_file_handle")
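
# `last_key` is referenced above but not defined in the snippet; a minimal
# sketch of what it plausibly does (final component of the S3 key):
def last_key(key: str) -> str:
    if "/" not in key:
        return key
    return key.split("/")[-1]
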

from dagster import AssetMaterialization, MetadataValue


def my_metadata_materialization_op(context):
    df = read_df()  # helpers stubbed below
    remote_storage_path = persist_to_storage(df)
    context.log_event(
        AssetMaterialization(
            asset_key="my_dataset",
            description="Persisted result to storage",
            metadata={
                "text_metadata": "Text-based metadata for this event",
                "path": MetadataValue.path(remote_storage_path),
                "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
                "size (bytes)": calculate_bytes(df),
            },
        )
    )
    return remote_storage_path
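
# Hypothetical stand-ins for the undefined helpers above, so the snippet can
# run end to end; names come from the snippet, behavior is assumed.
import os
import pickle
import tempfile


def read_df():
    # Load the dataframe-like payload; a plain list stands in here.
    return [1, 2, 3]


def persist_to_storage(df):
    # Write the payload somewhere durable and return its path.
    path = os.path.join(tempfile.gettempdir(), "my_dataset.pkl")
    with open(path, "wb") as f:
        pickle.dump(df, f)
    return path


def calculate_bytes(df):
    # Size of the serialized payload in bytes.
    return float(len(pickle.dumps(df)))
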

def read_file(context):
    relative_filename = context.op_config["filename"]
    directory = context.op_config["directory"]
    filename = os.path.join(directory, relative_filename)
    try:
        fstats = os.stat(filename)
        context.log.info(f"Found file {relative_filename}")
        yield AssetMaterialization(
            asset_key=AssetKey(["log_file", relative_filename]),
            metadata={
                "path": MetadataValue.path(filename),
                "File status": {
                    "size": fstats.st_size,
                    "ctime": fstats.st_ctime,
                    "mtime": fstats.st_mtime,
                },
            },
        )
        yield Output(relative_filename)
    except FileNotFoundError:
        context.log.error(f"No file found: {relative_filename}")
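
# A hypothetical run-config sketch for this op, assuming it is mounted in a
# job under its function name; `context.op_config` reads the "config" block.
# The directory and filename values are placeholders.
run_config = {
    "ops": {
        "read_file": {
            "config": {"directory": "/var/log", "filename": "app.log"},
        }
    }
}
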

def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()
    for table in raw_tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={
                "table_name": table,
                "table_path": MetadataValue.path(f"/path/to/{table}"),
                "table_data": {"name": table},
                "table_name_big": MetadataValue.url(f"https://bigty.pe/{table}"),
                "table_blurb": MetadataValue.md(md_str),
                # Deliberately exercises big-int and NaN metadata values.
                "big_int": 29119888133298982934829348,
                "float_nan": float("nan"),
            },
        )
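
# Hypothetical module-level values assumed by the op above; the real
# MARKDOWN_EXAMPLE filename and table list live elsewhere in the source.
from dagster import file_relative_path  # resolves paths relative to this module

MARKDOWN_EXAMPLE = "markdown_example.md"
raw_tables = ["users", "groups", "events"]
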

def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]
    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={
                "table_path": MetadataValue.path(f"/path/to/{table}.raw"),
            },
        )
        yield ExpectationResult(
            success=True,
            label=f"{table}.row_count",
            description=f"Row count passed for {table}",
        )

from datetime import datetime

from dagster import AssetGroup, HourlyPartitionsDefinition, MetadataValue, asset


@asset(partitions_def=daily_partitions_def)
def downstream_daily_partitioned_asset(upstream_daily_partitioned_asset):
    assert upstream_daily_partitioned_asset is None


@asset(
    metadata={"owner": "*****@*****.**"},
    partitions_def=HourlyPartitionsDefinition(start_date=datetime(2022, 3, 12, 0, 0)),
)
def hourly_partitioned_asset():
    pass


@asset(
    metadata={
        "owner": "*****@*****.**",
        "text_metadata": "Text-based metadata about this asset",
        "path": MetadataValue.path("/unpartitioned/asset"),
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_asset"),
    },
)
def unpartitioned_asset():
    pass


partitioned_asset_group = AssetGroup.from_current_module()
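
# A plausible sketch of the definitions the snippet above references (they
# would precede it in the module); the start date and return value are
# assumptions for illustration.
from dagster import DailyPartitionsDefinition

daily_partitions_def = DailyPartitionsDefinition(start_date="2022-03-12")


@asset(partitions_def=daily_partitions_def)
def upstream_daily_partitioned_asset():
    return None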