def raw_file_op(_context):
    """Materialize a raw table file, run an expectation, and output its name.

    NOTE(review): ``name`` is not defined in this function's scope — it is
    presumably bound in an enclosing scope or at composition time; confirm
    against the caller.
    """
    raw_path = "/path/to/{}.raw".format(name)
    yield AssetMaterialization(
        asset_key="table_info",
        metadata={"table_path": EventMetadata.path(raw_path)},
    )
    yield do_expectation(_context, name)
    yield Output(name)
def sort_by_calories(context, cereals):
    """Sort cereal rows by calorie count, write them to a CSV, and emit events.

    Args:
        context: Execution context; ``context.run_id`` names the output file.
        cereals: Iterable of dict rows, each expected to carry at least a
            ``"calories"`` key (int-parseable) and a ``"name"`` key.

    Yields:
        An ``AssetMaterialization`` pointing at the CSV written, followed by
        ``Output(None)``.
    """
    sorted_cereals = sorted(cereals, key=lambda cereal: int(cereal["calories"]))
    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]
    logger = get_dagster_logger()
    logger.info(f"Least caloric cereal: {least_caloric}")
    logger.info(f"Most caloric cereal: {most_caloric}")
    # Column order comes from the first (least caloric) row's keys.
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        f"output/calories_sorted_{context.run_id}.csv"
    )
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)
    # BUGFIX: newline="" is required when handing a file to csv.writer;
    # without it the csv module emits \r\r\n row endings on Windows.
    with open(sorted_cereals_csv_path, "w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)
    yield AssetMaterialization(
        asset_key="sorted_cereals_csv",
        description="Cereals data frame sorted by caloric content",
        metadata={
            "sorted_cereals_csv_path": EventMetadata.path(sorted_cereals_csv_path)
        },
    )
    yield Output(None)
def my_failure_metadata_solid():
    """Run ``some_calculation`` over the files found at a fixed path.

    Raises:
        Failure: with filepath/dashboard metadata when no files are present.
    """
    files_path = "/path/to/files"
    found_files = get_files(files_path)
    if len(found_files) == 0:
        failure_metadata = {
            "filepath": EventMetadata.path(files_path),
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/failures"),
        }
        raise Failure(
            description="No files to process",
            metadata=failure_metadata,
        )
    return some_calculation(found_files)
def my_metadata_materialization_solid(context):
    """Persist a dataframe to storage and emit a materialization with metadata.

    Yields an ``AssetMaterialization`` describing the persisted dataset, then
    an ``Output`` carrying the storage path.
    """
    df = read_df()
    storage_path = persist_to_storage(df)
    event_metadata = {
        "text_metadata": "Text-based metadata for this event",
        "path": EventMetadata.path(storage_path),
        "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
        "size (bytes)": calculate_bytes(df),
    }
    yield AssetMaterialization(
        asset_key="my_dataset",
        description="Persisted result to storage",
        metadata=event_metadata,
    )
    yield Output(storage_path)
def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per raw table.

    Exercises a variety of metadata entry types (path, url, markdown, raw
    dict, oversized int, NaN float) against the same ``table_info`` asset key.
    """
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as md_file:
        md_str = md_file.read()
    for table in raw_tables:
        table_metadata = {
            "table_name": table,
            "table_path": EventMetadata.path(f"/path/to/{table}"),
            "table_data": {"name": table},
            "table_name_big": EventMetadata.url(f"https://bigty.pe/{table}"),
            "table_blurb": EventMetadata.md(md_str),
            "big_int": 29119888133298982934829348,
            "float_nan": float("nan"),
        }
        yield AssetMaterialization(asset_key="table_info", metadata=table_metadata)
def observes_dataset_op(context):
    """Persist a dataframe, log an observation and a materialization for it.

    Returns:
        The remote storage path the dataframe was persisted to.
    """
    df = read_df()
    storage_path = persist_to_storage(df)
    observation = AssetObservation(
        asset_key="my_dataset",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "path": EventMetadata.path(storage_path),
            "dashboard_url": EventMetadata.url("http://mycoolsite.com/url_for_my_data"),
            "size (bytes)": calculate_bytes(df),
        },
    )
    context.log_event(observation)
    context.log_event(AssetMaterialization(asset_key="my_dataset"))
    return storage_path
def read_file(context):
    """Materialize a configured log file and output its relative name.

    Reads ``filename`` and ``directory`` from solid config, stats the joined
    path, and yields an ``AssetMaterialization`` (path + file status metadata)
    followed by ``Output(relative_filename)``. If the file does not exist, an
    error is logged and nothing is yielded.
    """
    relative_filename = context.solid_config["filename"]
    directory = context.solid_config["directory"]
    filename = os.path.join(directory, relative_filename)
    # BUGFIX: keep the try body minimal — only os.stat can legitimately raise
    # FileNotFoundError here. The original wrapped the yields as well, so a
    # FileNotFoundError thrown into the generator by downstream code would be
    # silently misreported as "No file found".
    try:
        fstats = os.stat(filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))
        return
    context.log.info("Found file {}".format(relative_filename))
    yield AssetMaterialization(
        asset_key=AssetKey(["log_file", relative_filename]),
        metadata={
            "path": EventMetadata.path(filename),
            "File status": {
                "size": fstats.st_size,
                "ctime": fstats.st_ctime,
                "mtime": fstats.st_mtime,
            },
        },
    )
    yield Output(relative_filename)
def many_materializations_and_passing_expectations(_context):
    """Yield a materialization plus a passing row-count expectation per table."""
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]
    for table in tables:
        materialization = AssetMaterialization(
            asset_key="table_info",
            metadata={"table_path": EventMetadata.path(f"/path/to/{table}.raw")},
        )
        yield materialization
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )