def test_get_computed_asset_solid_def_with_source_deps(basic_lakehouse):
    '''A computed asset that depends only on source assets produces a solid with no inputs.'''
    upstream_b = SourceAsset(storage_key='storage1', path=('a', 'b'))
    upstream_c = SourceAsset(storage_key='storage1', path=('a', 'c'))

    @computed_asset(storage_key='storage1', input_assets=[upstream_b, upstream_c])
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    solid_def = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    # Source-asset deps require only the storage resource; they do not become solid inputs.
    assert solid_def.required_resource_keys == {'storage1'}
    _assert_input_defs(solid_def, [])
    _assert_output_def(solid_def, some_asset.dagster_type, 'result')
def test_computed_asset_multiple_deps_list():
    '''When input_assets is a list, deps are matched to parameters positionally.

    Verifies the decorated function becomes a ComputedAsset whose path defaults to
    the function name, whose output type comes from the return annotation, and whose
    per-parameter deps carry the annotated in-memory types and the matching assets.
    '''
    source_asset1 = SourceAsset(storage_key='filesystem', path=('a', 'b'))
    source_asset2 = SourceAsset(storage_key='filesystem', path=('a', 'c'))

    @computed_asset(storage_key='filesystem', input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    assert isinstance(casset, ComputedAsset)
    assert casset.path == ('casset',)
    assert casset.output_in_memory_type == str
    # Set literal instead of set([...]) — same comparison, idiomatic form.
    assert set(casset.deps.keys()) == {'b_', 'c_'}
    assert casset.deps['b_'].in_memory_type == int
    assert casset.deps['b_'].asset == source_asset1
    assert casset.deps['c_'].in_memory_type == float
    assert casset.deps['c_'].asset == source_asset2
def test_build_pipeline_definition_missing_input_policy(basic_lakehouse):
    '''Building a pipeline whose asset deps are not included should fail a check.'''
    upstream = SourceAsset(storage_key='storage1', path=('a', 'b'))

    @computed_asset(storage_key='storage1', input_assets=[upstream])
    def some_asset(source: str) -> int:
        return int(source)

    # The source asset is absent from the pipeline's asset list, so the build
    # is expected to raise a CheckError.
    with pytest.raises(CheckError):
        basic_lakehouse.build_pipeline_definition('some_pipeline', [some_asset])
def test_computed_asset_one_dep():
    '''When input_assets is a dict, the key names the parameter the dep binds to.'''
    upstream = SourceAsset(storage_key='filesystem', path=('a', 'b'))

    @computed_asset(storage_key='filesystem', input_assets={'a_': upstream})
    def casset(a_: int) -> str:
        return str(a_)

    assert isinstance(casset, ComputedAsset)
    assert casset.path == ('casset',)
    assert casset.output_in_memory_type == str

    # Exactly one dep, keyed by the parameter name from the dict.
    assert list(casset.deps) == ['a_']
    dep = casset.deps['a_']
    assert dep.in_memory_type == int
    assert dep.asset == upstream
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import SourceAsset, computed_asset
from pandas import DataFrame as PandasDF
from pyspark.sql import DataFrame as SparkDF
from pyspark.sql import Window
from pyspark.sql import functions as f

# Raw weather observations for SFO in Q2, read from the filesystem storage.
sfo_q2_weather_sample_asset = SourceAsset(
    storage_key='filesystem',
    path=('dagster_examples', 'simple_lakehouse', 'sfo_q2_weather_sample'),
)


@computed_asset(storage_key='filesystem', input_assets=[sfo_q2_weather_sample_asset])
def daily_temperature_highs_asset(sfo_q2_weather_sample: PandasDF) -> PandasDF:
    '''Computes the temperature high for each day'''
    # Parse the 'valid' timestamp column, then take per-day maxima.
    sfo_q2_weather_sample['valid_date'] = pd.to_datetime(sfo_q2_weather_sample['valid'])
    daily_max = sfo_q2_weather_sample.groupby('valid_date').max()
    return daily_max.rename(columns={'tmpf': 'max_tmpf'})


@computed_asset(storage_key='filesystem', input_assets=[daily_temperature_highs_asset])
def daily_temperature_high_diffs_asset(daily_temperature_highs: SparkDF) -> SparkDF:
    '''Computes the difference between each day's high and the previous day's high'''
    window = Window.orderBy('valid_date')
    high = daily_temperature_highs['max_tmpf']
    # Subtract the previous row's high (lag over the date-ordered window).
    day_high_diff = (high - f.lag(high).over(window)).alias('day_high_diff')
    return daily_temperature_highs.select('valid_date', day_high_diff)