def get_assets():
    """Build a small asset graph for tests.

    Dependency structure (reconstructed from the definitions below)::

        a --> b --\
                   +--> d
        a --> c --/

        e --> f
    """
    a = source_asset(path="a")

    @computed_asset(input_assets=[a])
    def b(_):
        pass

    @computed_asset(input_assets=[a])
    def c(_):
        pass

    @computed_asset(input_assets=[b, c])
    def d(_b, _c):
        pass

    e = source_asset(path="e")

    @computed_asset(input_assets=[e])
    def f(_):
        pass

    return a, b, c, d, e, f
def test_get_computed_asset_solid_def_with_source_deps_multiple_storages(basic_lakehouse):
    """An asset whose inputs live in two storages requires both storage resources."""
    upstream_one = source_asset(storage_key="storage1", path=("a", "b"))
    upstream_two = source_asset(storage_key="storage2", path=("a", "c"))

    @computed_asset(storage_key="storage1", input_assets=[upstream_one, upstream_two])
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    solid_def = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    assert solid_def.required_resource_keys == {"storage1", "storage2"}
    _assert_input_defs(solid_def, [])
    _assert_output_def(solid_def, some_asset.dagster_type, "result")
def test_get_computed_asset_solid_def_with_source_deps_multiple_storages(basic_lakehouse):
    """Solid def for an asset with inputs in two storages declares both resource keys."""
    source_asset1 = source_asset(storage_key="storage1", path=("a", "b"))
    source_asset2 = source_asset(storage_key="storage2", path=("a", "c"))

    @computed_asset(storage_key="storage1", input_assets=[source_asset1, source_asset2])
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    solid_def = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    # Independent checks — order is not significant.
    _assert_input_defs(solid_def, [])
    _assert_output_def(solid_def, some_asset.dagster_type, "result")
    assert solid_def.required_resource_keys == {"storage1", "storage2"}
def test_computed_asset_multiple_deps_list():
    """A list of input assets binds positionally onto the function's parameters."""
    b_source = source_asset(storage_key="filesystem", path=("a", "b"))
    c_source = source_asset(storage_key="filesystem", path=("a", "c"))

    @computed_asset(storage_key="filesystem", input_assets=[b_source, c_source])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    assert casset.computation
    assert casset.path == ("casset",)
    assert casset.computation.output_in_memory_type == str
    assert set(casset.computation.deps.keys()) == {"b_", "c_"}
    assert casset.computation.deps["b_"].in_memory_type == int
    assert casset.computation.deps["b_"].asset == b_source
    assert casset.computation.deps["c_"].in_memory_type == float
    assert casset.computation.deps["c_"].asset == c_source
def test_computed_asset_multiple_deps_list():
    """input_assets given as a list should map to parameters in declaration order."""
    source_asset1 = source_asset(storage_key="filesystem", path=("a", "b"))
    source_asset2 = source_asset(storage_key="filesystem", path=("a", "c"))

    @computed_asset(storage_key="filesystem", input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    assert casset.computation
    assert casset.path == ("casset",)
    assert casset.computation.output_in_memory_type == str

    deps = casset.computation.deps
    assert set(deps.keys()) == {"b_", "c_"}
    assert (deps["b_"].in_memory_type, deps["b_"].asset) == (int, source_asset1)
    assert (deps["c_"].in_memory_type, deps["c_"].asset) == (float, source_asset2)
def test_build_pipeline_definition_missing_input_policy(basic_lakehouse):
    """Building a pipeline whose source input has no matching policy raises CheckError."""
    upstream = source_asset(storage_key="storage1", path=("a", "b"))

    @computed_asset(storage_key="storage1", input_assets=[upstream])
    def some_asset(source: str) -> int:
        return int(source)

    with pytest.raises(CheckError):
        basic_lakehouse.build_pipeline_definition("some_pipeline", [some_asset])
def test_computed_asset_one_dep():
    """A single dict-keyed input asset binds to the parameter named by its key."""
    upstream = source_asset(storage_key="filesystem", path=("a", "b"))

    @computed_asset(storage_key="filesystem", input_assets={"a_": upstream})
    def casset(a_: int) -> str:
        return str(a_)

    assert casset.computation
    assert casset.path == ("casset",)
    assert casset.computation.output_in_memory_type == str
    assert list(casset.computation.deps.keys()) == ["a_"]
    dep = casset.computation.deps["a_"]
    assert dep.in_memory_type == int
    assert dep.asset == upstream
def test_computed_asset_one_dep():
    """input_assets as a dict should attach the asset under the given key."""
    source_asset1 = source_asset(storage_key="filesystem", path=("a", "b"))

    @computed_asset(storage_key="filesystem", input_assets={"a_": source_asset1})
    def casset(a_: int) -> str:
        return str(a_)

    computation = casset.computation
    assert computation
    assert casset.path == ("casset",)
    assert computation.output_in_memory_type == str
    assert list(computation.deps.keys()) == ["a_"]
    assert computation.deps["a_"].in_memory_type == int
    assert computation.deps["a_"].asset == source_asset1
'''Asset definitions for the simple_lakehouse example.''' import pandas as pd from lakehouse import computed_asset, source_asset from pandas import DataFrame as PandasDF from pyspark.sql import DataFrame as SparkDF from pyspark.sql import Window from pyspark.sql import functions as f sfo_q2_weather_sample_asset = source_asset(storage_key='filesystem', path=('dagster_examples', 'simple_lakehouse', 'sfo_q2_weather_sample')) @computed_asset(storage_key='filesystem', input_assets=[sfo_q2_weather_sample_asset]) def daily_temperature_highs_asset(sfo_q2_weather_sample: PandasDF) -> PandasDF: '''Computes the temperature high for each day''' sfo_q2_weather_sample['valid_date'] = pd.to_datetime( sfo_q2_weather_sample['valid']) return sfo_q2_weather_sample.groupby('valid_date').max().rename( columns={'tmpf': 'max_tmpf'}) @computed_asset(storage_key='filesystem', input_assets=[daily_temperature_highs_asset]) def daily_temperature_high_diffs_asset( daily_temperature_highs: SparkDF) -> SparkDF: '''Computes the difference between each day's high and the previous day's high''' window = Window.orderBy('valid_date') return daily_temperature_highs.select(