Beispiel #1
0
def test_get_computed_asset_solid_def_with_source_deps(basic_lakehouse):
    '''Check the solid built for a computed asset whose inputs are all source assets.

    The solid is expected to require the asset's storage key as a resource, to
    declare no input definitions, and to expose an output named 'result' typed
    with the asset's dagster type.
    '''
    upstream_sources = [
        SourceAsset(storage_key='storage1', path=('a', 'b')),
        SourceAsset(storage_key='storage1', path=('a', 'c')),
    ]

    @computed_asset(storage_key='storage1', input_assets=upstream_sources)
    def some_asset(source1: int, source2: int) -> int:
        return source1 + source2

    # NOTE(review): the second argument is an empty list here; its meaning is
    # defined by the lakehouse API -- confirm against get_computed_asset_solid_def.
    built_solid = basic_lakehouse.get_computed_asset_solid_def(some_asset, [])

    assert built_solid.required_resource_keys == {'storage1'}
    _assert_input_defs(built_solid, [])
    _assert_output_def(built_solid, some_asset.dagster_type, 'result')
def test_computed_asset_multiple_deps_list():
    '''Check a computed asset declared with a list of two source-asset inputs.

    Each dependency is expected to be keyed by the decorated function's parameter
    name and to carry that parameter's annotated type and the matching source
    asset, in list order.
    '''
    first_source = SourceAsset(storage_key='filesystem', path=('a', 'b'))
    second_source = SourceAsset(storage_key='filesystem', path=('a', 'c'))

    @computed_asset(
        storage_key='filesystem',
        input_assets=[first_source, second_source],
    )
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    # Parameter name -> (annotated in-memory type, asset supplied for it).
    expected_deps = {
        'b_': (int, first_source),
        'c_': (float, second_source),
    }

    assert isinstance(casset, ComputedAsset)
    assert casset.path == ('casset', )
    assert casset.output_in_memory_type == str
    assert set(casset.deps.keys()) == set(expected_deps)
    for param_name, (expected_type, expected_asset) in expected_deps.items():
        dep = casset.deps[param_name]
        assert dep.in_memory_type == expected_type
        assert dep.asset == expected_asset
Beispiel #3
0
def test_build_pipeline_definition_missing_input_policy(basic_lakehouse):
    '''Building a pipeline for this asset must raise CheckError.

    NOTE(review): judging by the test name, the failure is presumably caused by the
    fixture lakehouse having no policy for the `str` input type -- confirm against
    the `basic_lakehouse` fixture.
    '''
    upstream = SourceAsset(storage_key='storage1', path=('a', 'b'))

    @computed_asset(storage_key='storage1', input_assets=[upstream])
    def some_asset(source: str) -> int:
        return int(source)

    pipeline_assets = [some_asset]
    with pytest.raises(CheckError):
        basic_lakehouse.build_pipeline_definition('some_pipeline', pipeline_assets)
def test_computed_asset_one_dep():
    '''Check a computed asset declared with a single input given as a dict.

    The only dependency is expected to be keyed 'a_' and to carry the parameter's
    annotated type together with the source asset passed in the mapping.
    '''
    upstream = SourceAsset(storage_key='filesystem', path=('a', 'b'))

    @computed_asset(storage_key='filesystem', input_assets={'a_': upstream})
    def casset(a_: int) -> str:
        return str(a_)

    assert isinstance(casset, ComputedAsset)
    assert casset.path == ('casset', )
    assert casset.output_in_memory_type == str
    assert list(casset.deps.keys()) == ['a_']

    only_dep = casset.deps['a_']
    assert only_dep.in_memory_type == int
    assert only_dep.asset == upstream
Beispiel #5
0
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import SourceAsset, computed_asset
from pandas import DataFrame as PandasDF
from pyspark.sql import DataFrame as SparkDF
from pyspark.sql import Window
from pyspark.sql import functions as f

# Source asset that the computed assets in this module take as their root input.
sfo_q2_weather_sample_asset = SourceAsset(
    path=(
        'dagster_examples',
        'simple_lakehouse',
        'sfo_q2_weather_sample',
    ),
    storage_key='filesystem',
)


@computed_asset(storage_key='filesystem', input_assets=[sfo_q2_weather_sample_asset])
def daily_temperature_highs_asset(sfo_q2_weather_sample: PandasDF) -> PandasDF:
    '''Computes the temperature high for each day.

    Args:
        sfo_q2_weather_sample: Weather observations with a 'valid' column that
            ``pd.to_datetime`` can parse. The frame is not modified.

    Returns:
        A new frame indexed by 'valid_date' (the observation time truncated to
        midnight) holding the per-day maximum of every other column, with the
        'tmpf' column renamed to 'max_tmpf'.
    '''
    # Truncate to midnight so that every observation from one calendar day shares
    # a group key; grouping on the raw timestamp would produce one group per
    # distinct observation time instead of one per day.
    valid_date = pd.to_datetime(sfo_q2_weather_sample['valid']).dt.normalize()
    # assign() returns a copy, so the caller's frame does not gain a column.
    with_date = sfo_q2_weather_sample.assign(valid_date=valid_date)
    return with_date.groupby('valid_date').max().rename(columns={'tmpf': 'max_tmpf'})


@computed_asset(storage_key='filesystem', input_assets=[daily_temperature_highs_asset])
def daily_temperature_high_diffs_asset(daily_temperature_highs: SparkDF) -> SparkDF:
    '''Computes the difference between each day's high and the previous day's high.

    Rows are ordered by 'valid_date'; the result keeps 'valid_date' and adds
    'day_high_diff', the row's 'max_tmpf' minus the preceding row's 'max_tmpf'.
    A row with no predecessor gets whatever `f.lag` yields by default (null
    according to the PySpark documentation).
    '''
    todays_high = daily_temperature_highs['max_tmpf']
    by_date = Window.orderBy('valid_date')
    previous_high = f.lag(todays_high).over(by_date)
    day_high_diff = (todays_high - previous_high).alias('day_high_diff')
    return daily_temperature_highs.select('valid_date', day_high_diff)