"""Asset definitions for the multi_type_lakehouse example.""" import pandas as pd from lakehouse import Column, computed_table, source_table from pandas import DataFrame as PandasDF from pyarrow import date32, float64, string from pyspark.sql import DataFrame as SparkDF from pyspark.sql import Window from pyspark.sql import functions as f sfo_q2_weather_sample_table = source_table( path=("sfo_q2_weather_sample", ), columns=[Column("tmpf", float64()), Column("valid_date", string())], ) @computed_table( input_assets=[sfo_q2_weather_sample_table], columns=[Column("valid_date", date32()), Column("max_tmpf", float64())], ) def daily_temperature_highs_table(sfo_q2_weather_sample: PandasDF) -> PandasDF: """Computes the temperature high for each day""" sfo_q2_weather_sample["valid_date"] = pd.to_datetime( sfo_q2_weather_sample["valid"]) return sfo_q2_weather_sample.groupby("valid_date").max().rename( columns={"tmpf": "max_tmpf"}) @computed_table( input_assets=[daily_temperature_highs_table],
from lakehouse import Column, computed_asset, computed_table, source_asset

# The imports above and the two source-asset definitions below are reconstructed:
# the original excerpt starts mid-file, after the setup that defined
# source_asset1 and source_asset2, so their paths here are assumed stand-ins.
source_asset1 = source_asset(path=('source1',))
source_asset2 = source_asset(path=('source2',))


@computed_asset(storage_key='filesystem', input_assets=[source_asset1, source_asset2])
def casset(b_: int, c_: float) -> str:
    return str(b_) + str(c_)


assert casset.computation
assert casset.path == ('casset',)
assert casset.computation.output_in_memory_type == str
assert set(casset.computation.deps.keys()) == {'b_', 'c_'}
assert casset.computation.deps['b_'].in_memory_type == int
assert casset.computation.deps['b_'].asset == source_asset1
assert casset.computation.deps['c_'].in_memory_type == float
assert casset.computation.deps['c_'].asset == source_asset2

COLUMNS = [Column('a', str), Column('bb', int)]


def test_computed_table_no_deps():
    @computed_table(storage_key='filesystem', columns=COLUMNS)
    def casset() -> str:
        return 'a'

    assert casset.computation
    assert casset.path == ('casset',)
    assert casset.computation.output_in_memory_type == str
    assert len(casset.computation.deps.keys()) == 0
    assert casset.columns == COLUMNS


def test_computed_table_path():
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import Column, computed_table, source_table
from pyarrow import date32, float64, string

sfo_q2_weather_sample_table = source_table(
    storage_key='filesystem',
    path=('data',),
    columns=[Column('tmpf', float64()), Column('valid_date', string())],
)


@computed_table(
    storage_key='filesystem',
    input_assets=[sfo_q2_weather_sample_table],
    columns=[Column('valid_date', date32()), Column('max_tmpf', float64())],
)
def daily_temperature_highs_table(sfo_q2_weather_sample: pd.DataFrame) -> pd.DataFrame:
    '''Computes the temperature high for each day.'''
    sfo_q2_weather_sample['valid_date'] = pd.to_datetime(sfo_q2_weather_sample['valid'])
    return sfo_q2_weather_sample.groupby('valid_date').max().rename(columns={'tmpf': 'max_tmpf'})
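# A minimal pandas sketch of the aggregation inside daily_temperature_highs_table,
# run on three made-up sample rows. The 'valid' and 'tmpf' column names match the
# source table above, but the values are assumptions for illustration; flooring to
# the day is added here so intraday timestamps collapse into one group per date.
sample = pd.DataFrame({
    'valid': ['2020-04-01 10:00', '2020-04-01 14:00', '2020-04-02 12:00'],
    'tmpf': [58.0, 64.0, 61.0],
})
sample['valid_date'] = pd.to_datetime(sample['valid']).dt.floor('D')
# groupby + max keeps the hottest reading per day; rename matches the asset's schema.
highs = sample.groupby('valid_date').max().rename(columns={'tmpf': 'max_tmpf'})
print(highs['max_tmpf'])  # 2020-04-01 -> 64.0, 2020-04-02 -> 61.0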
@computed_asset(storage_key="filesystem", input_assets=[source_asset1, source_asset2]) def casset(b_: int, c_: float) -> str: return str(b_) + str(c_) assert casset.computation assert casset.path == ("casset",) assert casset.computation.output_in_memory_type == str assert set(casset.computation.deps.keys()) == set(["b_", "c_"]) assert casset.computation.deps["b_"].in_memory_type == int assert casset.computation.deps["b_"].asset == source_asset1 assert casset.computation.deps["c_"].in_memory_type == float assert casset.computation.deps["c_"].asset == source_asset2 COLUMNS = [Column("a", str), Column("bb", int)] def test_computed_table_no_deps(): @computed_table(storage_key="filesystem", columns=COLUMNS) def casset() -> str: return "a" assert casset.computation assert casset.path == ("casset",) assert casset.computation.output_in_memory_type == str assert len(casset.computation.deps.keys()) == 0 assert casset.columns == COLUMNS def test_computed_table_path():