def make_simple_lakehouse():
    dev_mode = ModeDefinition(
        name="dev",
        resource_defs={
            "pyspark": pyspark_resource,
            "filesystem": local_file_system_storage.configured({"root": "."}),
        },
    )
    prod_mode = ModeDefinition(
        name="prod",
        resource_defs={
            "pyspark": pyspark_resource,
            "filesystem": s3_storage.configured(
                {"bucket": "some_bucket", "prefix": "some_prefix"}
            ),
        },
    )
    return Lakehouse(
        mode_defs=[dev_mode, prod_mode],
        in_memory_type_resource_keys={SparkDF: ["pyspark"]},
    )
def make_simple_lakehouse():
    dev_mode = ModeDefinition(
        name='dev',
        resource_defs={
            'pyspark': pyspark_resource,
            'filesystem': local_file_system_storage,
        },
    )
    dev = PresetDefinition(
        name='dev',
        mode='dev',
        run_config={'resources': {'filesystem': {'config': {'root': '.'}}}},
        solid_selection=None,
    )
    prod_mode = ModeDefinition(
        name='prod',
        resource_defs={
            'pyspark': pyspark_resource,
            'filesystem': s3_storage,
        },
    )
    prod = PresetDefinition(
        name='prod',
        mode='prod',
        # The prod filesystem is S3-backed, so its run config must satisfy the
        # s3_storage config schema (bucket/prefix), not the local-root schema.
        run_config={
            'resources': {
                'filesystem': {
                    'config': {'bucket': 'some_bucket', 'prefix': 'some_prefix'}
                }
            }
        },
        solid_selection=None,
    )
    return Lakehouse(
        preset_defs=[dev, prod],
        mode_defs=[dev_mode, prod_mode],
        in_memory_type_resource_keys={SparkDF: ['pyspark']},
        type_storage_policies=[
            SparkDFLocalFileSystemPolicy,
            PandasDFLocalFileSystemPolicy,
            SparkDFS3Policy,
            PandasDFS3Policy,
        ],
    )
def basic_lakehouse_and_storages():
    storage1 = DictStorage()
    storage2 = DictStorage()

    @resource()
    def some_storage(_):
        return storage1

    @resource()
    def some_other_storage(_):
        return storage2

    dev_mode = ModeDefinition(
        name="dev",
        resource_defs={"storage1": some_storage, "storage2": some_other_storage},
    )
    dev_preset = PresetDefinition(name="dev", mode="dev", run_config={}, solid_selection=None)
    return (
        Lakehouse(mode_defs=[dev_mode], preset_defs=[dev_preset]),
        storage1,
        storage2,
    )
def make_multi_type_lakehouse():
    dev_mode = ModeDefinition(
        resource_defs={
            "pyspark": pyspark_resource,
            "default_storage": local_file_system_storage.configured({"root": "."}),
        },
    )
    return Lakehouse(
        mode_defs=[dev_mode],
        in_memory_type_resource_keys={SparkDF: ["pyspark"]},
    )
def make_simple_lakehouse():
    dev_mode = ModeDefinition(
        name='dev',
        resource_defs={
            'filesystem': pandas_df_local_filesystem_storage.configured({'root': '.'}),
        },
    )
    return Lakehouse(mode_defs=[dev_mode])
def basic_lakehouse_and_storages():
    class DictStorage(AssetStorage):
        """In-memory AssetStorage that keeps saved objects keyed by path tuple."""

        def __init__(self):
            self.the_dict = {}

        def save(self, obj, path, _resources):
            self.the_dict[path] = obj

        def load(self, _python_type, path, _resources):
            return self.the_dict[path]

    storage1 = DictStorage()
    storage2 = DictStorage()

    @asset_storage()
    def some_storage(_):
        return storage1

    @asset_storage()
    def some_other_storage(_):
        return storage2

    dev_mode = ModeDefinition(
        name="dev",
        resource_defs={"storage1": some_storage, "storage2": some_other_storage},
    )
    dev_preset = PresetDefinition(name="dev", mode="dev", run_config={}, solid_selection=None)
    return (
        Lakehouse(mode_defs=[dev_mode], preset_defs=[dev_preset]),
        storage1,
        storage2,
    )
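# A minimal round-trip sketch for the fixture above; the path tuple and
# payload are illustrative, not part of the original fixture.
house, storage1, storage2 = basic_lakehouse_and_storages()
storage1.save({"rows": 3}, ("db", "table"), None)
assert storage1.load(dict, ("db", "table"), None) == {"rows": 3}
assert storage2.the_dict == {}  # the two storages are independent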
from dagster import ModeDefinition, resource
from lakehouse import Lakehouse, computed_asset
from lakehouse_tests.conftest import DictStorage


@computed_asset()
def asset1():
    pass


@computed_asset(input_assets=[asset1])
def asset2(_):
    pass


@resource()
def a_storage(_):
    return DictStorage()


lakehouse_def = Lakehouse(
    mode_defs=[ModeDefinition(name="dev", resource_defs={"default_storage": a_storage})],
    assets=[asset1, asset2],
)
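# The same pattern appears to extend to assets with multiple upstreams: each
# entry in input_assets is passed to the compute function as an argument, as
# asset2's single `_` parameter suggests. asset3 is an illustrative addition,
# not part of the original module.
@computed_asset(input_assets=[asset1, asset2])
def asset3(_asset1_value, _asset2_value):
    pass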
def basic_lakehouse_and_storages():
    class DictStorage:
        def __init__(self):
            self.the_dict = {}

    storage1 = DictStorage()
    storage2 = DictStorage()

    @resource
    def some_storage(_):
        return storage1

    @resource
    def some_other_storage(_):
        return storage2

    dev_mode = ModeDefinition(
        name='dev',
        resource_defs={
            'storage1': some_storage,
            'storage2': some_other_storage,
        },
    )
    dev_preset = PresetDefinition(
        name='dev',
        mode='dev',
        run_config={},
        solid_selection=None,
    )

    # Two policies share the same in-memory type (int) but bind to different
    # storage definitions, so an asset's storage choice decides where it lands.
    class IntSomeStoragePolicy(TypeStoragePolicy):
        @classmethod
        def in_memory_type(cls):
            return int

        @classmethod
        def storage_definition(cls):
            return some_storage

        @classmethod
        def save(cls, obj, storage, path, _resources):
            storage.the_dict[path] = obj

        @classmethod
        def load(cls, storage, path, _resources):
            return storage.the_dict[path]

    class IntSomeOtherStoragePolicy(TypeStoragePolicy):
        @classmethod
        def in_memory_type(cls):
            return int

        @classmethod
        def storage_definition(cls):
            return some_other_storage

        @classmethod
        def save(cls, obj, storage, path, _resources):
            storage.the_dict[path] = obj

        @classmethod
        def load(cls, storage, path, _resources):
            return storage.the_dict[path]

    return (
        Lakehouse(
            mode_defs=[dev_mode],
            preset_defs=[dev_preset],
            type_storage_policies=[IntSomeStoragePolicy, IntSomeOtherStoragePolicy],
        ),
        storage1,
        storage2,
    )
import os
from typing import Tuple

import pandas as pd

from dagster import ModeDefinition, StringSource, resource
from lakehouse import AssetStorage, Lakehouse


class LocalFileSystemStorage(AssetStorage):
    def __init__(self, root):
        self._root = root

    def _get_fs_path(self, path: Tuple[str, ...]) -> str:
        rpath = os.path.join(self._root, *path) + ".csv"
        return os.path.abspath(rpath)

    def save(self, obj: pd.DataFrame, path: Tuple[str, ...], _resources) -> None:
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(path)
        obj.to_csv(fpath)

    def load(self, _python_type, path: Tuple[str, ...], _resources):
        """This reads a dataframe from a CSV."""
        fpath = self._get_fs_path(path)
        return pd.read_csv(fpath)


@resource(config_schema={"root": StringSource})
def local_fs_storage(init_context):
    return LocalFileSystemStorage(init_context.resource_config["root"])


simple_lakehouse = Lakehouse(
    mode_defs=[
        ModeDefinition(
            resource_defs={"default_storage": local_fs_storage.configured({"root": "."})},
        )
    ]
)
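# A quick round-trip sketch for LocalFileSystemStorage as defined above; the
# path tuple and frame contents are illustrative. Intermediate directories in
# the path must already exist, since save() does not create them.
storage = LocalFileSystemStorage(root=".")
df = pd.DataFrame({"x": [1, 2, 3]})
storage.save(df, ("table_a",), None)  # writes ./table_a.csv
loaded = storage.load(pd.DataFrame, ("table_a",), None)
assert list(loaded["x"]) == [1, 2, 3]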