def test_make_dagster_type():
    SomeNamedTuple = collections.namedtuple("SomeNamedTuple", "prop")
    DagsterSomeNamedTuple = PythonObjectDagsterType(SomeNamedTuple)
    dagster_type = resolve_dagster_type(DagsterSomeNamedTuple)
    assert dagster_type.name == "SomeNamedTuple"
    assert SomeNamedTuple(prop="foo").prop == "foo"

    DagsterNewNameNamedTuple = PythonObjectDagsterType(SomeNamedTuple, name="OverwriteName")
    dagster_type = resolve_dagster_type(DagsterNewNameNamedTuple)
    assert dagster_type.name == "OverwriteName"
def test_even_type_loader():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @dagster_type_loader(int)
    def load_even_type(_, cfg):
        return EvenType(cfg)

    EvenDagsterType = PythonObjectDagsterType(EvenType, loader=load_even_type)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    yaml_doc = """
    solids:
        double_even:
            inputs:
                even_num: 2
    """
    assert execute_solid(double_even, run_config=yaml.safe_load(yaml_doc)).success

    assert execute_solid(
        double_even, run_config={"solids": {"double_even": {"inputs": {"even_num": 2}}}}
    ).success

    # Loading an odd number trips the assertion in EvenType's constructor
    with pytest.raises(AssertionError):
        execute_solid(
            double_even, run_config={"solids": {"double_even": {"inputs": {"even_num": 3}}}}
        )
def test_even_type_materialization_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @dagster_type_materializer({"path": str})
    def save_to_file_materialization(_, cfg, value):
        path = cfg["path"]
        with open(path, "w") as ff:
            ff.write(str(value))
        return AssetMaterialization(
            "path", "Wrote out value to {path}".format(path=path), metadata={"path": path}
        )

    EvenDagsterType = PythonObjectDagsterType(EvenType, materializer=save_to_file_materialization)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    with safe_tempfile_path() as path:
        yaml_doc = """
        solids:
            double_even:
                outputs:
                    - result:
                          path: {path}
        """
        solid_result = execute_solid(
            double_even,
            input_values={"even_num": EvenType(2)},
            run_config=yaml.safe_load(yaml_doc.format(path=path)),
        )
        assert solid_result.success
def test_even_type_hydration_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @input_hydration_config(int)
    def hydrate_even_type(_, cfg):
        return EvenType(cfg)

    EvenDagsterType = PythonObjectDagsterType(EvenType, input_hydration_config=hydrate_even_type)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    yaml_doc = '''
    solids:
        double_even:
            inputs:
                even_num: 2
    '''
    assert execute_solid(double_even, run_config=yaml.safe_load(yaml_doc)).success

    assert execute_solid(
        double_even, run_config={'solids': {'double_even': {'inputs': {'even_num': 2}}}}
    ).success

    # Loading an odd number trips the assertion in EvenType's constructor
    with pytest.raises(AssertionError):
        execute_solid(
            double_even, run_config={'solids': {'double_even': {'inputs': {'even_num': 3}}}}
        )
def test_make_usable_as_dagster_type():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")

    make_python_type_usable_as_dagster_type(EvenType, EvenDagsterType)

    @solid
    def double_even(_, even_num: EvenType) -> EvenType:
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
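    # Hedged addition (not in the original test): with the mapping registered, resolving the
    # bare Python class should yield the registered Dagster type. ``resolve_dagster_type`` is
    # the same helper used in test_make_dagster_type above.
    assert resolve_dagster_type(EvenType).name == "EvenDagsterType"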
def test_mypy_compliance():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    if typing.TYPE_CHECKING:
        EvenDagsterType = EvenType
    else:
        EvenDagsterType = PythonObjectDagsterType(EvenType)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
def test_python_object_dagster_type():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        # These type annotations are a shorthand for constructing InputDefinitions
        # and OutputDefinitions, and are not mypy compliant
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success

    # EvenType(3) trips the assertion in EvenType's constructor
    with pytest.raises(AssertionError):
        execute_solid(double_even, input_values={"even_num": EvenType(3)})
def test_validate_inputs():
    @root_input_manager
    def my_loader(_):
        return 5

    @solid(
        input_defs=[
            InputDefinition(
                "input1", dagster_type=PythonObjectDagsterType(int), root_manager_key="my_loader"
            )
        ]
    )
    def my_solid(_, input1):
        return input1

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"my_loader": my_loader})])
    def my_pipeline():
        my_solid()

    execute_pipeline(my_pipeline)
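    # Hedged follow-up sketch (assumes the legacy ``result_for_solid``/``output_value`` result
    # APIs): the root input manager returns 5, which passes the PythonObjectDagsterType(int)
    # isinstance check, and my_solid passes the value through unchanged.
    result = execute_pipeline(my_pipeline)
    assert result.success
    assert result.result_for_solid("my_solid").output_value() == 5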
def test_even_type_materialization_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @output_materialization_config({'path': str})
    def save_to_file_materialization(_, cfg, value):
        path = cfg['path']
        with open(path, 'w') as ff:
            ff.write(str(value))
        return Materialization(
            'path',
            'Wrote out value to {path}'.format(path=path),
            metadata_entries=[EventMetadataEntry.text('path', path)],
        )

    EvenDagsterType = PythonObjectDagsterType(
        EvenType, output_materialization_config=save_to_file_materialization
    )

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    with safe_tempfile_path() as path:
        yaml_doc = '''
        solids:
            double_even:
                outputs:
                    - result:
                          path: {path}
        '''
        solid_result = execute_solid(
            double_even,
            input_values={'even_num': EvenType(2)},
            run_config=yaml.safe_load(yaml_doc.format(path=path)),
        )
        assert solid_result.success
from dagster import PythonObjectDagsterType, solid

# start_object_type
class EvenType:
    def __init__(self, num):
        assert num % 2 == 0
        self.num = num


EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")
# end_object_type


# start_use_object_type
@solid
def double_even(even_num: EvenDagsterType) -> EvenDagsterType:
    return EvenType(even_num.num * 2)


# end_use_object_type
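# A hedged usage sketch, not part of the docs snippet above: ``execute_solid`` is the legacy
# single-solid test helper, and ``DagsterTypeCheckDidNotPass`` is the error raised when an
# isinstance-based type check fails with raise_on_error left at its default.
import pytest
from dagster import DagsterTypeCheckDidNotPass, execute_solid


def test_even_dagster_type_check():
    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
    # A bare int is not an EvenType instance, so the input type check fails.
    with pytest.raises(DagsterTypeCheckDidNotPass):
        execute_solid(double_even, input_values={"even_num": 4})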
]


@dagster_type_materializer(String)
def df_output_schema(_context, path, value):
    with open(path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return AssetMaterialization.file(path)


PoorMansDataFrame = PythonObjectDagsterType(
    python_type=list,
    name="PoorMansDataFrame",
    loader=df_input_schema,
    materializer=df_output_schema,
)


@contextmanager
def define_test_out_of_process_context(instance):
    check.inst_param(instance, "instance", DagsterInstance)
    with define_out_of_process_context(__file__, main_repo_name(), instance) as context:
        yield context


def create_main_recon_repo():
    return ReconstructableRepository.for_file(__file__, main_repo_name())
from dagster import (
    Bool,
    Field,
    Int,
    PythonObjectDagsterType,
    String,
    composite_solid,
    execute_pipeline,
    pipeline,
    solid,
)

if typing.TYPE_CHECKING:
    DataFrame = list
else:
    DataFrame = PythonObjectDagsterType(list, name="DataFrame")  # type: Any


@solid(
    config_schema={
        "delimiter": Field(
            String,
            default_value=",",
            is_required=False,
            description=("A one-character string used to separate fields."),
        ),
        "doublequote": Field(
            Bool,
            default_value=False,
class SparkDataFrameFilesystemStoragePlugin(TypeStoragePlugin):  # pylint: disable=no-init
    @classmethod
    def compatible_with_storage_def(cls, system_storage_def):
        return system_storage_def is fs_system_storage

    @classmethod
    def set_object(cls, intermediate_store, obj, _context, _dagster_type, paths):
        target_path = os.path.join(intermediate_store.root, *paths)
        obj.write.parquet(intermediate_store.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_object(cls, intermediate_store, context, _dagster_type, paths):
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_store.root, *paths)
        )

    @classmethod
    def required_resource_keys(cls):
        return frozenset({'pyspark'})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name='PySparkDataFrame',
    description='A PySpark data frame.',
    auto_plugins=[SparkDataFrameS3StoragePlugin, SparkDataFrameFilesystemStoragePlugin],
    output_materialization_config=spark_df_output_schema,
)
"""Type definitions for the airline_demo.""" from collections import namedtuple import sqlalchemy from dagster import PythonObjectDagsterType from dagster.core.types.dagster_type import create_string_type AirlineDemoResources = namedtuple( 'AirlineDemoResources', ('spark', 's3', 'db_url', 'db_engine', 'db_dialect', 'redshift_s3_temp_dir', 'db_load'), ) SqlAlchemyEngineType = PythonObjectDagsterType( sqlalchemy.engine.Connectable, name='SqlAlchemyEngineType', description='A SqlAlchemy Connectable', ) SqlTableName = create_string_type('SqlTableName', description='The name of a database table')
from typing import Any, List, Optional, TYPE_CHECKING

from azmeta.access.specifications import AzureComputeSpecifications, load_compute_specifications
from dagster import (
    Array,
    Field,
    PythonObjectDagsterType,
    SolidExecutionContext,
    String,
    make_python_type_usable_as_dagster_type,
    solid,
)

AzureComputeSpecificationsDagsterType = PythonObjectDagsterType(AzureComputeSpecifications)
make_python_type_usable_as_dagster_type(
    AzureComputeSpecifications, AzureComputeSpecificationsDagsterType
)


@solid(
    config_schema={
        'subscription': Field(
            String, is_required=False, description='The subscription ID to list SKUs from.'
        )
    }
)
def load_compute_specs(context: SolidExecutionContext) -> AzureComputeSpecifications:
    return load_compute_specifications(logger=context.log)
PositiveNumber = DagsterType(
    name="PositiveNumber",
    description="Only take in numbers greater than zero",
    type_check_fn=positive_num_check,
    loader=positive_num_loader,
)


# How to use PythonObjectDagsterType
class PercentType:
    def __init__(self, number):
        self.value = number * 100


PercentDagsterType = PythonObjectDagsterType(PercentType, name="PercentDagsterType")


# Explicit input/output definitions are used here because annotating parameters with
# DagsterType instances is not mypy compliant; only plain Python types are.
@solid(
    input_defs=[
        InputDefinition("num1", PositiveNumber),
        InputDefinition("num2", PositiveNumber),
    ],
    output_defs=[OutputDefinition(PercentDagsterType)],
)
def add_two_nums(_context, num1, num2):
    adding = num1 + num2  # e.g. 2 + 3 => 5
    add_percent_type = PercentType(adding)
    yield ExpectationResult(
def create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    tags=None,
    description=None,
):
    input_tables = check.opt_list_param(
        input_tables, 'input_tables', of_type=LakehouseTableInputDefinition
    )
    other_input_defs = check.opt_list_param(
        other_input_defs, 'other_input_defs', of_type=InputDefinition
    )
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    table_type = PythonObjectDagsterType(
        python_type=ITableHandle, name=name, description=description
    )

    table_input_dict = {input_table.name: input_table for input_table in input_tables}
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs, ['context'])

    def _compute(context, inputs):
        '''Workhorse function of lakehouse.

        The inputs are objects that inherit from ITableHandle. This compute_fn:

        (1) Iterates over the input tables and asks the lakehouse resource to hydrate their
            contents, or a representation of their contents (e.g. a pyspark dataframe), into
            memory for computation.
        (2) Passes those values into the lakehouse table function, which does the actual work.
        (3) Passes the output of the lakehouse function to the lakehouse materialize function.
        (4) Yields a materialization if the lakehouse function returned one.

        There is an argument that the hydrate and materialize functions should return a stream
        of events, but that started to feel like reimplementing what should be a framework
        feature.
        '''
        check.inst_param(context.resources.lakehouse, 'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name
                )
            )
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[input_name] = context.resources.lakehouse.hydrate(
                    context,
                    input_type,
                    table_def_of_type(context.pipeline_def, input_type.name).tags,
                    table_handle,
                    tags,
                )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables, **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type, tags, computed_output
        )

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        tags=tags,
        description=description,
    )
    def compatible_with_storage_def(cls, system_storage_def):
        return (
            system_storage_def is fs_system_storage
            or system_storage_def is fs_intermediate_storage
        )

    @classmethod
    def set_object(cls, intermediate_store, obj, _context, _dagster_type, paths):
        target_path = os.path.join(intermediate_store.root, *paths)
        obj.write.parquet(intermediate_store.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_object(cls, intermediate_store, context, _dagster_type, paths):
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_store.root, *paths)
        )

    @classmethod
    def required_resource_keys(cls):
        return frozenset({'pyspark'})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name='PySparkDataFrame',
    description='A PySpark data frame.',
    auto_plugins=[SparkDataFrameS3StoragePlugin, SparkDataFrameFilesystemStoragePlugin],
    materializer=spark_df_materializer,
)
from dagster import (
    Bool,
    Field,
    Int,
    PythonObjectDagsterType,
    String,
    composite_solid,
    execute_pipeline,
    pipeline,
    solid,
)

if typing.TYPE_CHECKING:
    DataFrame = list
else:
    DataFrame = PythonObjectDagsterType(list, name='DataFrame')  # type: Any


@solid(
    config={
        'delimiter': Field(
            String,
            default_value=',',
            is_required=False,
            description=('A one-character string used to separate fields.'),
        ),
        'doublequote': Field(
            Bool,
            default_value=False,
if file_type == "csv": return spark_read.csv(path, **dict_without_keys(file_options, "path")) elif file_type == "parquet": return spark_read.parquet(path, **dict_without_keys(file_options, "path")) elif file_type == "json": return spark_read.json(path, **dict_without_keys(file_options, "path")) elif file_type == "jdbc": return spark_read.jdbc(**file_options) elif file_type == "orc": return spark_read.orc(path, **dict_without_keys(file_options, "path")) elif file_type == "table": return spark_read.table(**file_options) elif file_type == "text": return spark_read.text(path, **dict_without_keys(file_options, "path")) elif file_type == "other": return spark_read.load(**file_options) else: raise DagsterInvariantViolationError( "Unsupported file_type {file_type}".format(file_type=file_type)) DataFrame = PythonObjectDagsterType( python_type=NativeSparkDataFrame, name="PySparkDataFrame", description="A PySpark data frame.", loader=dataframe_loader, materializer=dataframe_materializer, )
]


@output_materialization_config(String)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = PythonObjectDagsterType(
    python_type=list,
    name='PoorMansDataFrame',
    input_hydration_config=df_input_schema,
    output_materialization_config=df_output_schema,
)


def define_test_subprocess_context(instance):
    check.inst_param(instance, 'instance', DagsterInstance)
    return define_subprocess_context_for_file(__file__, "test_repo", instance)


def define_test_context(instance):
    check.inst_param(instance, 'instance', DagsterInstance)
    return define_context_for_file(__file__, "test_repo", instance)


def create_main_recon_repo():
    required_resource_keys={'spark'},
)
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.spark_session.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = PythonObjectDagsterType(
    python_type=RDD,
    name='SparkRDD',
    input_hydration_config=load_rdd,
    output_materialization_config=write_rdd,
)


@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': Field(Path),
                'sep': Field(String, is_required=False),
                'header': Field(Bool, is_required=False),
            },
        }
    )
)
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(
            file_options['path'],
        target_path = os.path.join(intermediate_storage.root, *paths)
        value.write.parquet(intermediate_storage.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_intermediate_object(
        cls, intermediate_storage, context, _dagster_type, step_output_handle
    ):
        paths = ["intermediates", step_output_handle.step_key, step_output_handle.output_name]
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_storage.root, *paths)
        )

    @classmethod
    def required_resource_keys(cls):
        return frozenset({"pyspark"})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name="PySparkDataFrame",
    description="A PySpark data frame.",
    auto_plugins=[
        SparkDataFrameS3StoragePlugin,
        SparkDataFrameADLS2StoragePlugin,
        SparkDataFrameFilesystemStoragePlugin,
    ],
    loader=dataframe_loader,
    materializer=dataframe_materializer,
)
from datetime import date

import pandas as pd

from dagster import (
    Field,
    Int,
    PythonObjectDagsterType,
    Selector,
    input_hydration_config,
    make_python_type_usable_as_dagster_type,
)

make_python_type_usable_as_dagster_type(pd.DataFrame, PythonObjectDagsterType(pd.DataFrame))


@input_hydration_config(
    Selector({"date": {"year": Field(Int), "month": Field(Int), "day": Field(Int)}})
)
def parse_date(context, selector):
    date_selector = selector["date"]
    return date(date_selector["year"], date_selector["month"], date_selector["day"])


make_python_type_usable_as_dagster_type(
    date, PythonObjectDagsterType(date, input_hydration_config=parse_date)
)
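# Hedged usage sketch: with the hydration config registered on ``date`` above, a solid
# annotated with the plain ``date`` type can be fed from run config through the selector.
# The solid name ``echo_date`` and the use of the legacy ``execute_solid`` helper are
# illustrative assumptions, not part of the original module.
from dagster import execute_solid, solid


@solid
def echo_date(_, run_date: date) -> str:
    return run_date.isoformat()


def test_parse_date_from_run_config():
    result = execute_solid(
        echo_date,
        run_config={
            "solids": {
                "echo_date": {
                    "inputs": {"run_date": {"date": {"year": 2020, "month": 1, "day": 1}}}
                }
            }
        },
    )
    assert result.success
    assert result.output_value() == "2020-01-01"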