def test_dagster_type_collision():
    class Foo(object):
        pass

    _Foo_1 = as_dagster_type(Foo)
    with pytest.raises(
        DagsterInvalidDefinitionError,
        match='A Dagster runtime type has already been registered for the Python type',
    ):
        _Foo_2 = as_dagster_type(Foo)
def test_make_dagster_type():
    OverwriteNameTuple = as_dagster_type(collections.namedtuple('SomeNamedTuple', 'prop'))
    runtime_type = resolve_to_runtime_type(OverwriteNameTuple)
    assert runtime_type.name == 'SomeNamedTuple'
    assert OverwriteNameTuple(prop='foo').prop == 'foo'

    OverwriteNameTuple = as_dagster_type(
        collections.namedtuple('SomeNamedTuple', 'prop'), name='OverwriteName'
    )
    runtime_type = resolve_to_runtime_type(OverwriteNameTuple)
    assert runtime_type.name == 'OverwriteName'
    assert OverwriteNameTuple(prop='foo').prop == 'foo'
def get_resource_init_input_hydration_pipeline(resources_initted):
    @resource
    def resource_a(_):
        resources_initted['a'] = True
        yield 'A'

    class CustomType(str):
        pass

    @input_hydration_config(String, required_resource_keys={'a'})
    def InputHydration(context, hello):
        assert context.resources.a == 'A'
        return CustomType(hello)

    CustomDagsterType = as_dagster_type(
        CustomType, name='CustomType', input_hydration_config=InputHydration
    )

    @solid(input_defs=[InputDefinition('custom_type', CustomDagsterType)])
    def input_hydration_solid(context, custom_type):
        context.log.info(custom_type)

    @solid(output_defs=[OutputDefinition(CustomDagsterType)])
    def source_custom_type(_):
        return CustomType('from solid')

    @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
    def selective_pipeline():
        input_hydration_solid(source_custom_type())

    return selective_pipeline
def define_input_hydration_pipeline(should_require_resources):
    @resource
    def resource_a(_):
        yield 'A'

    class CustomType(str):
        pass

    @input_hydration_config(
        String, required_resource_keys={'a'} if should_require_resources else set()
    )
    def InputHydration(context, hello):
        assert context.resources.a == 'A'
        return CustomType(hello)

    CustomDagsterType = as_dagster_type(
        CustomType, name='CustomType', input_hydration_config=InputHydration
    )

    @solid(input_defs=[InputDefinition('custom_type', CustomDagsterType)])
    def input_hydration_solid(context, custom_type):
        context.log.info(custom_type)

    @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
    def input_hydration_pipeline():
        input_hydration_solid()

    return input_hydration_pipeline
def test_make_dagster_type_from_builtin():
    OrderedDict = as_dagster_type(collections.OrderedDict)
    assert OrderedDict is collections.OrderedDict
    assert OrderedDict([('foo', 'bar')]) == collections.OrderedDict([('foo', 'bar')])
    assert isinstance(resolve_to_runtime_type(OrderedDict), RuntimeType)
    assert resolve_to_runtime_type(OrderedDict).python_type is collections.OrderedDict
def test_python_built_in_output():
    class MyOrderedDict(collections.OrderedDict):
        pass

    OrderedDict = as_dagster_type(MyOrderedDict)

    @lambda_solid
    def emit_ordered_dict() -> OrderedDict:
        return OrderedDict([('foo', 'bar')])

    output_value = execute_solid(emit_ordered_dict).output_value()
    assert output_value == OrderedDict([('foo', 'bar')])
    assert isinstance(output_value, OrderedDict)
    assert isinstance(output_value, MyOrderedDict)
    assert isinstance(output_value, collections.OrderedDict)
from dagster import execute_pipeline, pipeline, as_dagster_type, lambda_solid
import pandas as pd

# Data validation: every node's source and destination data must be a PandasDataFrame.
DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
)


@lambda_solid
def Input1() -> DataFrame:
    # First node, which reads input file -> file1.csv
    r = pd.read_csv('file1.csv')
    return r


@lambda_solid
def Input2() -> DataFrame:
    # Second node, which reads input file -> file2.csv
    r2 = pd.read_csv('file2.csv')
    return r2


@lambda_solid  # Third node, which merges the input from file1 and file2
def Merge(r: DataFrame, r2: DataFrame) -> DataFrame:
    r3 = pd.concat([r, r2], axis=1)
    return r3


@lambda_solid  # Fourth node, which writes the merged output file
def Result_output(y: DataFrame) -> DataFrame:
    y3 = y
    y3.to_csv(r'merged_output.csv')
    return y3
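# A minimal sketch of wiring the nodes above into a pipeline and running it. The
# snippet imports `pipeline` and `execute_pipeline` but does not show this part, so
# the pipeline name `merge_pipeline` is hypothetical, and file1.csv / file2.csv must
# exist for the run to succeed.
@pipeline
def merge_pipeline():
    Result_output(Merge(Input1(), Input2()))


if __name__ == '__main__':
    execute_pipeline(merge_pipeline)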
# The MIT License (MIT)
# Copyright (c) 2019 Ian Buttimer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from dagster import as_dagster_type

from bitarray import bitarray

BitArray = as_dagster_type(
    bitarray,
    name='BitArray',
    description='''bitarray: efficient arrays of booleans.
    See https://pypi.org/project/bitarray/, https://github.com/ilanschnell/bitarray''',
)
    check.dict_param(file_options, 'file_options')

    if file_type == 'csv':
        path = file_options['path']
        return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return pd.read_parquet(file_options['path'])
    elif file_type == 'table':
        return pd.read_table(file_options['path'])
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type)
        )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/''',
    input_schema=dataframe_input_schema,
    output_schema=dataframe_output_schema,
    metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            EventMetadataEntry.json({'columns': list(value.columns)}, 'metadata'),
        ]
    ),
)
        [OrderedDict(sorted(x.items(), key=lambda x: x[0])) for x in csv.DictReader(fd)]
    )


@output_schema(Path)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = as_dagster_type(
    PoorMansDataFrame_, input_schema=df_input_schema, output_schema=df_output_schema
)


def get_events_of_type(events, event_type):
    return [
        event
        for event in events
        if event.is_dagster_event and event.dagster_event.event_type == event_type
    ]


def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_fn(define_passing_pipeline)
    pipeline = define_passing_pipeline()
    if file_type == 'csv':
        path = file_options['path']
        return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return pd.read_parquet(file_options['path'])
    elif file_type == 'table':
        return pd.read_csv(file_options['path'], sep='\t')
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type)
        )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    typecheck_metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ]
    ),
)
)
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
        return file_options['path']
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkDataFrameType = as_dagster_type(
    DataFrame,
    name='SparkDataFrameType',
    description='A Pyspark data frame.',
    storage_plugins={
        RunStorageMode.S3: SparkDataFrameS3StoragePlugin,
        RunStorageMode.FILESYSTEM: SparkDataFrameFilesystemStoragePlugin,
    },
    output_schema=spark_df_output_schema,
)

SqlAlchemyEngineType = as_dagster_type(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)


class SqlTableName(Stringish):
    def __init__(self):
"""Type definitions for the airline_demo.""" from collections import namedtuple import sqlalchemy from dagster import as_dagster_type from dagster.core.types.runtime_type import create_string_type AirlineDemoResources = namedtuple( 'AirlineDemoResources', ('spark', 's3', 'db_url', 'db_engine', 'db_dialect', 'redshift_s3_temp_dir', 'db_load'), ) SqlAlchemyEngineType = as_dagster_type( sqlalchemy.engine.Connectable, name='SqlAlchemyEngineType', description='A SqlAlchemy Connectable', ) SqlTableName = create_string_type('SqlTableName', description='The name of a database table') DbInfo = namedtuple('DbInfo', 'engine url jdbc_url dialect load_table host db_name')
from dagster import execute_pipeline, pipeline, as_dagster_type, lambda_solid, dagster_type
from dagit import *
from graphql import *
from dagster_graphql import *
import pandas as pd

# Data validation: every node's source and destination data must be a PandasDataFrame.
DataFrame = as_dagster_type(
    pd.pandas.core.frame.DataFrame,
    name='PandasDataFrame',
)


@lambda_solid  # Defines a node in the workflow
def Input1() -> DataFrame:
    # First node, which reads input file -> file1.csv
    r = pd.read_csv('file1.csv')
    return r


@lambda_solid
def Input2() -> DataFrame:
    # Second node, which reads input file -> file2.csv
    r2 = pd.read_csv('file2.csv')
    return r2


@lambda_solid  # Third node, which merges the input from file1 and file2
def Merge(r: DataFrame, r2: DataFrame) -> DataFrame:
    r3 = pd.concat([r, r2], axis=1)
    return r3
)


# Placeholder class to cause the unregistered notebook solid to fail -- custom serialization
# strategies require repository registration
class ComplexSerializationStrategy(SerializationStrategy):  # pylint: disable=no-init
    def serialize(self, value, write_file_obj):
        pass  # pragma: nocover

    def deserialize(self, read_file_obj):
        pass  # pragma: nocover


complex_serialization_strategy = ComplexSerializationStrategy('complex')

ComplexDagsterType = as_dagster_type(
    pd.DataFrame, serialization_strategy=complex_serialization_strategy
)


@solid('resource_solid', required_resource_keys={'list'})
def resource_solid(context):
    context.resources.list.append('Hello, solid!')
    return True


@solid_definition
def hello_world_resource_solid():
    return dagstermill.define_dagstermill_solid(
        'hello_world_resource',
        nb_test_path('hello_world_resource'),
        input_defs=[InputDefinition('nonce')],
        required_resource_keys={'list'},
                'header': Field(Bool, is_optional=True),
            })
        )
    })
)
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.spark_session.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(
    RDD, 'SparkRDD', input_hydration_config=load_rdd, output_materialization_config=write_rdd
)


@output_selector_schema(
    Selector({
        'csv': Field(
            Dict({
                'path': Field(Path),
                'sep': Field(String, is_optional=True),
                'header': Field(Bool, is_optional=True),
            })
        )
    })
)
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
            )
        }
    )
)
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(RDD, 'SparkRDD', input_schema=load_rdd, output_schema=write_rdd)


@resource(config_field=Field(Dict({'spark_conf': spark_config()})))
def spark_session_resource(init_context):
    builder = SparkSession.builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        spark.stop()
# pylint: disable=no-value-for-parameter
import collections

from dagster import Any, Field, as_dagster_type, pipeline, solid

Counter = as_dagster_type(collections.Counter)


@solid(config_field=Field(Any))
def multiply_the_word(context, word: str) -> str:
    return word * context.solid_config['factor']


@solid
def count_letters(_, word: str) -> Counter:
    return collections.Counter(word)


@pipeline
def configuration_schema_pipeline():
    return count_letters(multiply_the_word())
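# A minimal usage sketch for the pipeline above. It assumes the 0.6/0.7-era
# `environment_dict` format that these snippets appear to target; the factor and word
# values are hypothetical.
from dagster import execute_pipeline

if __name__ == '__main__':
    execute_pipeline(
        configuration_schema_pipeline,
        environment_dict={
            'solids': {
                'multiply_the_word': {
                    'config': {'factor': 2},
                    'inputs': {'word': {'value': 'hello'}},
                }
            }
        },
    )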
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ],
    )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    type_check=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description
        )

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
    Field,
    Output,
    OutputDefinition,
    String,
    as_dagster_type,
    execute_pipeline,
    pipeline,
    solid,
)


class _DataFrame(list):
    pass


DataFrame = as_dagster_type(_DataFrame, name='DataFrame')


@solid
def read_csv(context, csv_path):
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return DataFrame(lines)


@solid(
    config={
        'process_hot': Field(Bool, is_optional=True, default_value=True),
        'process_cold': Field(Bool, is_optional=True, default_value=True),
        ]
    )


@output_materialization_config(Path)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = as_dagster_type(
    PoorMansDataFrame_,
    input_hydration_config=df_input_schema,
    output_materialization_config=df_output_schema,
)


def define_test_subprocess_context(instance):
    return define_subprocess_context_for_file(__file__, "define_repository", instance)


def define_test_context(instance=None):
    return define_context_for_file(__file__, "define_repository", instance)


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
import sqlalchemy
from pyspark.sql import DataFrame

from dagster import dagster_type, as_dagster_type
from dagster.core.types.runtime import PythonObjectType, Stringish
from dagster.utils import safe_isfile

AirlineDemoResources = namedtuple(
    'AirlineDemoResources',
    ('spark', 's3', 'db_url', 'db_engine', 'db_dialect', 'redshift_s3_temp_dir', 'db_load'),
)

SparkDataFrameType = as_dagster_type(
    DataFrame, name='SparkDataFrameType', description='A Pyspark data frame.'
)

SqlAlchemyEngineType = as_dagster_type(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)


class SqlTableName(Stringish):
    def __init__(self):
        super(SqlTableName, self).__init__(description='The name of a database table')