def define_test_type_pipeline():
    return PipelineDefinition(
        name='test_type_pipeline',
        solids=[
            define_solid_for_test_type('int_config', Int),
            define_solid_for_test_type('list_of_int_config', List(Int)),
            define_solid_for_test_type('nullable_list_of_int_config', Nullable(List(Int))),
            define_solid_for_test_type('list_of_nullable_int_config', List(Nullable(Int))),
            define_solid_for_test_type(
                'nullable_list_of_nullable_int_config', Nullable(List(Nullable(Int)))
            ),
            define_solid_for_test_type(
                'simple_dict',
                Dict({'int_field': Field(Int), 'string_field': Field(String)}),
            ),
            define_solid_for_test_type(
                'dict_with_optional_field',
                Dict(
                    {
                        'nullable_int_field': Field(Nullable(Int)),
                        'optional_int_field': Field(Int, is_optional=True),
                        'string_list_field': Field(List(String)),
                    }
                ),
            ),
            define_solid_for_test_type(
                'nested_dict', Dict({'nested': Field(Dict({'int_field': Field(Int)}))})
            ),
        ],
    )
def test_nullable_list():
    list_of_ints = List(Int)
    assert not eval_config_value_from_dagster_type(list_of_ints, None).success
    assert eval_config_value_from_dagster_type(list_of_ints, []).success
    assert not eval_config_value_from_dagster_type(list_of_ints, [None]).success
    assert eval_config_value_from_dagster_type(list_of_ints, [1]).success

    nullable_list_of_ints = Nullable(List(Int))
    assert eval_config_value_from_dagster_type(nullable_list_of_ints, None).success
    assert eval_config_value_from_dagster_type(nullable_list_of_ints, []).success
    assert not eval_config_value_from_dagster_type(nullable_list_of_ints, [None]).success
    assert eval_config_value_from_dagster_type(nullable_list_of_ints, [1]).success

    list_of_nullable_ints = List(Nullable(Int))
    assert not eval_config_value_from_dagster_type(list_of_nullable_ints, None).success
    assert eval_config_value_from_dagster_type(list_of_nullable_ints, []).success
    assert eval_config_value_from_dagster_type(list_of_nullable_ints, [None]).success
    assert eval_config_value_from_dagster_type(list_of_nullable_ints, [1]).success

    nullable_list_of_nullable_ints = Nullable(List(Nullable(Int)))
    assert eval_config_value_from_dagster_type(nullable_list_of_nullable_ints, None).success
    assert eval_config_value_from_dagster_type(nullable_list_of_nullable_ints, []).success
    assert eval_config_value_from_dagster_type(nullable_list_of_nullable_ints, [None]).success
    assert eval_config_value_from_dagster_type(nullable_list_of_nullable_ints, [1]).success
def test_file_system_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
    run_id = str(uuid.uuid4())

    intermediate_store = FileSystemIntermediateStore(run_id=run_id)
    assert intermediate_store.root == os.path.join(
        seven.get_system_temp_directory(), 'dagster', 'runs', run_id, 'files'
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(
                ['foo', 'bar'],
                context,
                resolve_to_runtime_type(List(LowercaseString)).inst(),
                ['list'],
            )
            assert intermediate_store.has_object(context, ['list'])
            assert intermediate_store.get_object(
                context, resolve_to_runtime_type(List(Bool)).inst(), ['list']
            ) == ['foo', 'bar']
        finally:
            try:
                shutil.rmtree(intermediate_store.root)
            except seven.FileNotFoundError:
                pass
def test_config_double_list_double_error():
    nested_lists = Dict(
        fields={'nested_list_one': Field(List(Int)), 'nested_list_two': Field(List(String))}
    )

    error_value = {'nested_list_one': 'kjdfkdj', 'nested_list_two': ['bar', 2]}
    error_result = eval_config_value_from_dagster_type(nested_lists, error_value)
    assert not error_result.success
    assert len(error_result.errors) == 2
def test_display_name():
    int_runtime = resolve_to_runtime_type(Int)
    assert int_runtime.display_name == 'Int'

    list_int_runtime = resolve_to_runtime_type(List(Int))
    assert list_int_runtime.display_name == '[Int]'

    list_list_int_runtime = resolve_to_runtime_type(List(List(Int)))
    assert list_list_int_runtime.display_name == '[[Int]]'

    list_nullable_int_runtime = resolve_to_runtime_type(List(Nullable(Int)))
    assert list_nullable_int_runtime.display_name == '[Int?]'
def test_inner_types():
    assert resolve_to_runtime_type(Int).inner_types == []

    list_int_runtime = resolve_to_runtime_type(List(Int))
    assert inner_type_key_set(list_int_runtime) == set(['Int'])

    list_list_int_runtime = resolve_to_runtime_type(List(List(Int)))
    assert inner_type_key_set(list_list_int_runtime) == set(['Int', 'List.Int'])

    list_nullable_int_runtime = resolve_to_runtime_type(List(Nullable(Int)))
    assert inner_type_key_set(list_nullable_int_runtime) == set(['Int', 'Nullable.Int'])
def test_config_double_list():
    nested_lists = Dict(
        {'nested_list_one': Field(List(Int)), 'nested_list_two': Field(List(String))}
    )

    value = {'nested_list_one': [1, 2, 3], 'nested_list_two': ['foo', 'bar']}
    result = eval_config_value_from_dagster_type(nested_lists, value)
    assert result.success
    assert result.value == value

    error_value = {'nested_list_one': 'kjdfkdj', 'nested_list_two': ['bar']}
    error_result = eval_config_value_from_dagster_type(nested_lists, error_value)
    assert not error_result.success
def test_item_error_list_path():
    called = {}

    @solid(config_field=Field(List(Int)))
    def required_list_int_solid(context):
        assert context.solid_config == [1, 2]
        called['yup'] = True

    pipeline_def = PipelineDefinition(name='list_path', solids=[required_list_int_solid])

    with pytest.raises(PipelineConfigEvaluationError) as pe_info:
        execute_pipeline(
            pipeline_def,
            environment_dict={'solids': {'required_list_int_solid': {'config': [1, 'nope']}}},
        )

    pe = pe_info.value
    assert len(pe.errors) == 1
    rtm = pe.errors[0]
    assert rtm.reason == DagsterEvaluationErrorReason.RUNTIME_TYPE_MISMATCH

    assert 'Type failure at path "root:solids:required_list_int_solid:config[1]"' in str(pe)
def define_more_complicated_nested_config():
    return PipelineDefinition(
        name='more_complicated_nested_config',
        solids=[
            SolidDefinition(
                name='a_solid_with_multilayered_config',
                inputs=[],
                outputs=[],
                transform_fn=lambda *_args: None,
                config_field=Field(
                    Dict(
                        {
                            'field_one': Field(String),
                            'field_two': Field(String, is_optional=True),
                            'field_three': Field(
                                String, is_optional=True, default_value='some_value'
                            ),
                            'nested_field': Field(
                                Dict(
                                    {
                                        'field_four_str': Field(String),
                                        'field_five_int': Field(Int),
                                        'field_six_nullable_int_list': Field(
                                            List(Nullable(Int)), is_optional=True
                                        ),
                                    }
                                )
                            ),
                        }
                    )
                ),
            )
        ],
    )
def test_solid_list_config():
    value = [1, 2]
    called = {}

    def _test_config(context, _inputs):
        assert context.solid_config == value
        called['yup'] = True

    pipeline_def = PipelineDefinition(
        name='solid_list_config_pipeline',
        solids=[
            SolidDefinition(
                name='solid_list_config',
                inputs=[],
                outputs=[],
                config_field=Field(List(Int)),
                transform_fn=_test_config,
            )
        ],
    )

    result = execute_pipeline(
        pipeline_def, environment_dict={'solids': {'solid_list_config': {'config': value}}}
    )
    assert result.success
    assert called['yup']
def test_evaluate_list_error_top_level_mismatch():
    string_list = List(String)
    result = eval_config_value_from_dagster_type(string_list, 1)
    assert not result.success
    assert len(result.errors) == 1
    assert result.errors[0].reason == DagsterEvaluationErrorReason.RUNTIME_TYPE_MISMATCH
def test_config_list_in_dict():
    nested_list = Dict({'nested_list': Field(List(Int))})

    value = {'nested_list': [1, 2, 3]}
    result = eval_config_value_from_dagster_type(nested_list, value)
    assert result.success
    assert result.value == value
def __init__(self, name, sql_queries, description=None):
    name = check.str_param(name, 'name')
    sql_queries = check.list_param(sql_queries, 'sql queries', of_type=str)
    description = check.opt_str_param(description, 'description', 'BigQuery query')

    def _compute_fn(context, _):
        query_job_config = _preprocess_config(context.solid_config.get('query_job_config', {}))

        # Retrieve results as pandas DataFrames
        results = []
        for sql_query in sql_queries:
            # We need to construct a new QueryJobConfig for each query.
            # See: https://bit.ly/2VjD6sl
            cfg = QueryJobConfig(**query_job_config) if query_job_config else None
            context.log.info(
                'executing query %s with config: %s'
                % (sql_query, cfg.to_api_repr() if cfg else '(no config provided)')
            )
            results.append(context.resources.bq.query(sql_query, job_config=cfg).to_dataframe())

        yield Result(results)

    super(BigQuerySolidDefinition, self).__init__(
        name=name,
        description=description,
        inputs=[InputDefinition(_START, Nothing)],
        outputs=[OutputDefinition(List(DataFrame))],
        compute_fn=_compute_fn,
        config_field=define_bigquery_query_config(),
        metadata={'kind': 'sql', 'sql': '\n'.join(sql_queries)},
    )
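# --- Hedged usage sketch (illustration only, not part of the library source) ---
# Given the constructor above, a BigQuerySolidDefinition can be built from a list of
# SQL strings; the solid yields one pandas DataFrame per query via the `bq` resource.
# The variable and query names below are made up for illustration.
sample_bq_solid = BigQuerySolidDefinition(
    name='sample_bq_queries',
    sql_queries=['SELECT 1 AS one', 'SELECT 2 AS two'],
    description='Runs two trivial queries and yields their DataFrames.',
)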
def test_interleaved_values():
    @solid(inputs=[InputDefinition('stuff', List(Any))])
    def collect(_context, stuff):
        assert set(stuff) == set([1, None, 'one'])
        return stuff

    @lambda_solid
    def emit_num():
        return 1

    @lambda_solid
    def emit_none():
        pass

    @lambda_solid
    def emit_str():
        return 'one'

    result = execute_pipeline(
        PipelineDefinition(
            name='input_test',
            solids=[emit_num, emit_none, emit_str, collect],
            dependencies={
                'collect': {
                    'stuff': MultiDependencyDefinition(
                        [
                            DependencyDefinition('emit_num'),
                            DependencyDefinition('emit_none'),
                            DependencyDefinition('emit_str'),
                        ]
                    )
                }
            },
        )
    )
    assert result.success
def test_two_list_types():
    assert PipelineDefinition(
        name='two_types',
        solids=[
            SolidDefinition(
                name='two_list_type',
                inputs=[],
                outputs=[],
                config_field=Field(
                    Dict({'list_one': Field(List(Int)), 'list_two': Field(List(Int))})
                ),
                transform_fn=lambda *_args: None,
            )
        ],
    )
def __init__(self, name, main_class, description=None):
    name = check.str_param(name, 'name')
    main_class = check.str_param(main_class, 'main_class')
    description = check.opt_str_param(
        description,
        'description',
        'This solid is a generic representation of a parameterized Spark job.',
    )

    def _spark_compute_fn(context, _):
        '''Define Spark execution.

        This function defines how we'll execute the Spark job and invokes spark-submit.
        '''
        spark_shell_cmd = create_spark_shell_cmd(context.solid_config, main_class)

        context.log.info("Running spark-submit: " + ' '.join(spark_shell_cmd))
        retcode = run_spark_subprocess(spark_shell_cmd, context.log)

        if retcode != 0:
            raise SparkSolidError('Spark job failed. Please consult your logs.')

        yield Result(context.solid_config.get('spark_outputs'), 'paths')

    super(SparkSolidDefinition, self).__init__(
        name=name,
        description=description,
        inputs=[InputDefinition('spark_inputs', List(Path))],
        outputs=[OutputDefinition(dagster_type=List(Path), name='paths')],
        compute_fn=_spark_compute_fn,
        config_field=define_spark_config(),
        metadata={'kind': 'spark', 'main_class': main_class},
        step_metadata_fn=functools.partial(
            step_metadata_fn, solid_name=name, main_class=main_class
        ),
    )
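# --- Hedged usage sketch (illustration only, not part of the library source) ---
# Assuming the constructor above, a SparkSolidDefinition is parameterized only by a
# name and the Spark application's main class; everything else (jar path, master URL,
# deploy mode, Spark conf) arrives through config defined by define_spark_config().
# The class name below is a placeholder.
example_spark_solid = SparkSolidDefinition(
    name='example_spark_job',
    main_class='com.example.MainClass',  # hypothetical main class
)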
def test_s3_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
    run_id = str(uuid.uuid4())

    intermediate_store = S3IntermediateStore(run_id=run_id, s3_bucket='dagster-airflow-scratch')

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(
                ['foo', 'bar'],
                context,
                resolve_to_runtime_type(List(LowercaseString)).inst(),
                ['list'],
            )
            assert intermediate_store.has_object(context, ['list'])
            assert intermediate_store.get_object(
                context, resolve_to_runtime_type(List(Bool)).inst(), ['list']
            ) == ['foo', 'bar']
        finally:
            # Clean up the object stored under the ['list'] key above
            intermediate_store.rm_object(context, ['list'])
def define_pipeline_with_list():
    return PipelineDefinition(
        name='pipeline_with_list',
        solids=[
            SolidDefinition(
                name='solid_with_list',
                inputs=[],
                outputs=[],
                transform_fn=lambda *_args: None,
                config_field=Field(List(Int)),
            )
        ],
    )
def _inputs_for_source(self, source):
    if source == BigQueryLoadSource.DataFrame:
        return [InputDefinition('df', DataFrame)]
    elif source == BigQueryLoadSource.File:
        return [InputDefinition('file_path', Path)]
    elif source == BigQueryLoadSource.Gcs:
        return [InputDefinition('source_uris', List(Path))]
    else:
        raise BigQueryError(
            'invalid source specification -- must be one of [%s]'
            % ','.join(
                [BigQueryLoadSource.DataFrame, BigQueryLoadSource.File, BigQueryLoadSource.Gcs]
            )
        )
def test_file_system_intermediate_store_composite_types():
    run_id = str(uuid.uuid4())

    intermediate_store = FileSystemIntermediateStore(run_id=run_id)
    assert intermediate_store.root == os.path.join(
        seven.get_system_temp_directory(), 'dagster', 'runs', run_id, 'files'
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(
                [True, False], context, resolve_to_runtime_type(List(Bool)).inst(), ['bool']
            )
            assert intermediate_store.has_object(context, ['bool'])
            assert intermediate_store.get_object(
                context, resolve_to_runtime_type(List(Bool)).inst(), ['bool']
            ) == [True, False]
        finally:
            try:
                shutil.rmtree(intermediate_store.root)
            except seven.FileNotFoundError:
                pass
def test_file_system_intermediate_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    intermediate_store = FileSystemIntermediateStore(
        run_id=run_id,
        types_to_register={RuntimeString.inst(): FancyStringFilesystemTypeStoragePlugin},
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List(String)), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(Nullable(String)), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List(Nullable(String))), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(Nullable(List(String))), ['obj_name']
            )
def test_single_level_dict_lists_and_nullable():
    output = print_type_to_string(
        Dict(
            {
                'nullable_int_field': Field(Nullable(Int)),
                'optional_int_field': Field(Int, is_optional=True),
                'string_list_field': Field(List(String)),
            }
        )
    )

    expected = '''{
  nullable_int_field: Int?
  optional_int_field?: Int
  string_list_field: [String]
}'''

    assert output == expected
def test_s3_intermediate_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    intermediate_store = S3IntermediateStore(
        run_id=run_id,
        s3_bucket='dagster-airflow-scratch',
        types_to_register={RuntimeString.inst(): FancyStringS3TypeStoragePlugin},
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List(String)), ['obj_name']
            )
def test_wrapping_nothing():
    with pytest.raises(DagsterInvalidDefinitionError):

        @lambda_solid(output=OutputDefinition(List(Nothing)))
        def _():
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @lambda_solid(inputs=[InputDefinition(List(Nothing))])
        def _():
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @lambda_solid(output=OutputDefinition(Nullable(Nothing)))
        def _():
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @lambda_solid(inputs=[InputDefinition(Nullable(Nothing))])
        def _():
            pass
def test_config_list_in_dict_error():
    nested_list = Dict({'nested_list': Field(List(Int))})

    value = {'nested_list': [1, 'bar', 3]}
    result = eval_config_value_from_dagster_type(nested_list, value)
    assert not result.success
    assert len(result.errors) == 1
    error = result.errors[0]
    assert error.reason == DagsterEvaluationErrorReason.RUNTIME_TYPE_MISMATCH
    assert len(error.stack.entries) == 2

    stack_entry = error.stack.entries[0]
    assert isinstance(stack_entry, EvaluationStackPathEntry)
    assert stack_entry.field_name == 'nested_list'

    list_entry = error.stack.entries[1]
    assert isinstance(list_entry, EvaluationStackListItemEntry)
    assert list_entry.list_index == 1
def __init__(self, name, sql_queries, parameters=None, description=None):
    name = check.str_param(name, 'name')
    sql_queries = check.list_param(sql_queries, 'sql queries', of_type=str)
    description = check.opt_str_param(
        description,
        'description',
        'This solid is a generic representation of a parameterized Snowflake query.',
    )

    def _snowflake_compute_fn(context, _):  # pylint: disable=too-many-locals
        '''Define Snowflake execution.

        This function defines how we'll execute the Snowflake SQL query.
        '''
        with context.resources.snowflake.get_connection(context.log) as conn:
            with closing(conn.cursor()) as cursor:
                results = []
                for query in sql_queries:
                    if sys.version_info[0] < 3:
                        query = query.encode('utf-8')

                    context.log.info(
                        'Executing SQL query %s %s'
                        % (query, 'with parameters ' + str(parameters) if parameters else '')
                    )
                    cursor.execute(query, parameters)  # pylint: disable=E1101
                    fetchall_results = cursor.fetchall()  # pylint: disable=E1101
                    results.append(pd.DataFrame(fetchall_results))

                yield Result(results)

    super(SnowflakeSolidDefinition, self).__init__(
        name=name,
        description=description,
        inputs=[InputDefinition('start', Nothing)],
        outputs=[OutputDefinition(List(dagster_pd.DataFrame))],
        compute_fn=_snowflake_compute_fn,
        metadata={'kind': 'sql', 'sql': '\n'.join(sql_queries)},
    )
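# --- Hedged usage sketch (illustration only, not part of the library source) ---
# Based on the constructor above, a SnowflakeSolidDefinition takes its SQL up front
# and yields a list of pandas DataFrames; the connection comes from the `snowflake`
# resource on the context. The solid and query names below are made up.
example_snowflake_solid = SnowflakeSolidDefinition(
    name='example_snowflake_queries',
    sql_queries=['SELECT CURRENT_TIMESTAMP'],
)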
def test_working_list_path():
    called = {}

    @solid(config_field=Field(List(Int)))
    def required_list_int_solid(context):
        assert context.solid_config == [1, 2]
        called['yup'] = True

    pipeline_def = PipelineDefinition(name='list_path', solids=[required_list_int_solid])

    result = execute_pipeline(
        pipeline_def,
        environment_dict={'solids': {'required_list_int_solid': {'config': [1, 2]}}},
    )

    assert result.success
    assert called['yup']
def test_nothing_deps():
    @solid(inputs=[InputDefinition('stuff', List(Any))])
    def collect(_context, stuff):
        return stuff

    @lambda_solid(output=OutputDefinition(Int))
    def emit_num():
        return 1

    @lambda_solid(output=OutputDefinition(Nothing))
    def emit_nothing():
        pass

    @lambda_solid(output=OutputDefinition(String))
    def emit_str():
        return 'one'

    with pytest.raises(
        DagsterInvalidDefinitionError,
        match=r'Input "stuff" expects a value of type \[Any\] and output '
        '"result" returns type Nothing',
    ):
        PipelineDefinition(
            name='input_test',
            solids=[emit_num, emit_nothing, emit_str, collect],
            dependencies={
                'collect': {
                    'stuff': MultiDependencyDefinition(
                        [
                            DependencyDefinition('emit_num'),
                            DependencyDefinition('emit_nothing'),
                            DependencyDefinition('emit_str'),
                        ]
                    )
                }
            },
        )
def test_simple_values():
    @solid(inputs=[InputDefinition('numbers', List(Int))])
    def sum_num(_context, numbers):
        # can't guarantee order
        assert set(numbers) == set([1, 2, 3])
        return sum(numbers)

    @lambda_solid
    def emit_1():
        return 1

    @lambda_solid
    def emit_2():
        return 2

    @lambda_solid
    def emit_3():
        return 3

    result = execute_pipeline(
        PipelineDefinition(
            name='input_test',
            solids=[emit_1, emit_2, emit_3, sum_num],
            dependencies={
                'sum_num': {
                    'numbers': MultiDependencyDefinition(
                        [
                            DependencyDefinition('emit_1'),
                            DependencyDefinition('emit_2'),
                            DependencyDefinition('emit_3'),
                        ]
                    )
                }
            },
        )
    )

    assert result.success
    assert result.result_for_solid('sum_num').transformed_value() == 6
def __init__(self, name, main_class, description=None):
    name = check.str_param(name, 'name')
    main_class = check.str_param(main_class, 'main_class')
    description = check.opt_str_param(
        description,
        'description',
        'This solid is a generic representation of a parameterized Spark job.',
    )

    def _spark_transform_fn(context, _):
        '''Define Spark execution.

        This function defines how we'll execute the Spark job and invokes spark-submit.
        '''
        # Extract parameters from config
        (
            master_url,
            deploy_mode,
            application_jar,
            spark_conf,
            application_arguments,
            spark_home,
            spark_outputs,
        ) = [
            context.solid_config.get(k)
            for k in (
                'master_url',
                'deploy_mode',
                'application_jar',
                'spark_conf',
                'application_arguments',
                'spark_home',
                'spark_outputs',
            )
        ]

        # Let the user use env vars in the jar path
        application_jar = os.path.expandvars(application_jar)

        if not os.path.exists(application_jar):
            raise SparkSolidError(
                'Application jar {} does not exist. A valid jar must be '
                'built before running this solid.'.format(application_jar)
            )

        spark_home = spark_home if spark_home else os.environ.get('SPARK_HOME')

        if spark_home is None:
            raise SparkSolidError(
                'No spark home set. You must either pass spark_home in config or '
                'set $SPARK_HOME in your environment (got None).'
            )

        deploy_mode = ['--deploy-mode', '{}'.format(deploy_mode)] if deploy_mode else []

        spark_shell_cmd = (
            [
                '{}/bin/spark-submit'.format(spark_home),
                '--class',
                main_class,
                '--master',
                master_url,
            ]
            + deploy_mode
            + parse_spark_config(spark_conf)
            + [application_jar]
            + ([application_arguments] if application_arguments else [])
        )

        context.log.info("Running spark-submit: " + ' '.join(spark_shell_cmd))
        retcode = run_spark_subprocess(spark_shell_cmd, context.log)

        if retcode != 0:
            raise SparkSolidError('Spark job failed. Please consult your logs.')

        yield Result(spark_outputs, 'paths')

    super(SparkSolidDefinition, self).__init__(
        name=name,
        description=description,
        inputs=[InputDefinition('spark_inputs', List(Path))],
        outputs=[OutputDefinition(dagster_type=List(Path), name='paths')],
        transform_fn=_spark_transform_fn,
        config_field=define_spark_config(),
        metadata={'kind': 'spark', 'main_class': main_class},
    )