def test_multi_input_partial_execution():
    """Execute the multi-input pipeline end to end and verify each output table.

    The dead trailing ``return`` from the original has been removed — it had no
    effect in a test function.
    """
    pipeline = create_multi_input_pipeline()

    first_sum_table = 'first_sum_table'
    first_mult_table = 'first_mult_table'
    first_sum_mult_table = 'first_sum_mult_table'

    # Each solid receives the concrete table name(s) it reads/writes.
    environment = config.Environment(
        solids={
            'sum_table': config.Solid({'sum_table': first_sum_table}),
            'mult_table': config.Solid({'mult_table': first_mult_table}),
            'sum_mult_table': config.Solid({
                'sum_table': first_sum_table,
                'mult_table': first_mult_table,
                'sum_mult_table': first_sum_mult_table,
            }),
        },
    )

    first_pipeline_result = execute_pipeline(pipeline, environment=environment)

    assert first_pipeline_result.success
    assert len(first_pipeline_result.result_list) == 3
    assert _load_table(first_pipeline_result.context, first_sum_table) == \
        [(1, 2, 3), (3, 4, 7)]
    assert _load_table(first_pipeline_result.context, first_mult_table) == \
        [(1, 2, 2), (3, 4, 12)]
    assert _load_table(first_pipeline_result.context, first_sum_mult_table) == \
        [(1, 3, 2), (3, 7, 12)]
def test_pandas_output_csv_pipeline():
    """Write the diamond pipeline's sum_mult output to CSV and verify the file contents."""
    with get_temp_file_name() as temp_file_name:
        write_solid = dagster_pd.to_csv_solid('write_sum_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_solid],
            extra_dependencies={
                write_solid.name: {'df': DependencyDefinition('sum_mult_table')}
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_solid.name: config.Solid({'path': temp_file_name}),
        })

        # The original discarded every yielded result, so a failing step would
        # only surface indirectly through a missing/incomplete output file.
        for result in execute_pipeline_iterator(pipeline=pipeline, environment=environment):
            assert result.success

        assert os.path.exists(temp_file_name)
        output_df = pd.read_csv(temp_file_name)
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
def test_aliased_configs():
    """Each alias of a reused solid definition receives its own config value."""
    @solid(
        inputs=[],
        config_def=ConfigDefinition(types.Int),
    )
    def load_constant(info):
        return info.config

    # The same solid definition is mounted twice under different aliases.
    pipeline = PipelineDefinition(
        solids=[load_constant],
        dependencies={
            SolidInstance(load_constant.name, 'load_a'): {},
            SolidInstance(load_constant.name, 'load_b'): {},
        })

    environment = config.Environment(solids={
        'load_a': config.Solid(2),
        'load_b': config.Solid(3),
    })
    result = execute_pipeline(pipeline, environment)

    assert result.success
    assert result.result_for_solid('load_a').transformed_value() == 2
    assert result.result_for_solid('load_b').transformed_value() == 3
def test_with_from_through_specifying_all_solids():
    """Run the multi-input pipeline with every solid configured and check all tables."""
    pipeline = create_multi_input_pipeline()

    first_sum_table = 'first_sum_table'
    first_mult_table = 'first_mult_table'
    first_sum_mult_table = 'first_sum_mult_table'

    environment = config.Environment(
        solids={
            'sum_table': config.Solid({'sum_table': first_sum_table}),
            'mult_table': config.Solid({'mult_table': first_mult_table}),
            'sum_mult_table': config.Solid({
                'sum_table': first_sum_table,
                'mult_table': first_mult_table,
                'sum_mult_table': first_sum_mult_table,
            }),
        },
    )

    pipeline_result = execute_pipeline(pipeline, environment=environment)

    # The sibling multi-input test asserts overall success; this one did not,
    # so a failed run would report through a confusing table-load error instead.
    assert pipeline_result.success
    assert len(pipeline_result.result_list) == 3
    assert _load_table(pipeline_result.context, first_sum_table) == \
        [(1, 2, 3), (3, 4, 7)]
    assert _load_table(pipeline_result.context, first_mult_table) == \
        [(1, 2, 2), (3, 4, 12)]
    assert _load_table(pipeline_result.context, first_sum_mult_table) == \
        [(1, 3, 2), (3, 7, 12)]
def execute_transform_in_temp_csv_files(solid_inst):
    """Run ``solid_inst`` between a CSV loader and a CSV writer.

    Returns the DataFrame read back from the written temp file.
    """
    load_csv_solid = dagster_pd.load_csv_solid('load_csv')
    to_csv_solid = dagster_pd.to_csv_solid('to_csv')

    # Wire the loader into whatever the solid under test calls its first input.
    input_name = solid_inst.input_defs[0].name

    pipeline = PipelineDefinition(
        solids=[load_csv_solid, solid_inst, to_csv_solid],
        dependencies={
            solid_inst.name: {input_name: DependencyDefinition('load_csv')},
            'to_csv': {'df': DependencyDefinition(solid_inst.name)},
        })

    with get_temp_file_name() as temp_file_name:
        result = execute_pipeline(
            pipeline,
            get_num_csv_environment({
                load_csv_solid.name: config.Solid({'path': script_relative_path('num.csv')}),
                to_csv_solid.name: config.Solid({'path': temp_file_name}),
            }),
        )
        assert result.success
        output_df = pd.read_csv(temp_file_name)
        return output_df
def test_pandas_multiple_outputs():
    """Write the diamond pipeline's sum_mult output to both CSV and parquet.

    Fixes from the original: a throwaway ``create_diamond_pipeline()`` result was
    assigned and immediately overwritten (dead code, removed), and the
    ``execute_pipeline`` result was ignored (now asserted).
    """
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        csv_file, parquet_file = temp_tuple  # pylint: disable=E0632

        write_sum_mult_csv = dagster_pd.to_csv_solid('write_sum_mult_csv')
        write_sum_mult_parquet = dagster_pd.to_parquet_solid('write_sum_mult_parquet')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_mult_csv, write_sum_mult_parquet],
            extra_dependencies={
                write_sum_mult_csv.name: {'df': DependencyDefinition('sum_mult_table')},
                write_sum_mult_parquet.name: {'df': DependencyDefinition('sum_mult_table')},
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_mult_csv.name: config.Solid({'path': csv_file}),
            write_sum_mult_parquet.name: config.Solid({'path': parquet_file}),
        })

        pipeline_result = execute_pipeline(pipeline, environment)
        assert pipeline_result.success

        # Both sinks receive the same sum_mult table.
        expected = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }

        assert os.path.exists(csv_file)
        assert pd.read_csv(csv_file).to_dict('list') == expected

        assert os.path.exists(parquet_file)
        assert pd.read_parquet(parquet_file).to_dict('list') == expected
def test_notebook_dag():
    """Notebook-backed solids compose into a DAG producing correct downstream values."""
    environment = config.Environment(solids={
        'load_a': config.Solid(1),
        'load_b': config.Solid(2),
    })
    pipeline_result = execute_pipeline(
        define_test_notebook_dag_pipeline(), environment=environment)

    assert pipeline_result.success
    assert pipeline_result.result_for_solid('add_two').transformed_value() == 3
    assert pipeline_result.result_for_solid('mult_two').transformed_value() == 6
def test_part_thirteen_step_two():
    """The a_plus_b solid sums the two configured inputs."""
    pipeline_result = execute_pipeline(
        define_part_thirteen_step_two(),
        config.Environment(solids={
            'load_a': config.Solid(23),
            'load_b': config.Solid(38),
        }),
    )

    assert pipeline_result.success
    addition_result = pipeline_result.result_for_solid('a_plus_b')
    assert addition_result.transformed_value() == 23 + 38
def test_execute_two_solids_with_same_input_name():
    """Two solids may share an input name; each still resolves its own dependency."""
    shared_input = InputDefinition(name='a_thing')

    def _double(_context, inputs):
        # Concatenate the input with itself.
        return inputs['a_thing'] + inputs['a_thing']

    solid_one = single_output_transform(
        'solid_one',
        inputs=[shared_input],
        transform_fn=_double,
        output=dagster.OutputDefinition(),
    )
    solid_two = single_output_transform(
        'solid_two',
        inputs=[shared_input],
        transform_fn=_double,
        output=dagster.OutputDefinition(),
    )

    pipeline = dagster.PipelineDefinition(
        solids=[
            define_pass_value_solid('pass_to_one'),
            define_pass_value_solid('pass_to_two'),
            solid_one,
            solid_two,
        ],
        dependencies={
            'solid_one': {'a_thing': DependencyDefinition('pass_to_one')},
            'solid_two': {'a_thing': DependencyDefinition('pass_to_two')},
        },
    )

    result = execute_pipeline(
        pipeline,
        environment=config.Environment(solids={
            'pass_to_one': config.Solid({'value': 'foo'}),
            'pass_to_two': config.Solid({'value': 'bar'}),
        }),
    )

    assert result.success
    assert result.result_for_solid('solid_one').transformed_value() == 'foofoo'
    assert result.result_for_solid('solid_two').transformed_value() == 'barbar'
def test_two_input_solid():
    """A solid with two DataFrame inputs combines them into a summed frame."""
    def transform(_context, inputs):
        left = inputs['num_csv1']
        right = inputs['num_csv2']
        check.inst_param(left, 'num_csv1', pd.DataFrame)
        check.inst_param(right, 'num_csv2', pd.DataFrame)
        left['sum'] = left['num1'] + right['num2']
        return left

    two_input_solid = _dataframe_solid(
        name='two_input_solid',
        inputs=[
            InputDefinition('num_csv1', dagster_pd.DataFrame),
            InputDefinition('num_csv2', dagster_pd.DataFrame),
        ],
        transform_fn=transform,
    )

    # Both loaders read the same fixture file.
    environment = config.Environment(solids={
        'load_csv1': config.Solid({'path': script_relative_path('num.csv')}),
        'load_csv2': config.Solid({'path': script_relative_path('num.csv')}),
    })

    pipeline = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_csv1'),
            dagster_pd.load_csv_solid('load_csv2'),
            two_input_solid,
        ],
        dependencies={
            'two_input_solid': {
                'num_csv1': DependencyDefinition('load_csv1'),
                'num_csv2': DependencyDefinition('load_csv2'),
            }
        })

    pipeline_result = execute_pipeline(pipeline, environment)
    assert pipeline_result.success

    df = pipeline_result.result_for_solid('two_input_solid').transformed_value()
    assert isinstance(df, pd.DataFrame)
    assert df.to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }
def test_intro_tutorial_part_nine_step_one():
    """Injested constants flow through add and mult solids."""
    environment = config.Environment(solids={
        'injest_a': config.Solid(2),
        'injest_b': config.Solid(3),
    })
    result = execute_pipeline(define_part_nine_step_one(), environment)

    assert result.success
    assert result.result_for_solid('injest_a').transformed_value() == 2
    assert result.result_for_solid('injest_b').transformed_value() == 3
    assert result.result_for_solid('add_ints').transformed_value() == 5
    assert result.result_for_solid('mult_ints').transformed_value() == 6
def test_basic_solid_with_config():
    """The transform function receives the config supplied via the environment."""
    observed = {}

    def _t_fn(info, _inputs):
        # Record the config the transform actually saw.
        observed['yep'] = info.config

    configured_solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_def=ConfigDefinition.config_dict({'some_config': Field(types.String)}),
        transform_fn=_t_fn,
    )

    execute_pipeline(
        PipelineDefinition(solids=[configured_solid]),
        config.Environment(
            solids={'solid_with_context': config.Solid({'some_config': 'foo'})}),
    )

    assert 'yep' in observed
    assert 'some_config' in observed['yep']
def test_config_for_no_config():
    """Supplying config to a config-less solid raises DagsterInvariantViolationError."""
    def _t_fn(*_args):
        raise Exception('should not reach')

    no_config_solid = SolidDefinition(
        name='no_config_solid',
        inputs=[],
        outputs=[],
        transform_fn=_t_fn,
    )

    with pytest.raises(
            DagsterInvariantViolationError,
            match="Solid no_config_solid was provided {'some_config': 1} but does not take config",
    ):
        execute_pipeline(
            PipelineDefinition(solids=[no_config_solid]),
            config.Environment(
                solids={'no_config_solid': config.Solid({'some_config': 1})}),
        )
def test_execute_solid_with_input_same_name():
    """A solid whose input shares its own name still wires up correctly."""
    a_thing_solid = single_output_transform(
        'a_thing',
        inputs=[InputDefinition(name='a_thing')],
        transform_fn=lambda context, inputs: inputs['a_thing'] + inputs['a_thing'],
        output=dagster.OutputDefinition(),
    )

    pipeline = PipelineDefinition(
        solids=[define_pass_value_solid('pass_value'), a_thing_solid],
        dependencies={'a_thing': {'a_thing': DependencyDefinition('pass_value')}},
    )

    environment = config.Environment(
        solids={'pass_value': config.Solid({'value': 'foo'})})
    result = execute_pipeline(pipeline, environment)

    assert result.result_for_solid('a_thing').transformed_value() == 'foofoo'
def test_hello_world_composed():
    """The composed hello-world pipeline reads num.csv and emits the summed frame."""
    pipeline = create_hello_world_solid_composed_pipeline()

    environment = config.Environment(solids={
        'read_hello_world': config.Solid({'path': script_relative_path('num.csv')}),
    })
    pipeline_result = execute_pipeline(pipeline, environment=environment)
    assert pipeline_result.success

    result = pipeline_result.result_for_solid('hello_world')
    assert result.success
    assert result.transformed_value().to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }
def test_execute_pipeline():
    """The success pipeline produces sum and sum-of-squares frames from num.csv."""
    pipeline = define_success_pipeline()
    environment = config.Environment(solids={
        'load_num_csv': config.Solid({'path': script_relative_path('num.csv')}),
    })

    result = execute_pipeline(pipeline, environment=environment)
    assert result.success

    sum_value = result.result_for_solid('sum_solid').transformed_value()
    assert sum_value.to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }

    sum_sq_value = result.result_for_solid('sum_sq_solid').transformed_value()
    assert sum_sq_value.to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
        'sum_sq': [9, 49],
    }
def test_output_sql_sum_sq_solid():
    """Materialize sum_sq_table to SQL and verify its rows via the engine."""
    create_sum_sq_table = define_create_table_solid('create_sum_sq_table')

    pipeline = create_sum_sq_pipeline(
        in_mem_context(),
        DagsterSqlTableExpression('num_table'),
        [create_sum_sq_table],
        {create_sum_sq_table.name: {'expr': DependencyDefinition('sum_sq_table')}},
    )

    environment = config.Environment(
        solids={'create_sum_sq_table': config.Solid({'table_name': 'sum_sq_table'})},
    )
    pipeline_result = execute_pipeline(pipeline=pipeline, environment=environment)

    assert pipeline_result.success
    assert len(pipeline_result.result_list) == 3

    # Read the materialized table back through the pipeline's SQLAlchemy engine.
    engine = pipeline_result.context.resources.sa.engine
    rows = engine.connect().execute('SELECT * FROM sum_sq_table').fetchall()
    assert rows == [(1, 2, 3, 9), (3, 4, 7, 49)]
def test_single_templated_sql_solid_double_table_with_api():
    """A templated SQL solid substitutes both table arguments from config."""
    sum_table_arg = 'specific_sum_table'
    num_table_arg = 'specific_num_table'

    sql = '''CREATE TABLE {{sum_table}} AS SELECT num1, num2, num1 + num2 as sum FROM {{num_table}}'''

    sum_solid = create_templated_sql_transform_solid(
        name='sum_solid',
        sql=sql,
        table_arguments=['sum_table', 'num_table'],
    )

    pipeline = pipeline_test_def(
        solids=[sum_solid], context=in_mem_context(num_table_arg))

    environment = config.Environment(solids={
        'sum_solid': config.Solid({
            'sum_table': sum_table_arg,
            'num_table': num_table_arg,
        }),
    })

    result = execute_pipeline(pipeline, environment=environment)
    assert result.success
    assert _load_table(result.context, sum_table_arg) == [(1, 2, 3), (3, 4, 7)]
def test_pandas_output_intermediate_parquet_files():
    """Persist the intermediate sum and mult tables as parquet files and verify them.

    Fixes from the original: a throwaway ``create_diamond_pipeline()`` result was
    assigned and immediately overwritten (dead code, removed), and the mult
    parquet output was written but never checked at all.
    """
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632

        write_sum_table = dagster_pd.to_parquet_solid('write_sum_table')
        write_mult_table = dagster_pd.to_parquet_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {'df': DependencyDefinition('sum_table')},
                write_mult_table.name: {'df': DependencyDefinition('mult_table')},
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        pipeline_result = execute_pipeline(pipeline, environment)
        assert pipeline_result.success

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
        assert pd.read_parquet(sum_file).to_dict('list') == expected_sum

        # Minimal verification of the mult output: it exists and parses.
        # (Exact mult_table column contents are defined elsewhere — asserting
        # non-emptiness avoids guessing the schema here.)
        assert os.path.exists(mult_file)
        assert not pd.read_parquet(mult_file).empty
def test_pandas_multiple_inputs():
    """A two-input DataFrame solid adds its inputs element-wise."""
    environment = config.Environment(solids={
        'load_one': config.Solid({'path': script_relative_path('num.csv')}),
        'load_two': config.Solid({'path': script_relative_path('num.csv')}),
    })

    def transform_fn(_context, inputs):
        return inputs['num_csv1'] + inputs['num_csv2']

    double_sum = _dataframe_solid(
        name='double_sum',
        inputs=[
            InputDefinition('num_csv1', dagster_pd.DataFrame),
            InputDefinition('num_csv2', dagster_pd.DataFrame),
        ],
        transform_fn=transform_fn,
    )

    pipeline = PipelineDefinition(
        solids=[
            dagster_pd.load_csv_solid('load_one'),
            dagster_pd.load_csv_solid('load_two'),
            double_sum,
        ],
        dependencies={
            'double_sum': {
                'num_csv1': DependencyDefinition('load_one'),
                'num_csv2': DependencyDefinition('load_two'),
            }
        },
    )

    pipeline_result = execute_pipeline(pipeline, environment=environment)
    output_df = pipeline_result.result_for_solid('double_sum').transformed_value()

    assert not output_df.empty
    assert output_df.to_dict('list') == {
        'num1': [2, 6],
        'num2': [4, 8],
    }
def test_run_whole_pipeline():
    """(a+b) and (c+d) feed into a final multiplication."""
    pipeline = define_part_thirteen_step_three()

    environment = config.Environment(solids={
        'a': config.Solid(2),
        'b': config.Solid(6),
        'c': config.Solid(4),
        'd': config.Solid(8),
    })
    pipeline_result = execute_pipeline(pipeline, environment)

    assert pipeline_result.success
    assert pipeline_result.result_for_solid('a_plus_b').transformed_value() == 8
    assert pipeline_result.result_for_solid('c_plus_d').transformed_value() == 12
    assert pipeline_result.result_for_solid('final').transformed_value() == 8 * 12
def test_intro_tutorial_part_nine_final_error():
    """A misnamed credential key ('username' vs expected field) raises DagsterTypeError."""
    bad_environment = config.Environment(
        solids={
            'injest_a': config.Solid(2),
            'injest_b': config.Solid(3),
        },
        context=config.Context(
            name='cloud',
            config={
                'credentials': {
                    'username': '******',
                    'pass': '******',
                },
            },
        ),
    )

    with pytest.raises(DagsterTypeError, match='Field username not found'):
        execute_pipeline(define_part_nine_final(), bad_environment)
def test_intro_tutorial_part_nine_final_local_success():
    """Under the 'local' context, results land in the in-memory store resource."""
    environment = config.Environment(
        solids={
            'injest_a': config.Solid(2),
            'injest_b': config.Solid(3),
        },
        context=config.Context(name='local'),
    )
    result = execute_pipeline(define_part_nine_final(), environment)

    assert result.success
    assert result.result_for_solid('injest_a').transformed_value() == 2
    assert result.result_for_solid('injest_b').transformed_value() == 3
    assert result.result_for_solid('add_ints').transformed_value() == 5
    assert result.result_for_solid('mult_ints').transformed_value() == 6

    # The local store resource records every intermediate value.
    assert result.context.resources.store.values == {
        'a': 2,
        'b': 3,
        'add': 5,
        'mult': 6,
    }
def test_intro_tutorial_part_nine_final_cloud_success():
    """The 'cloud' context accepts correctly-named credential fields."""
    cloud_context = config.Context(
        name='cloud',
        config={
            'credentials': {
                'user': '******',
                'pass': '******',
            },
        },
    )
    environment = config.Environment(
        solids={
            'injest_a': config.Solid(2),
            'injest_b': config.Solid(3),
        },
        context=cloud_context,
    )

    result = execute_pipeline(define_part_nine_final(), environment)
    assert result.success
def test_pandas_source_test_pipeline():
    """The pandas source solid loads num.csv identically to a direct read_csv."""
    pipeline = define_pandas_source_test_pipeline()

    environment = config.Environment(solids={
        'pandas_source_test': config.Solid(script_relative_path('num.csv')),
    })
    pipeline_result = execute_pipeline(pipeline, environment)
    assert pipeline_result.success

    solid_result = pipeline_result.result_for_solid('pandas_source_test')
    expected = pd.read_csv(script_relative_path('num.csv'))
    assert solid_result.transformed_value().equals(expected)
def test_any_config_definition():
    """A bare ConfigDefinition passes an arbitrary config value through untouched."""
    called = {}
    conf_value = 234

    @solid(config_def=ConfigDefinition())
    def hello_world(info):
        # The solid itself verifies the config it received.
        assert info.config == conf_value
        called['yup'] = True

    result = execute_single_solid(
        create_test_context(),
        hello_world,
        environment=config.Environment(
            solids={'hello_world': config.Solid(conf_value)}),
    )

    assert called['yup']
def test_hello_world_config():
    """A dagstermill notebook solid consumes config and returns its output value."""
    with_config_solid = dm.define_dagstermill_solid(
        'with_config',
        nb_test_path('hello_world_with_config'),
        [],
        [OutputDefinition()],
    )

    environment = config.Environment(
        solids={'with_config': config.Solid(script_relative_path('num.csv'))})
    pipeline_result = execute_pipeline(
        PipelineDefinition(solids=[with_config_solid]), environment)

    assert pipeline_result.success
    assert pipeline_result.result_for_solid('with_config').transformed_value() == 100
def test_hello_world_pipeline_no_api():
    """Build the hello-world pipeline by hand (no helper API) and verify the sum column."""
    def hello_world_transform_fn(_context, inputs):
        frame = inputs['num_df']
        frame['sum'] = frame['num1'] + frame['num2']
        return frame

    read_csv_solid = define_read_csv_solid('read_csv_solid')
    hello_world = single_output_transform(
        name='hello_world',
        inputs=[InputDefinition('num_df')],
        transform_fn=hello_world_transform_fn,
        output=OutputDefinition(),
    )

    pipeline = PipelineDefinition(
        solids=[read_csv_solid, hello_world],
        dependencies={
            'hello_world': {'num_df': DependencyDefinition('read_csv_solid')},
        },
    )

    environment = config.Environment(solids={
        'read_csv_solid': config.Solid({'path': script_relative_path('num.csv')}),
    })
    pipeline_result = execute_pipeline(pipeline, environment)
    assert pipeline_result.success

    result = pipeline_result.result_for_solid('hello_world')
    assert result.transformed_value().to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }
def test_execute_dep_solid_different_input_name():
    """Dependencies connect solids even when input names differ across them."""
    pass_to_first = define_pass_value_solid('pass_to_first')

    first_solid = single_output_transform(
        'first_solid',
        inputs=[InputDefinition(name='a_thing')],
        transform_fn=lambda context, inputs: inputs['a_thing'] + inputs['a_thing'],
        output=dagster.OutputDefinition(),
    )
    second_solid = single_output_transform(
        'second_solid',
        inputs=[InputDefinition(name='an_input')],
        transform_fn=lambda context, inputs: inputs['an_input'] + inputs['an_input'],
        output=dagster.OutputDefinition(),
    )

    pipeline = dagster.PipelineDefinition(
        solids=[pass_to_first, first_solid, second_solid],
        dependencies={
            'first_solid': {'a_thing': DependencyDefinition('pass_to_first')},
            'second_solid': {'an_input': DependencyDefinition('first_solid')},
        },
    )

    result = dagster.execute_pipeline(
        pipeline,
        environment=config.Environment(
            solids={'pass_to_first': config.Solid({'value': 'bar'})}),
    )

    assert result.success
    assert len(result.result_list) == 3
    # Each stage doubles the string: bar -> barbar -> barbarbarbar.
    assert result.result_list[0].transformed_value() == 'bar'
    assert result.result_list[1].transformed_value() == 'barbar'
    assert result.result_list[2].transformed_value() == 'barbarbarbar'
def test_config_arg_mismatch():
    """A config value of the wrong type (int where String expected) raises DagsterTypeError."""
    def _t_fn(*_args):
        raise Exception('should not reach')

    # NOTE(review): this two-argument config_dict('SomeConfig', {...}) call differs
    # from the one-argument form used elsewhere in this file — confirm which
    # signature the installed dagster version expects.
    mismatched_solid = SolidDefinition(
        name='solid_with_context',
        inputs=[],
        outputs=[],
        config_def=ConfigDefinition.config_dict(
            'SomeConfig', {'some_config': Field(types.String)}),
        transform_fn=_t_fn,
    )

    with pytest.raises(DagsterTypeError):
        execute_pipeline(
            PipelineDefinition(solids=[mismatched_solid]),
            config.Environment(
                solids={'solid_with_context': config.Solid({'some_config': 1})}),
        )