def define_error_monster_pipeline():
    """Build the ``error_monster`` pipeline.

    Three aliased solids are chained ``start`` -> ``middle`` -> ``end``. The
    pipeline declares one mode (``errorable_mode``) carrying the
    ``errorable_resource`` resource, and one preset (``passing``) that loads
    ``environments/error.yaml`` relative to this file.
    """
    dependencies = {
        SolidInstance('emit_num', alias='start'): {},
        SolidInstance('num_to_str', alias='middle'): {'num': DependencyDefinition('start')},
        SolidInstance('str_to_num', alias='end'): {'string': DependencyDefinition('middle')},
    }
    errorable_mode = ModeDefinition(
        name='errorable_mode',
        resources={'errorable_resource': define_errorable_resource()},
    )
    passing_preset = PresetDefinition(
        'passing',
        environment_files=[file_relative_path(__file__, 'environments/error.yaml')],
        mode='errorable_mode',
    )
    return PipelineDefinition(
        name='error_monster',
        solids=[emit_num, num_to_str, str_to_num],
        dependencies=dependencies,
        mode_definitions=[errorable_mode],
        preset_definitions=[passing_preset],
    )
def test_required_inputs():
    """An unconnected input is a required config field; a connected one is absent."""

    @lambda_solid(inputs=[InputDefinition('num', types.Int)], output=OutputDefinition(types.Int))
    def add_one(num):
        return num + 1

    wiring = {
        SolidInstance('add_one', alias='first_add'): {},
        SolidInstance('add_one', alias='second_add'): {'num': DependencyDefinition('first_add')},
    }
    pipeline_def = PipelineDefinition(
        name='required_int_input', solids=[add_one], dependencies=wiring
    )

    solids_type = pipeline_def.environment_type.fields['solids'].config_type

    # first_add has no upstream solid, so its input must come from config.
    first_add_fields = solids_type.fields['first_add'].config_type.fields
    assert 'inputs' in first_add_fields

    inputs_field = first_add_fields['inputs']
    assert inputs_field.is_required
    assert inputs_field.config_type.fields['num'].is_required

    # second_add has a dependency so the input is not available
    second_add_fields = solids_type.fields['second_add'].config_type.fields
    assert 'inputs' not in second_add_fields
def define_event_ingest_pipeline():
    """Build ``event_ingest_pipeline``.

    Wires ``download_from_s3_to_file`` -> ``gunzipper`` -> ``event_ingest``
    -> ``snowflake_load`` and attaches a single mode exposing the ``s3`` and
    ``snowflake`` resources.
    """
    spark_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    # TODO: need to pull `src` out to a config
    load_to_snowflake = SnowflakeLoadSolidDefinition(
        'snowflake_load',
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
        table='events',
    )

    wiring = {
        SolidInstance('gunzipper'): {
            'gzip_file': DependencyDefinition('download_from_s3_to_file')
        },
        SolidInstance('event_ingest'): {'spark_inputs': DependencyDefinition('gunzipper')},
        SolidInstance('snowflake_load'): {
            'start': DependencyDefinition('event_ingest', 'paths')
        },
    }
    resources = {'s3': s3_resource, 'snowflake': snowflake_resource}

    return PipelineDefinition(
        name='event_ingest_pipeline',
        solids=[download_from_s3_to_file, gunzipper, spark_ingest, load_to_snowflake],
        dependencies=wiring,
        mode_definitions=[ModeDefinition(resources=resources)],
    )
def test_aliased_solids():
    """Reuse one solid under two aliases and check the value threaded through the chain."""

    @lambda_solid()
    def first():
        return ['first']

    @lambda_solid(inputs=[InputDefinition(name="prev")])
    def not_first(prev):
        return prev + ['not_first']

    # first -> not_first -> second -> third, where second/third alias not_first.
    chain = {
        'not_first': {'prev': DependencyDefinition('first')},
        SolidInstance('not_first', alias='second'): {'prev': DependencyDefinition('not_first')},
        SolidInstance('not_first', alias='third'): {'prev': DependencyDefinition('second')},
    }
    pipeline_def = PipelineDefinition(solids=[first, not_first], dependencies=chain)

    run = execute_pipeline(pipeline_def)
    assert run.success

    third_result = run.result_for_solid('third')
    expected = ['first', 'not_first', 'not_first', 'not_first']
    assert third_result.transformed_value() == expected
def test_aliased_configs():
    """Each alias of a config-driven solid receives its own config value."""

    @solid(inputs=[], config_def=ConfigDefinition(types.Int))
    def load_constant(info):
        return info.config

    aliases = {
        SolidInstance(load_constant.name, alias='load_a'): {},
        SolidInstance(load_constant.name, alias='load_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[load_constant], dependencies=aliases)

    environment = config.Environment(
        solids={'load_a': config.Solid(2), 'load_b': config.Solid(3)}
    )
    run = execute_pipeline(pipeline_def, environment)

    assert run.success
    for alias, expected in (('load_a', 2), ('load_b', 3)):
        assert run.result_for_solid(alias).transformed_value() == expected
def test_aliased_configs():
    """Each alias of a config-driven solid receives its own config value."""

    @solid(inputs=[], config_field=Field(Int))
    def load_constant(info):
        return info.config

    aliases = {
        SolidInstance(load_constant.name, alias='load_a'): {},
        SolidInstance(load_constant.name, alias='load_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[load_constant], dependencies=aliases)

    environment = {'solids': {'load_a': {'config': 2}, 'load_b': {'config': 3}}}
    run = execute_pipeline(pipeline_def, environment)

    assert run.success
    for alias, expected in (('load_a', 2), ('load_b', 3)):
        assert run.result_for_solid(alias).transformed_value() == expected
def define_airline_demo_warehouse_pipeline():
    """Build ``airline_demo_warehouse_pipeline``.

    The dependency map is assembled in two steps: the analysis solids are
    listed literally, then one ``upload_to_s3`` alias is appended per upstream
    solid whose output is uploaded.
    """
    solids = [
        average_sfo_outbound_avg_delays_by_destination,
        delays_by_geography,
        delays_vs_fares,
        delays_vs_fares_nb,
        eastbound_delays,
        q2_sfo_outbound_flights,
        sfo_delays_by_destination,
        tickets_with_destination,
        upload_to_s3,
        westbound_delays,
    ]

    dependencies = {
        'q2_sfo_outbound_flights': {},
        'tickets_with_destination': {},
        'westbound_delays': {},
        'eastbound_delays': {},
        'average_sfo_outbound_avg_delays_by_destination': {
            'q2_sfo_outbound_flights': DependencyDefinition('q2_sfo_outbound_flights')
        },
        'delays_vs_fares': {
            'tickets_with_destination': DependencyDefinition('tickets_with_destination'),
            'average_sfo_outbound_avg_delays_by_destination': DependencyDefinition(
                'average_sfo_outbound_avg_delays_by_destination'
            ),
        },
        'fares_vs_delays': {'table_name': DependencyDefinition('delays_vs_fares')},
        'sfo_delays_by_destination': {
            'table_name': DependencyDefinition('average_sfo_outbound_avg_delays_by_destination')
        },
        'delays_by_geography': {
            'eastbound_delays': DependencyDefinition('eastbound_delays'),
            'westbound_delays': DependencyDefinition('westbound_delays'),
        },
    }

    # (upload alias, upstream solid feeding its `file_obj` input)
    uploads = (
        ('upload_outbound_avg_delay_pdf_plots', 'sfo_delays_by_destination'),
        ('upload_delays_vs_fares_pdf_plots', 'fares_vs_delays'),
        ('upload_delays_by_geography_pdf_plots', 'delays_by_geography'),
    )
    for upload_alias, upstream in uploads:
        dependencies[SolidInstance('upload_to_s3', alias=upload_alias)] = {
            'file_obj': DependencyDefinition(upstream)
        }

    return PipelineDefinition(
        name="airline_demo_warehouse_pipeline",
        solids=solids,
        dependencies=dependencies,
        context_definitions=CONTEXT_DEFINITIONS,
    )
def define_tutorial_pipeline():
    """Build ``tutorial_pipeline``: ``clean_data`` feeds both regression solids."""
    # Both regressions read the same cleaned frame through their `df` input.
    regressions = ('linear_regression', 'random_forest_regression')

    dependencies = {SolidInstance('clean_data'): {}}
    for regression in regressions:
        dependencies[SolidInstance(regression)] = {'df': DependencyDefinition('clean_data')}

    return PipelineDefinition(
        name='tutorial_pipeline',
        solids=[clean_data_solid, LR_solid, RF_solid],
        dependencies=dependencies,
    )
def test_nothing_inputs():
    """``Nothing``-typed inputs can be wired up without appearing as solid arguments."""

    @lambda_solid(inputs=[InputDefinition('never_defined', Nothing)])
    def emit_one():
        return 1

    @lambda_solid
    def emit_two():
        return 2

    @lambda_solid
    def emit_three():
        return 3

    @lambda_solid(output=OutputDefinition(Nothing))
    def emit_nothing():
        pass

    @solid(
        inputs=[
            InputDefinition('_one', Nothing),
            InputDefinition('one', Int),
            InputDefinition('_two', Nothing),
            InputDefinition('two', Int),
            InputDefinition('_three', Nothing),
            InputDefinition('three', Int),
        ]
    )
    def adder(_context, one, two, three):
        assert one == 1
        assert two == 2
        assert three == 3
        return one + two + three

    # Three aliases of the Nothing-producing solid, none with upstream deps.
    dependencies = {
        SolidInstance('emit_nothing', alias=nothing_alias): {}
        for nothing_alias in ('_one', '_two', '_three')
    }
    dependencies['adder'] = {
        '_one': DependencyDefinition('_one'),
        '_two': DependencyDefinition('_two'),
        '_three': DependencyDefinition('_three'),
        'one': DependencyDefinition('emit_one'),
        'two': DependencyDefinition('emit_two'),
        'three': DependencyDefinition('emit_three'),
    }

    pipeline_def = PipelineDefinition(
        name='input_test',
        solids=[emit_one, emit_two, emit_three, emit_nothing, adder],
        dependencies=dependencies,
    )

    run = execute_pipeline(pipeline_def)
    assert run.success
def define_part_thirteen_step_two():
    """Build ``thirteen_step_two``: two ``load_number`` aliases feeding one ``adder`` alias."""
    dependencies = {
        SolidInstance('load_number', 'load_a'): {},
        SolidInstance('load_number', 'load_b'): {},
        SolidInstance('adder', 'a_plus_b'): {
            'num1': DependencyDefinition('load_a'),
            'num2': DependencyDefinition('load_b'),
        },
    }
    return PipelineDefinition(
        name='thirteen_step_two',
        solids=[load_number, adder],
        dependencies=dependencies,
    )
def test_fanin_deps():
    """A ``MultiDependencyDefinition`` fans three ``Nothing`` outputs into one input."""
    called = defaultdict(int)

    @lambda_solid
    def emit_two():
        return 2

    @lambda_solid(output=OutputDefinition(Nothing))
    def emit_nothing():
        called['emit_nothing'] += 1

    @solid(
        inputs=[
            InputDefinition('ready', Nothing),
            InputDefinition('num_1', Int),
            InputDefinition('num_2', Int),
        ]
    )
    def adder(_context, num_1, num_2):
        # Every fanned-in upstream must already have run by the time we execute.
        assert called['emit_nothing'] == 3
        called['adder'] += 1
        return num_1 + num_2

    nothing_aliases = ('_one', '_two', '_three')

    dependencies = {
        SolidInstance('emit_two', alias='emit_1'): {},
        SolidInstance('emit_two', alias='emit_2'): {},
    }
    for nothing_alias in nothing_aliases:
        dependencies[SolidInstance('emit_nothing', alias=nothing_alias)] = {}
    dependencies['adder'] = {
        'ready': MultiDependencyDefinition(
            [DependencyDefinition(nothing_alias) for nothing_alias in nothing_aliases]
        ),
        'num_1': DependencyDefinition('emit_1'),
        'num_2': DependencyDefinition('emit_2'),
    }

    pipeline_def = PipelineDefinition(
        name='input_test',
        solids=[emit_two, emit_nothing, adder],
        dependencies=dependencies,
    )

    run = execute_pipeline(pipeline_def)
    assert run.success
    assert called['adder'] == 1
    assert called['emit_nothing'] == 3
def define_reusable_solids_pipeline():
    """Build ``reusable_solids_pipeline``, which computes (a + b) * (c + d)."""
    sums = {
        SolidInstance('adder', alias='a_plus_b'): {},
        SolidInstance('adder', alias='c_plus_d'): {},
    }
    product = {
        SolidInstance('multer', alias='final'): {
            'num1': DependencyDefinition('a_plus_b'),
            'num2': DependencyDefinition('c_plus_d'),
        }
    }

    dependencies = dict(sums)
    dependencies.update(product)

    return PipelineDefinition(
        name='reusable_solids_pipeline',
        solids=[adder, multer],
        dependencies=dependencies,
    )
def define_part_fourteen_step_one_pipeline():
    """Build ``part_fourteen_step_one_pipeline``, which computes (a + b) * (c + d)."""
    left_sum = SolidInstance(adder.name, alias='a_plus_b')
    right_sum = SolidInstance(adder.name, alias='c_plus_d')
    product = SolidInstance(multer.name, alias='final')

    dependencies = {
        left_sum: {},
        right_sum: {},
        product: {
            'num1': DependencyDefinition('a_plus_b'),
            'num2': DependencyDefinition('c_plus_d'),
        },
    }
    return PipelineDefinition(
        name='part_fourteen_step_one_pipeline',
        solids=[adder, multer],
        dependencies=dependencies,
    )
def test_create_single_solid_pipeline_with_alias():
    """``create_single_solid_pipeline`` accepts an alias and stubs that alias's input."""
    a_source = define_stub_solid('A_source', [input_set('A_input')])
    stub_solid = define_stub_solid('stub', [{'a_key': 'stubbed_thing'}])

    full_pipeline = PipelineDefinition(
        solids=[a_source, create_root_solid('A')],
        dependencies={
            SolidInstance('A', alias='aliased'): {
                'A_input': DependencyDefinition(a_source.name)
            }
        },
    )
    # Replace the aliased solid's `A_input` with the stub's output.
    injected_inputs = {'aliased': {'A_input': stub_solid}}

    single_solid_pipeline = PipelineDefinition.create_single_solid_pipeline(
        full_pipeline, 'aliased', injected_inputs
    )

    run = execute_pipeline(single_solid_pipeline)
    assert run.success

    expected = [{'a_key': 'stubbed_thing'}, {'A': 'transform_called'}]
    assert run.result_for_solid('aliased').transformed_value() == expected
def test_string_from_aliased_inputs():
    """Config-provided inputs are addressed by the solid's alias, not its definition name."""
    called = {}

    @solid(inputs=[InputDefinition('string_input', types.String)])
    def str_as_input(_context, string_input):
        assert string_input == 'foo'
        called['yup'] = True

    pipeline_def = PipelineDefinition(
        solids=[str_as_input],
        dependencies={SolidInstance('str_as_input', alias='aliased'): {}},
    )
    environment = {
        'solids': {'aliased': {'inputs': {'string_input': {'value': 'foo'}}}}
    }

    run = execute_pipeline(pipeline_def, environment)

    assert run.success
    assert called['yup']
def test_execute_aliased_solid_in_diamond():
    """``execute_solid`` can target a solid by alias and inject its inputs."""
    a_source = define_stub_solid('A_source', [input_set('A_input')])

    wiring = {
        SolidInstance('A', alias='aliased'): {'A_input': DependencyDefinition(a_source.name)}
    }
    pipeline_def = PipelineDefinition(
        name='aliased_pipeline',
        solids=[a_source, create_root_solid('A')],
        dependencies=wiring,
    )

    injected = {'A_input': [{'a key': 'a value'}]}
    solid_result = execute_solid(pipeline_def, 'aliased', inputs=injected)

    assert solid_result.success
    expected = [{'a key': 'a value'}, {'aliased': 'transform_called'}]
    assert solid_result.transformed_value() == expected
def test_mapper_errors():
    """Dependency keys naming unknown solids raise, with and without an alias."""

    @lambda_solid
    def solid_a():
        print('a: 1')
        return 1

    # Plain string key that matches no solid in the list.
    unknown_by_name = {'solid_b': {'arg_a': DependencyDefinition('solid_a')}}
    with pytest.raises(DagsterInvalidDefinitionError) as excinfo_1:
        PipelineDefinition(solids=[solid_a], dependencies=unknown_by_name)

    message_1 = str(excinfo_1.value)
    assert message_1 == 'Solid solid_b in dependency dictionary not found in solid list'

    # Aliased key whose underlying solid name matches no solid in the list.
    unknown_by_alias = {
        SolidInstance('solid_b', alias='solid_c'): {'arg_a': DependencyDefinition('solid_a')}
    }
    with pytest.raises(DagsterInvalidDefinitionError) as excinfo_2:
        PipelineDefinition(solids=[solid_a], dependencies=unknown_by_alias)

    message_2 = str(excinfo_2.value)
    assert (
        message_2
        == 'Solid solid_b (aliased by solid_c in dependency dictionary) not found in solid list'
    )
def define_airline_demo_download_pipeline():
    """Build ``airline_demo_download_pipeline``.

    ``download_from_s3`` is used under two aliases; only the archive download
    feeds ``unzip_archives``.
    """
    return PipelineDefinition(
        name='airline_demo_download_pipeline',
        context_definitions=CONTEXT_DEFINITIONS,
        solids=[download_from_s3, unzip_file],
        dependencies={
            SolidInstance('download_from_s3', 'download_archives'): {},
            SolidInstance('unzip_file', 'unzip_archives'): {
                'archive_paths': DependencyDefinition('download_archives')
            },
            SolidInstance('download_from_s3', 'download_q2_sfo_weather'): {},
        },
    )
def define_test_notebook_dag_pipeline():
    """Build ``test_notebook_dag``: ``mult_two`` consumes ``add_two`` and ``load_b``."""
    loaders = {
        SolidInstance('load_constant', alias='load_a'): {},
        SolidInstance('load_constant', alias='load_b'): {},
    }
    arithmetic = {
        SolidInstance(name='add_two_numbers', alias='add_two'): {
            'a': DependencyDefinition('load_a'),
            'b': DependencyDefinition('load_b'),
        },
        SolidInstance(name='mult_two_numbers', alias='mult_two'): {
            'a': DependencyDefinition('add_two'),
            'b': DependencyDefinition('load_b'),
        },
    }

    dependencies = dict(loaders)
    dependencies.update(arithmetic)

    return PipelineDefinition(
        name='test_notebook_dag',
        solids=[load_constant, add_two_numbers_pm_solid, mult_two_numbers_pm_solid],
        dependencies=dependencies,
    )
def define_spew_pipeline():
    """Build the ``log_spew`` pipeline from five ``nonce_solid`` definitions.

    The definitions are instantiated under the aliases ``solid_a`` through
    ``solid_g`` and wired through their ``input_N`` / ``output_N`` ports.
    """
    solids = [
        nonce_solid('no_in_two_out', 0, 2),
        nonce_solid('one_in_one_out', 1, 1),
        nonce_solid('one_in_two_out', 1, 2),
        nonce_solid('two_in_one_out', 2, 1),
        nonce_solid('one_in_none_out', 1, 0),
    ]

    dependencies = {
        SolidInstance('no_in_two_out', 'solid_a'): {},
        SolidInstance('one_in_one_out', 'solid_b'): {
            'input_0': DependencyDefinition('solid_a', 'output_0')
        },
        SolidInstance('one_in_two_out', 'solid_c'): {
            'input_0': DependencyDefinition('solid_a', 'output_1')
        },
        SolidInstance('two_in_one_out', 'solid_d'): {
            'input_0': DependencyDefinition('solid_b', 'output_0'),
            'input_1': DependencyDefinition('solid_c', 'output_0'),
        },
        SolidInstance('one_in_one_out', 'solid_e'): {
            'input_0': DependencyDefinition('solid_c', 'output_0')
        },
        SolidInstance('two_in_one_out', 'solid_f'): {
            'input_0': DependencyDefinition('solid_d', 'output_0'),
            'input_1': DependencyDefinition('solid_e', 'output_0'),
        },
        SolidInstance('one_in_none_out', 'solid_g'): {
            'input_0': DependencyDefinition('solid_f', 'output_0')
        },
    }

    return PipelineDefinition(name='log_spew', solids=solids, dependencies=dependencies)
def define_part_thirteen_step_three():
    """Build a pipeline computing (a + b) * (c + d) from four ``load_number`` aliases."""
    # One aliased loader per operand, none with upstream dependencies.
    dependencies = {
        SolidInstance(load_number.name, alias=operand): {} for operand in ('a', 'b', 'c', 'd')
    }

    # Two sums, each reading a pair of operands.
    for sum_alias, left, right in (('a_plus_b', 'a', 'b'), ('c_plus_d', 'c', 'd')):
        dependencies[SolidInstance(adder.name, alias=sum_alias)] = {
            'num1': DependencyDefinition(left),
            'num2': DependencyDefinition(right),
        }

    dependencies[SolidInstance(multer.name, alias='final')] = {
        'num1': DependencyDefinition('a_plus_b'),
        'num2': DependencyDefinition('c_plus_d'),
    }

    return PipelineDefinition(
        # NOTE(review): the name says "step_one" although this is step three;
        # possibly a copy-paste leftover -- confirm nothing looks it up by name
        # before changing it.
        name='tutorial_part_thirteen_step_one',
        solids=[load_number, adder, multer],
        dependencies=dependencies,
    )
def test_aliased_solids_context():
    """The context reports the alias as ``solid`` and the definition name as ``solid_definition``."""
    record = defaultdict(set)

    @solid
    def log_things(info):
        solid_value = info.context.get_context_value('solid')
        solid_def_value = info.context.get_context_value('solid_definition')
        record[solid_def_value].add(solid_value)

    aliases = {
        SolidInstance('log_things', alias='log_a'): {},
        SolidInstance('log_things', alias='log_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[log_things], dependencies=aliases)

    run = execute_pipeline(pipeline_def)

    assert run.success
    assert record == {'log_things': {'log_a', 'log_b'}}
def test_composite_basic_execution():
    """Composite solids execute alone, under aliases, nested, and when empty."""
    a_source = define_stub_solid('A_source', [input_set('A_input')])
    node_a = create_root_solid('A')
    node_b = create_solid_with_deps('B', node_a)
    node_c = create_solid_with_deps('C', node_a)
    node_d = create_solid_with_deps('D', node_b, node_c)

    diamond_wiring = {
        'A': {'A_input': DependencyDefinition('A_source')},
        'B': {'A': DependencyDefinition('A')},
        'C': {'A': DependencyDefinition('A')},
        'D': {'B': DependencyDefinition('B'), 'C': DependencyDefinition('C')},
    }
    diamond_composite = CompositeSolidDefinition(
        name='diamond_composite',
        solids=[a_source, node_a, node_b, node_c, node_d],
        dependencies=diamond_wiring,
    )

    # 1. The composite on its own.
    single_run = execute_pipeline(PipelineDefinition(solids=[diamond_composite]))
    assert single_run.success

    # 2. The same composite instantiated twice under different aliases.
    aliased_pipeline = PipelineDefinition(
        solids=[diamond_composite],
        dependencies={
            SolidInstance('diamond_composite', alias='D1'): {},
            SolidInstance('diamond_composite', alias='D2'): {},
        },
    )
    aliased_run = execute_pipeline(aliased_pipeline)
    assert aliased_run.success

    # 3. A composite that wraps the composite, alongside the original.
    wrapped_composite = CompositeSolidDefinition(
        name='wrapped_composite', solids=[diamond_composite]
    )
    nested_run = execute_pipeline(
        PipelineDefinition(solids=[diamond_composite, wrapped_composite])
    )
    assert nested_run.success

    # 4. A composite with no inner solids at all.
    empty_composite = CompositeSolidDefinition(name='empty', solids=[])
    empty_run = execute_pipeline(PipelineDefinition(solids=[empty_composite]))
    assert empty_run.success
def test_aliased_solids_context():
    """``context.solid.name`` is the alias; ``context.solid_def.name`` is the definition name."""
    record = defaultdict(set)

    @solid
    def log_things(context):
        solid_value = context.solid.name
        solid_def_value = context.solid_def.name
        record[solid_def_value].add(solid_value)

    aliases = {
        SolidInstance('log_things', alias='log_a'): {},
        SolidInstance('log_things', alias='log_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[log_things], dependencies=aliases)

    run = execute_pipeline(pipeline_def)

    assert run.success
    assert dict(record) == {'log_things': {'log_a', 'log_b'}}
def define_sleepy_pipeline():
    """Build the ``sleepy`` pipeline: ``giver`` fans out to four sleepers, then ``total``.

    ``sleeper_N`` reads ``giver``'s ``out_N`` output, and ``total``'s ``in_N``
    input reads ``sleeper_N``'s ``total`` output, for N in 1..4.
    """
    branches = (1, 2, 3, 4)

    dependencies = {SolidInstance('giver'): {}}
    for n in branches:
        dependencies[SolidInstance('sleeper', alias='sleeper_{}'.format(n))] = {
            'units': DependencyDefinition('giver', 'out_{}'.format(n))
        }
    dependencies[SolidInstance('total')] = {
        'in_{}'.format(n): DependencyDefinition('sleeper_{}'.format(n), 'total')
        for n in branches
    }

    return PipelineDefinition(
        name="sleepy",
        solids=[giver, sleeper, total],
        dependencies=dependencies,
    )
def define_hammer_pipeline():
    """Build the ``thors_hammer`` pipeline: ``giver`` fans out to four hammers, then ``total``.

    ``hammer_N`` reads ``giver``'s ``out_N`` output, and ``total``'s ``in_N``
    input reads ``hammer_N``'s ``total`` output, for N in 1..4.
    """
    branches = (1, 2, 3, 4)

    dependencies = {SolidInstance('giver'): {}}
    for n in branches:
        dependencies[SolidInstance('hammer', alias='hammer_{}'.format(n))] = {
            'chase_duration': DependencyDefinition('giver', 'out_{}'.format(n))
        }
    dependencies[SolidInstance('total')] = {
        'in_{}'.format(n): DependencyDefinition('hammer_{}'.format(n), 'total')
        for n in branches
    }

    return PipelineDefinition(
        name="thors_hammer",
        solids=[giver, hammer, total],
        dependencies=dependencies,
        mode_definitions=[ModeDefinition()],
    )
def define_pipeline():
    """Build the ``error_monster`` pipeline with a single ``errorable_context``.

    The context takes one boolean config field, ``throw_on_context_init``, is
    created by ``context_init``, and carries the ``errorable_resource``
    resource.
    """
    dependencies = {
        SolidInstance('emit_num', alias='start'): {},
        SolidInstance('num_to_str', alias='middle'): {'num': DependencyDefinition('start')},
        SolidInstance('str_to_num', alias='end'): {'string': DependencyDefinition('middle')},
    }
    errorable_context = PipelineContextDefinition(
        config_field=Field(Dict({'throw_on_context_init': Field(Bool)})),
        context_fn=context_init,
        resources={'errorable_resource': define_errorable_resource()},
    )
    return PipelineDefinition(
        name="error_monster",
        solids=[emit_num, num_to_str, str_to_num],
        dependencies=dependencies,
        context_definitions={'errorable_context': errorable_context},
    )
def define_airline_demo_ingest_pipeline():
    """Build ``airline_demo_ingest_pipeline`` with test/local/prod modes.

    ``s3_to_dw_table`` is instantiated under three aliases. Two presets,
    ``local_fast`` and ``local_full``, both run in ``local`` mode and layer an
    ingest-specific environment file on top of ``local_base.yaml``.
    """

    def local_preset(preset_name, ingest_environment):
        # Shared base environment first, then the preset-specific file.
        return PresetDefinition(
            name=preset_name,
            mode='local',
            environment_files=[
                file_relative_path(__file__, 'environments/local_base.yaml'),
                file_relative_path(__file__, ingest_environment),
            ],
        )

    dependencies = {
        SolidInstance('s3_to_dw_table', alias=table_alias): {}
        for table_alias in (
            'process_q2_coupon_data',
            'process_q2_market_data',
            'process_q2_ticket_data',
        )
    }

    return PipelineDefinition(
        name="airline_demo_ingest_pipeline",
        solids=[process_on_time_data, sfo_weather_data, s3_to_dw_table],
        dependencies=dependencies,
        mode_definitions=[test_mode, local_mode, prod_mode],
        preset_definitions=[
            local_preset('local_fast', 'environments/local_fast_ingest.yaml'),
            local_preset('local_full', 'environments/local_full_ingest.yaml'),
        ],
    )
def define_composites_pipeline():
    """Build ``composites_pipeline``: ``add_four`` feeding ``div_four``.

    Each composite chains two aliases of one inner solid, mapping the
    composite's ``num`` input to the first alias and its output from the
    second. ``add_four`` is itself composed of two ``add_two`` composites.
    """

    @lambda_solid(inputs=[InputDefinition('num', Int)])
    def add_one(num):
        return num + 1

    @lambda_solid(inputs=[InputDefinition('num')])
    def div_two(num):
        return num / 2

    def twice(composite_name, inner, inner_name, first, second, output_type):
        # Composite that runs `inner` as `first` and then as `second`.
        return CompositeSolidDefinition(
            composite_name,
            solids=[inner],
            dependencies={
                SolidInstance(inner_name, first): {},
                SolidInstance(inner_name, second): {'num': DependencyDefinition(first)},
            },
            input_mappings=[InputDefinition('num', Int).mapping_to(first, 'num')],
            output_mappings=[OutputDefinition(output_type).mapping_from(second)],
        )

    add_two = twice('add_two', add_one, 'add_one', 'adder_1', 'adder_2', Int)
    add_four = twice('add_four', add_two, 'add_two', 'adder_1', 'adder_2', Int)
    div_four = twice('div_four', div_two, 'div_two', 'div_1', 'div_2', Float)

    return PipelineDefinition(
        name='composites_pipeline',
        solids=[add_four, div_four],
        dependencies={'div_four': {'num': DependencyDefinition('add_four')}},
    )
def define_airline_demo_ingest_pipeline():
    """Build the Spark-based ``airline_demo_ingest_pipeline``.

    The dependency map is generated from a wiring table. Each row gives the
    solid definition name, the alias it runs under, and the
    ``(input name, upstream alias)`` pairs feeding it. Rows are listed in the
    order they are inserted into the map.
    """
    solids = [
        canonicalize_column_names,
        ingest_csv_to_spark,
        join_spark_data_frames,
        load_data_to_database_from_spark,
        normalize_weather_na_values,
        prefix_column_names,
        subsample_spark_dataset,
        union_spark_data_frames,
    ]

    ingest = 'ingest_csv_to_spark'
    union = 'union_spark_data_frames'
    subsample = 'subsample_spark_dataset'
    normalize = 'normalize_weather_na_values'
    prefix = 'prefix_column_names'
    join = 'join_spark_data_frames'
    canonicalize = 'canonicalize_column_names'
    load = 'load_data_to_database_from_spark'

    def single(upstream):
        # Inputs for a solid that takes one `data_frame`.
        return (('data_frame', upstream),)

    def pair(left, right):
        # Inputs for a solid that takes a left and a right data frame.
        return (('left_data_frame', left), ('right_data_frame', right))

    wiring = (
        # Raw CSV ingestion: no upstream dependencies.
        (ingest, 'ingest_april_on_time_data', ()),
        (ingest, 'ingest_may_on_time_data', ()),
        (ingest, 'ingest_june_on_time_data', ()),
        (ingest, 'ingest_q2_sfo_weather', ()),
        (ingest, 'ingest_q2_coupon_data', ()),
        (ingest, 'ingest_q2_market_data', ()),
        (ingest, 'ingest_q2_ticket_data', ()),
        (ingest, 'ingest_master_cord_data', ()),
        # Combine the three monthly on-time frames.
        (
            union,
            'combine_april_may_on_time_data',
            pair('ingest_april_on_time_data', 'ingest_may_on_time_data'),
        ),
        (
            union,
            'combine_q2_on_time_data',
            pair('combine_april_may_on_time_data', 'ingest_june_on_time_data'),
        ),
        # Subsampling.
        (subsample, 'subsample_q2_on_time_data', single('combine_q2_on_time_data')),
        (subsample, 'subsample_q2_ticket_data', single('ingest_q2_ticket_data')),
        (subsample, 'subsample_q2_market_data', single('ingest_q2_market_data')),
        (subsample, 'subsample_q2_coupon_data', single('ingest_q2_coupon_data')),
        # Weather clean-up and coordinate prefixing.
        (normalize, 'normalize_q2_weather_na_values', single('ingest_q2_sfo_weather')),
        (prefix, 'prefix_dest_cord_data', single('ingest_master_cord_data')),
        (prefix, 'prefix_origin_cord_data', single('ingest_master_cord_data')),
        # Join on-time data with destination, then origin, coordinates.
        (
            join,
            'join_q2_on_time_data_to_dest_cord_data',
            pair('subsample_q2_on_time_data', 'prefix_dest_cord_data'),
        ),
        (
            join,
            'join_q2_on_time_data_to_origin_cord_data',
            pair('join_q2_on_time_data_to_dest_cord_data', 'prefix_origin_cord_data'),
        ),
        # Column-name canonicalization.
        (
            canonicalize,
            'canonicalize_q2_on_time_data',
            single('join_q2_on_time_data_to_origin_cord_data'),
        ),
        (canonicalize, 'canonicalize_q2_coupon_data', single('subsample_q2_coupon_data')),
        (canonicalize, 'canonicalize_q2_market_data', single('subsample_q2_market_data')),
        (canonicalize, 'canonicalize_q2_ticket_data', single('subsample_q2_ticket_data')),
        (
            canonicalize,
            'canonicalize_q2_sfo_weather',
            single('normalize_q2_weather_na_values'),
        ),
        # Database loads.
        (load, 'load_q2_on_time_data', single('canonicalize_q2_on_time_data')),
        (load, 'load_q2_coupon_data', single('canonicalize_q2_coupon_data')),
        (load, 'load_q2_market_data', single('canonicalize_q2_market_data')),
        (load, 'load_q2_ticket_data', single('canonicalize_q2_ticket_data')),
        (load, 'load_q2_sfo_weather', single('canonicalize_q2_sfo_weather')),
    )

    dependencies = {}
    for solid_name, alias, inputs in wiring:
        dependencies[SolidInstance(solid_name, alias=alias)] = {
            input_name: DependencyDefinition(upstream) for input_name, upstream in inputs
        }

    return PipelineDefinition(
        name="airline_demo_ingest_pipeline",
        solids=solids,
        dependencies=dependencies,
        context_definitions=CONTEXT_DEFINITIONS,
    )