Ejemplo n.º 1
0
def define_error_monster_pipeline():
    """Build the 'error_monster' pipeline.

    Three solid instances are wired in a line: 'start' (emit_num) feeds the
    'num' input of 'middle' (num_to_str), which feeds the 'string' input of
    'end' (str_to_num). The pipeline exposes one mode, 'errorable_mode',
    carrying the 'errorable_resource' resource, and one preset, 'passing',
    which loads environments/error.yaml relative to this file.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [emit_num, num_to_str, str_to_num]

    # Linear chain; each downstream entry refers to the upstream instance name.
    dependency_graph = {
        SolidInstance('emit_num', 'start'): {},
        SolidInstance('num_to_str', 'middle'): {'num': DependencyDefinition('start')},
        SolidInstance('str_to_num', 'end'): {'string': DependencyDefinition('middle')},
    }

    errorable_mode = ModeDefinition(
        name='errorable_mode',
        resources={'errorable_resource': define_errorable_resource()},
    )

    passing_preset = PresetDefinition(
        'passing',
        environment_files=[file_relative_path(__file__, 'environments/error.yaml')],
        mode='errorable_mode',
    )

    return PipelineDefinition(
        name='error_monster',
        solids=solid_defs,
        dependencies=dependency_graph,
        mode_definitions=[errorable_mode],
        preset_definitions=[passing_preset],
    )
Ejemplo n.º 2
0
def test_required_inputs():
    """Verify that an unconnected Int input appears as required config.

    'first_add' has no upstream dependency, so its environment config type
    must expose a required 'inputs' field with a required 'num' entry.
    'second_add' receives 'num' from 'first_add', so it must not expose an
    'inputs' field at all.
    """

    @lambda_solid(inputs=[InputDefinition('num', types.Int)], output=OutputDefinition(types.Int))
    def add_one(num):
        return num + 1

    dependency_graph = {
        SolidInstance('add_one', 'first_add'): {},
        SolidInstance('add_one', 'second_add'): {'num': DependencyDefinition('first_add')},
    }
    pipeline_def = PipelineDefinition(
        name='required_int_input', solids=[add_one], dependencies=dependency_graph
    )

    # Drill down from the environment type to the per-solid config types.
    solids_config_type = pipeline_def.environment_type.fields['solids'].config_type

    first_add_fields = solids_config_type.fields['first_add'].config_type.fields
    assert 'inputs' in first_add_fields

    first_add_inputs = first_add_fields['inputs']
    assert first_add_inputs.is_required
    assert first_add_inputs.config_type.fields['num'].is_required

    # second_add's only input is satisfied by a dependency, so nothing is configurable.
    second_add_fields = solids_config_type.fields['second_add'].config_type.fields
    assert 'inputs' not in second_add_fields
Ejemplo n.º 3
0
def define_event_ingest_pipeline():
    """Build the 'event_ingest_pipeline' definition.

    Wiring, as declared below: download_from_s3_to_file -> gunzipper
    ('gzip_file') -> event_ingest ('spark_inputs') -> snowflake_load ('start',
    fed from the 'paths' output of event_ingest). A single unnamed mode
    supplies the 's3' and 'snowflake' resources.

    Returns:
        The constructed PipelineDefinition.
    """
    spark_ingest_solid = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    snowflake_load_solid = SnowflakeLoadSolidDefinition(
        'snowflake_load',
        # TODO: need to pull this out to a config
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
        table='events',
    )

    solid_defs = [download_from_s3_to_file, gunzipper, spark_ingest_solid, snowflake_load_solid]

    dependency_graph = {
        SolidInstance('gunzipper'): {
            'gzip_file': DependencyDefinition('download_from_s3_to_file'),
        },
        SolidInstance('event_ingest'): {
            'spark_inputs': DependencyDefinition('gunzipper'),
        },
        SolidInstance('snowflake_load'): {
            'start': DependencyDefinition('event_ingest', 'paths'),
        },
    }

    default_mode = ModeDefinition(resources={'s3': s3_resource, 'snowflake': snowflake_resource})

    return PipelineDefinition(
        name='event_ingest_pipeline',
        solids=solid_defs,
        dependencies=dependency_graph,
        mode_definitions=[default_mode],
    )
Ejemplo n.º 4
0
def test_aliased_solids():
    """Chain one solid definition three times via aliases and check the output.

    'not_first' is used once under its own name and twice more as 'second'
    and 'third'; each use appends 'not_first' to the list it receives.
    """

    @lambda_solid()
    def first():
        return ['first']

    @lambda_solid(inputs=[InputDefinition(name="prev")])
    def not_first(prev):
        return prev + ['not_first']

    # first -> not_first -> second -> third
    dependency_graph = {
        'not_first': {'prev': DependencyDefinition('first')},
        SolidInstance('not_first', alias='second'): {'prev': DependencyDefinition('not_first')},
        SolidInstance('not_first', alias='third'): {'prev': DependencyDefinition('second')},
    }
    pipeline_def = PipelineDefinition(solids=[first, not_first], dependencies=dependency_graph)

    run_result = execute_pipeline(pipeline_def)
    assert run_result.success

    third_result = run_result.result_for_solid('third')
    expected_value = ['first', 'not_first', 'not_first', 'not_first']
    assert third_result.transformed_value() == expected_value
Ejemplo n.º 5
0
def test_aliased_configs():
    """Run one config-driven solid under two aliases with different config.

    Each alias ('load_a', 'load_b') is configured separately through
    config.Environment and must return its own configured integer.
    """

    @solid(
        inputs=[],
        config_def=ConfigDefinition(types.Int),
    )
    def load_constant(info):
        return info.config

    dependency_graph = {
        SolidInstance(load_constant.name, 'load_a'): {},
        SolidInstance(load_constant.name, 'load_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[load_constant], dependencies=dependency_graph)

    environment = config.Environment(
        solids={
            'load_a': config.Solid(2),
            'load_b': config.Solid(3),
        }
    )
    run_result = execute_pipeline(pipeline_def, environment)

    assert run_result.success
    assert run_result.result_for_solid('load_a').transformed_value() == 2
    assert run_result.result_for_solid('load_b').transformed_value() == 3
Ejemplo n.º 6
0
def test_aliased_configs():
    """Run one config-driven solid under two aliases with different config.

    The environment is passed as a plain dict; each alias ('load_a', 'load_b')
    gets its own 'config' value and must return it.
    """

    @solid(inputs=[], config_field=Field(Int))
    def load_constant(info):
        return info.config

    dependency_graph = {
        SolidInstance(load_constant.name, 'load_a'): {},
        SolidInstance(load_constant.name, 'load_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[load_constant], dependencies=dependency_graph)

    environment_dict = {
        'solids': {
            'load_a': {'config': 2},
            'load_b': {'config': 3},
        }
    }
    run_result = execute_pipeline(pipeline_def, environment_dict)

    assert run_result.success
    assert run_result.result_for_solid('load_a').transformed_value() == 2
    assert run_result.result_for_solid('load_b').transformed_value() == 3
Ejemplo n.º 7
0
def define_airline_demo_warehouse_pipeline():
    """Build the 'airline_demo_warehouse_pipeline' definition.

    Four solids are declared without upstream dependencies; the remaining
    entries derive tables/plots from them, and three aliased uses of
    'upload_to_s3' each take one upstream result as 'file_obj'. Contexts come
    from the module-level CONTEXT_DEFINITIONS.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [
        average_sfo_outbound_avg_delays_by_destination,
        delays_by_geography,
        delays_vs_fares,
        delays_vs_fares_nb,
        eastbound_delays,
        q2_sfo_outbound_flights,
        sfo_delays_by_destination,
        tickets_with_destination,
        upload_to_s3,
        westbound_delays,
    ]

    dependency_graph = {
        # Solids declared with no upstream dependencies.
        'q2_sfo_outbound_flights': {},
        'tickets_with_destination': {},
        'westbound_delays': {},
        'eastbound_delays': {},
        'average_sfo_outbound_avg_delays_by_destination': {
            'q2_sfo_outbound_flights': DependencyDefinition('q2_sfo_outbound_flights'),
        },
        'delays_vs_fares': {
            'tickets_with_destination': DependencyDefinition('tickets_with_destination'),
            'average_sfo_outbound_avg_delays_by_destination': DependencyDefinition(
                'average_sfo_outbound_avg_delays_by_destination'
            ),
        },
        # NOTE(review): no object called fares_vs_delays appears in the solids
        # list; presumably delays_vs_fares_nb is registered under this name -- confirm.
        'fares_vs_delays': {
            'table_name': DependencyDefinition('delays_vs_fares'),
        },
        'sfo_delays_by_destination': {
            'table_name': DependencyDefinition(
                'average_sfo_outbound_avg_delays_by_destination'
            ),
        },
        'delays_by_geography': {
            'eastbound_delays': DependencyDefinition('eastbound_delays'),
            'westbound_delays': DependencyDefinition('westbound_delays'),
        },
        # One upload_to_s3 instance per upstream result.
        SolidInstance('upload_to_s3', alias='upload_outbound_avg_delay_pdf_plots'): {
            'file_obj': DependencyDefinition('sfo_delays_by_destination'),
        },
        SolidInstance('upload_to_s3', alias='upload_delays_vs_fares_pdf_plots'): {
            'file_obj': DependencyDefinition('fares_vs_delays'),
        },
        SolidInstance('upload_to_s3', alias='upload_delays_by_geography_pdf_plots'): {
            'file_obj': DependencyDefinition('delays_by_geography'),
        },
    }

    return PipelineDefinition(
        name="airline_demo_warehouse_pipeline",
        solids=solid_defs,
        dependencies=dependency_graph,
        context_definitions=CONTEXT_DEFINITIONS,
    )
Ejemplo n.º 8
0
def define_tutorial_pipeline():
    """Build the 'tutorial_pipeline' definition.

    'clean_data' has no upstream dependency; both 'linear_regression' and
    'random_forest_regression' take its output as their 'df' input.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [clean_data_solid, LR_solid, RF_solid]

    # Fan-out: one cleaning step feeds two independent regressions.
    dependency_graph = {
        SolidInstance('clean_data'): {},
        SolidInstance('linear_regression'): {
            'df': DependencyDefinition('clean_data'),
        },
        SolidInstance('random_forest_regression'): {
            'df': DependencyDefinition('clean_data'),
        },
    }

    return PipelineDefinition(
        name='tutorial_pipeline',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
def test_nothing_inputs():
    """Mix Nothing-typed and Int-typed inputs on one solid and run the pipeline.

    'adder' declares three Nothing inputs ('_one', '_two', '_three') alongside
    three Int inputs; only the Int inputs are parameters of its function. The
    Nothing inputs are fed by three aliased uses of 'emit_nothing'.
    """

    # Declares a Nothing input that is never wired in the dependency dict below.
    @lambda_solid(inputs=[InputDefinition('never_defined', Nothing)])
    def emit_one():
        return 1

    @lambda_solid
    def emit_two():
        return 2

    @lambda_solid
    def emit_three():
        return 3

    @lambda_solid(output=OutputDefinition(Nothing))
    def emit_nothing():
        pass

    @solid(
        inputs=[
            InputDefinition('_one', Nothing),
            InputDefinition('one', Int),
            InputDefinition('_two', Nothing),
            InputDefinition('two', Int),
            InputDefinition('_three', Nothing),
            InputDefinition('three', Int),
        ]
    )
    def adder(_context, one, two, three):
        assert one == 1
        assert two == 2
        assert three == 3
        return one + two + three

    solid_defs = [emit_one, emit_two, emit_three, emit_nothing, adder]

    dependency_graph = {
        SolidInstance('emit_nothing', '_one'): {},
        SolidInstance('emit_nothing', '_two'): {},
        SolidInstance('emit_nothing', '_three'): {},
        'adder': {
            '_one': DependencyDefinition('_one'),
            '_two': DependencyDefinition('_two'),
            '_three': DependencyDefinition('_three'),
            'one': DependencyDefinition('emit_one'),
            'two': DependencyDefinition('emit_two'),
            'three': DependencyDefinition('emit_three'),
        },
    }

    pipeline_def = PipelineDefinition(
        name='input_test', solids=solid_defs, dependencies=dependency_graph
    )

    run_result = execute_pipeline(pipeline_def)
    assert run_result.success
def define_part_thirteen_step_two():
    """Build the 'thirteen_step_two' pipeline.

    'load_number' is used twice, as 'load_a' and 'load_b'; their outputs are
    the 'num1' and 'num2' inputs of 'adder', aliased as 'a_plus_b'.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [load_number, adder]

    dependency_graph = {
        SolidInstance('load_number', alias='load_a'): {},
        SolidInstance('load_number', alias='load_b'): {},
        SolidInstance('adder', alias='a_plus_b'): {
            'num1': DependencyDefinition('load_a'),
            'num2': DependencyDefinition('load_b'),
        },
    }

    return PipelineDefinition(
        name='thirteen_step_two',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
def test_fanin_deps():
    """Fan three Nothing outputs into a single input and check call counts.

    'adder' takes a Nothing input 'ready' fed by a MultiDependencyDefinition
    over three aliased 'emit_nothing' instances. Inside 'adder' the test
    asserts that all three have already run; afterwards it asserts that
    'adder' ran exactly once.
    """
    call_counts = defaultdict(int)

    @lambda_solid
    def emit_two():
        return 2

    @lambda_solid(output=OutputDefinition(Nothing))
    def emit_nothing():
        call_counts['emit_nothing'] += 1

    @solid(
        inputs=[
            InputDefinition('ready', Nothing),
            InputDefinition('num_1', Int),
            InputDefinition('num_2', Int),
        ]
    )
    def adder(_context, num_1, num_2):
        # All three fan-in sources must have executed before this solid runs.
        assert call_counts['emit_nothing'] == 3
        call_counts['adder'] += 1
        return num_1 + num_2

    ready_fan_in = MultiDependencyDefinition(
        [
            DependencyDefinition('_one'),
            DependencyDefinition('_two'),
            DependencyDefinition('_three'),
        ]
    )

    pipeline_def = PipelineDefinition(
        name='input_test',
        solids=[emit_two, emit_nothing, adder],
        dependencies={
            SolidInstance('emit_two', 'emit_1'): {},
            SolidInstance('emit_two', 'emit_2'): {},
            SolidInstance('emit_nothing', '_one'): {},
            SolidInstance('emit_nothing', '_two'): {},
            SolidInstance('emit_nothing', '_three'): {},
            'adder': {
                'ready': ready_fan_in,
                'num_1': DependencyDefinition('emit_1'),
                'num_2': DependencyDefinition('emit_2'),
            },
        },
    )

    run_result = execute_pipeline(pipeline_def)
    assert run_result.success
    assert call_counts['adder'] == 1
    assert call_counts['emit_nothing'] == 3
Ejemplo n.º 12
0
def define_reusable_solids_pipeline():
    """Build the 'reusable_solids_pipeline', shaped as (a + b) * (c + d).

    'adder' is instantiated twice ('a_plus_b', 'c_plus_d') with no upstream
    dependencies; 'multer', aliased 'final', takes the two sums as 'num1' and
    'num2'.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [adder, multer]

    dependency_graph = {
        SolidInstance('adder', 'a_plus_b'): {},
        SolidInstance('adder', 'c_plus_d'): {},
        SolidInstance('multer', 'final'): {
            'num1': DependencyDefinition('a_plus_b'),
            'num2': DependencyDefinition('c_plus_d'),
        },
    }

    return PipelineDefinition(
        name='reusable_solids_pipeline',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
Ejemplo n.º 13
0
def define_part_fourteen_step_one_pipeline():
    """Build 'part_fourteen_step_one_pipeline', shaped as (a + b) * (c + d).

    Solid names are taken from the definitions' ``.name`` attributes rather
    than spelled as literals. 'a_plus_b' and 'c_plus_d' are two adder
    instances; 'final' is a multer instance consuming both.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [adder, multer]

    dependency_graph = {
        SolidInstance(adder.name, 'a_plus_b'): {},
        SolidInstance(adder.name, 'c_plus_d'): {},
        SolidInstance(multer.name, 'final'): {
            'num1': DependencyDefinition('a_plus_b'),
            'num2': DependencyDefinition('c_plus_d'),
        },
    }

    return PipelineDefinition(
        name='part_fourteen_step_one_pipeline',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
Ejemplo n.º 14
0
def test_create_single_solid_pipeline_with_alias():
    """Isolate an aliased solid with create_single_solid_pipeline and run it.

    Solid 'A' is aliased as 'aliased' and normally fed by 'A_source'. The
    single-solid pipeline replaces its 'A_input' with a stub solid, and the
    test checks the resulting value for 'aliased'.
    """
    a_source = define_stub_solid('A_source', [input_set('A_input')])
    stub_solid = define_stub_solid('stub', [{'a_key': 'stubbed_thing'}])

    full_pipeline = PipelineDefinition(
        solids=[a_source, create_root_solid('A')],
        dependencies={
            SolidInstance('A', alias='aliased'): {
                'A_input': DependencyDefinition(a_source.name),
            },
        },
    )

    # Maps solid name -> input name -> stub solid supplying that input.
    injected_inputs = {'aliased': {'A_input': stub_solid}}

    single_solid_pipeline = PipelineDefinition.create_single_solid_pipeline(
        full_pipeline, 'aliased', injected_inputs
    )

    run_result = execute_pipeline(single_solid_pipeline)
    assert run_result.success

    expected_value = [{'a_key': 'stubbed_thing'}, {'A': 'transform_called'}]
    assert run_result.result_for_solid('aliased').transformed_value() == expected_value
Ejemplo n.º 15
0
def test_string_from_aliased_inputs():
    """Supply a String input through config, addressed by the solid's alias.

    The solid is only reachable as 'aliased'; the environment dict provides
    'string_input' under that key, and the solid body records that it ran.
    """
    invocations = {}

    @solid(inputs=[InputDefinition('string_input', types.String)])
    def str_as_input(_context, string_input):
        assert string_input == 'foo'
        invocations['yup'] = True

    pipeline_def = PipelineDefinition(
        solids=[str_as_input],
        dependencies={SolidInstance('str_as_input', alias='aliased'): {}},
    )

    environment_dict = {
        'solids': {
            'aliased': {
                'inputs': {
                    'string_input': {'value': 'foo'},
                },
            },
        },
    }
    run_result = execute_pipeline(pipeline_def, environment_dict)

    assert run_result.success
    assert invocations['yup']
Ejemplo n.º 16
0
def test_execute_aliased_solid_in_diamond():
    """Execute a single aliased solid via execute_solid with injected inputs.

    Solid 'A' is aliased as 'aliased' in 'aliased_pipeline'; the test runs
    just that solid, passing 'A_input' directly, and checks the output.
    """
    a_source = define_stub_solid('A_source', [input_set('A_input')])

    dependency_graph = {
        SolidInstance('A', alias='aliased'): {
            'A_input': DependencyDefinition(a_source.name),
        },
    }
    pipeline_def = PipelineDefinition(
        name='aliased_pipeline',
        solids=[a_source, create_root_solid('A')],
        dependencies=dependency_graph,
    )

    injected_inputs = {'A_input': [{'a key': 'a value'}]}
    solid_result = execute_solid(pipeline_def, 'aliased', inputs=injected_inputs)

    assert solid_result.success

    expected_value = [{'a key': 'a value'}, {'aliased': 'transform_called'}]
    assert solid_result.transformed_value() == expected_value
Ejemplo n.º 17
0
def test_mapper_errors():
    """Check the error messages for dependency keys naming unknown solids.

    Two cases are covered: a plain string key ('solid_b') and an aliased
    SolidInstance key ('solid_b' aliased as 'solid_c'). Both must raise
    DagsterInvalidDefinitionError with the exact messages asserted below.
    """

    @lambda_solid
    def solid_a():
        print('a: 1')
        return 1

    # Case 1: plain string key that matches no solid in the list.
    with pytest.raises(DagsterInvalidDefinitionError) as plain_key_error:
        PipelineDefinition(
            solids=[solid_a],
            dependencies={'solid_b': {'arg_a': DependencyDefinition('solid_a')}},
        )
    plain_key_message = str(plain_key_error.value)
    assert plain_key_message == 'Solid solid_b in dependency dictionary not found in solid list'

    # Case 2: aliased key whose underlying solid name matches no solid in the list.
    with pytest.raises(DagsterInvalidDefinitionError) as aliased_key_error:
        PipelineDefinition(
            solids=[solid_a],
            dependencies={
                SolidInstance('solid_b', alias='solid_c'): {
                    'arg_a': DependencyDefinition('solid_a'),
                },
            },
        )
    aliased_key_message = str(aliased_key_error.value)
    assert (
        aliased_key_message
        == 'Solid solid_b (aliased by solid_c in dependency dictionary) not found in solid list'
    )
Ejemplo n.º 18
0
def define_airline_demo_download_pipeline():
    """Build the 'airline_demo_download_pipeline' definition.

    'download_from_s3' is used twice ('download_archives' and
    'download_q2_sfo_weather'); 'unzip_file', aliased 'unzip_archives', takes
    the 'download_archives' output as 'archive_paths'. Contexts come from the
    module-level CONTEXT_DEFINITIONS.

    Returns:
        The constructed PipelineDefinition.
    """
    return PipelineDefinition(
        name='airline_demo_download_pipeline',
        solids=[download_from_s3, unzip_file],
        dependencies={
            SolidInstance('download_from_s3', alias='download_archives'): {},
            SolidInstance('unzip_file', alias='unzip_archives'): {
                'archive_paths': DependencyDefinition('download_archives'),
            },
            SolidInstance('download_from_s3', alias='download_q2_sfo_weather'): {},
        },
        context_definitions=CONTEXT_DEFINITIONS,
    )
Ejemplo n.º 19
0
def define_test_notebook_dag_pipeline():
    """Build the 'test_notebook_dag' pipeline.

    Two 'load_constant' instances ('load_a', 'load_b') feed 'add_two'
    (add_two_numbers); 'mult_two' (mult_two_numbers) then takes the 'add_two'
    result as 'a' and 'load_b' as 'b'.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [load_constant, add_two_numbers_pm_solid, mult_two_numbers_pm_solid]

    dependency_graph = {
        SolidInstance('load_constant', alias='load_a'): {},
        SolidInstance('load_constant', alias='load_b'): {},
        SolidInstance(name='add_two_numbers', alias='add_two'): {
            'a': DependencyDefinition('load_a'),
            'b': DependencyDefinition('load_b'),
        },
        # load_b is consumed a second time here, alongside the sum.
        SolidInstance(name='mult_two_numbers', alias='mult_two'): {
            'a': DependencyDefinition('add_two'),
            'b': DependencyDefinition('load_b'),
        },
    }

    return PipelineDefinition(
        name='test_notebook_dag',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
Ejemplo n.º 20
0
def define_spew_pipeline():
    """Build the 'log_spew' pipeline out of nonce solids.

    Five solids are created with nonce_solid(name, <int>, <int>) and then
    instantiated as 'solid_a' .. 'solid_g'. The dependency dict below wires
    them as: a -> b, a -> c, (b, c) -> d, c -> e, (d, e) -> f, f -> g.

    Returns:
        The constructed PipelineDefinition.
    """
    # NOTE(review): the two ints look like input/output counts, judging by the
    # names and the 'input_N'/'output_N' keys used below -- confirm in nonce_solid.
    solid_defs = [
        nonce_solid('no_in_two_out', 0, 2),
        nonce_solid('one_in_one_out', 1, 1),
        nonce_solid('one_in_two_out', 1, 2),
        nonce_solid('two_in_one_out', 2, 1),
        nonce_solid('one_in_none_out', 1, 0),
    ]

    dependency_graph = {
        SolidInstance('no_in_two_out', alias='solid_a'): {},
        SolidInstance('one_in_one_out', alias='solid_b'): {
            'input_0': DependencyDefinition('solid_a', 'output_0'),
        },
        SolidInstance('one_in_two_out', alias='solid_c'): {
            'input_0': DependencyDefinition('solid_a', 'output_1'),
        },
        SolidInstance('two_in_one_out', alias='solid_d'): {
            'input_0': DependencyDefinition('solid_b', 'output_0'),
            'input_1': DependencyDefinition('solid_c', 'output_0'),
        },
        SolidInstance('one_in_one_out', alias='solid_e'): {
            'input_0': DependencyDefinition('solid_c', 'output_0'),
        },
        SolidInstance('two_in_one_out', alias='solid_f'): {
            'input_0': DependencyDefinition('solid_d', 'output_0'),
            'input_1': DependencyDefinition('solid_e', 'output_0'),
        },
        SolidInstance('one_in_none_out', alias='solid_g'): {
            'input_0': DependencyDefinition('solid_f', 'output_0'),
        },
    }

    return PipelineDefinition(
        name='log_spew',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
def define_part_thirteen_step_three():
    """Build a pipeline shaped as (a + b) * (c + d).

    Four 'load_number' instances ('a'..'d') feed two 'adder' instances
    ('a_plus_b', 'c_plus_d'), whose results feed a 'multer' instance 'final'.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [load_number, adder, multer]

    dependency_graph = {
        # Leaves: four independent number loaders.
        SolidInstance(load_number.name, 'a'): {},
        SolidInstance(load_number.name, 'b'): {},
        SolidInstance(load_number.name, 'c'): {},
        SolidInstance(load_number.name, 'd'): {},
        # Two sums.
        SolidInstance(adder.name, 'a_plus_b'): {
            'num1': DependencyDefinition('a'),
            'num2': DependencyDefinition('b'),
        },
        SolidInstance(adder.name, 'c_plus_d'): {
            'num1': DependencyDefinition('c'),
            'num2': DependencyDefinition('d'),
        },
        # Product of the two sums.
        SolidInstance(multer.name, 'final'): {
            'num1': DependencyDefinition('a_plus_b'),
            'num2': DependencyDefinition('c_plus_d'),
        },
    }

    # NOTE(review): the pipeline name says 'step_one' although this function is
    # step three -- possibly a copy-paste leftover; confirm before renaming,
    # since callers may look the pipeline up by this string.
    return PipelineDefinition(
        name='tutorial_part_thirteen_step_one',
        solids=solid_defs,
        dependencies=dependency_graph,
    )
Ejemplo n.º 22
0
def test_aliased_solids_context():
    """Check the 'solid' and 'solid_definition' context values under aliasing.

    One solid definition runs under two aliases; each run records its 'solid'
    context value under its 'solid_definition' context value. Both aliases
    must end up grouped under the single definition name.
    """
    seen_by_definition = defaultdict(set)

    @solid
    def log_things(info):
        instance_name = info.context.get_context_value('solid')
        definition_name = info.context.get_context_value('solid_definition')
        seen_by_definition[definition_name].add(instance_name)

    dependency_graph = {
        SolidInstance('log_things', 'log_a'): {},
        SolidInstance('log_things', 'log_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[log_things], dependencies=dependency_graph)

    run_result = execute_pipeline(pipeline_def)
    assert run_result.success

    assert seen_by_definition == {'log_things': {'log_a', 'log_b'}}
Ejemplo n.º 23
0
def test_composite_basic_execution():
    """Execute composite solids in several pipeline arrangements.

    Covers, in order: a diamond composite on its own, the same composite
    under two aliases, a composite wrapping the diamond composite, and a
    composite with no solids. Each pipeline run must succeed.
    """
    a_source = define_stub_solid('A_source', [input_set('A_input')])
    solid_a = create_root_solid('A')
    solid_b = create_solid_with_deps('B', solid_a)
    solid_c = create_solid_with_deps('C', solid_a)
    solid_d = create_solid_with_deps('D', solid_b, solid_c)

    # A_source -> A -> (B, C) -> D
    diamond_composite = CompositeSolidDefinition(
        name='diamond_composite',
        solids=[a_source, solid_a, solid_b, solid_c, solid_d],
        dependencies={
            'A': {'A_input': DependencyDefinition('A_source')},
            'B': {'A': DependencyDefinition('A')},
            'C': {'A': DependencyDefinition('A')},
            'D': {
                'B': DependencyDefinition('B'),
                'C': DependencyDefinition('C'),
            },
        },
    )

    # 1. The composite as the only solid of a pipeline.
    single_result = execute_pipeline(PipelineDefinition(solids=[diamond_composite]))
    assert single_result.success

    # 2. The same composite used twice under different aliases.
    aliased_pipeline = PipelineDefinition(
        solids=[diamond_composite],
        dependencies={
            SolidInstance('diamond_composite', alias='D1'): {},
            SolidInstance('diamond_composite', alias='D2'): {},
        },
    )
    aliased_result = execute_pipeline(aliased_pipeline)
    assert aliased_result.success

    # 3. A composite nested inside another composite.
    wrapped_composite = CompositeSolidDefinition(
        name='wrapped_composite', solids=[diamond_composite]
    )
    nested_pipeline = PipelineDefinition(solids=[diamond_composite, wrapped_composite])
    nested_result = execute_pipeline(nested_pipeline)
    assert nested_result.success

    # 4. A composite that contains no solids.
    empty_composite = CompositeSolidDefinition(name='empty', solids=[])
    empty_result = execute_pipeline(PipelineDefinition(solids=[empty_composite]))
    assert empty_result.success
Ejemplo n.º 24
0
def test_aliased_solids_context():
    """Check context.solid.name and context.solid_def.name under aliasing.

    One solid definition runs under two aliases; each run records
    context.solid.name under context.solid_def.name. Both aliases must end up
    grouped under the single definition name.
    """
    seen_by_definition = defaultdict(set)

    @solid
    def log_things(context):
        instance_name = context.solid.name
        definition_name = context.solid_def.name
        seen_by_definition[definition_name].add(instance_name)

    dependency_graph = {
        SolidInstance('log_things', 'log_a'): {},
        SolidInstance('log_things', 'log_b'): {},
    }
    pipeline_def = PipelineDefinition(solids=[log_things], dependencies=dependency_graph)

    run_result = execute_pipeline(pipeline_def)
    assert run_result.success

    assert dict(seen_by_definition) == {'log_things': {'log_a', 'log_b'}}
Ejemplo n.º 25
0
def define_sleepy_pipeline():
    """Build the 'sleepy' pipeline.

    'giver' exposes outputs 'out_1'..'out_4'; each one feeds the 'units'
    input of a separate 'sleeper' instance ('sleeper_1'..'sleeper_4'). The
    'total' output of every sleeper is then collected by the 'total' solid as
    'in_1'..'in_4'.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [giver, sleeper, total]

    dependency_graph = {
        SolidInstance('giver'): {},
        # Fan-out: one sleeper per giver output.
        SolidInstance('sleeper', alias='sleeper_1'): {
            'units': DependencyDefinition('giver', 'out_1'),
        },
        SolidInstance('sleeper', alias='sleeper_2'): {
            'units': DependencyDefinition('giver', 'out_2'),
        },
        SolidInstance('sleeper', alias='sleeper_3'): {
            'units': DependencyDefinition('giver', 'out_3'),
        },
        SolidInstance('sleeper', alias='sleeper_4'): {
            'units': DependencyDefinition('giver', 'out_4'),
        },
        # Fan-in: collect every sleeper's 'total' output.
        SolidInstance('total'): {
            'in_1': DependencyDefinition('sleeper_1', 'total'),
            'in_2': DependencyDefinition('sleeper_2', 'total'),
            'in_3': DependencyDefinition('sleeper_3', 'total'),
            'in_4': DependencyDefinition('sleeper_4', 'total'),
        },
    }

    return PipelineDefinition(
        name="sleepy",
        solids=solid_defs,
        dependencies=dependency_graph,
    )
Ejemplo n.º 26
0
def define_hammer_pipeline():
    """Build the 'thors_hammer' pipeline.

    'giver' exposes outputs 'out_1'..'out_4'; each one feeds the
    'chase_duration' input of a separate 'hammer' instance
    ('hammer_1'..'hammer_4'). The 'total' output of every hammer is then
    collected by the 'total' solid as 'in_1'..'in_4'. A single default
    ModeDefinition is attached.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [giver, hammer, total]

    dependency_graph = {
        SolidInstance('giver'): {},
        # Fan-out: one hammer per giver output.
        SolidInstance('hammer', alias='hammer_1'): {
            'chase_duration': DependencyDefinition('giver', 'out_1'),
        },
        SolidInstance('hammer', alias='hammer_2'): {
            'chase_duration': DependencyDefinition('giver', 'out_2'),
        },
        SolidInstance('hammer', alias='hammer_3'): {
            'chase_duration': DependencyDefinition('giver', 'out_3'),
        },
        SolidInstance('hammer', alias='hammer_4'): {
            'chase_duration': DependencyDefinition('giver', 'out_4'),
        },
        # Fan-in: collect every hammer's 'total' output.
        SolidInstance('total'): {
            'in_1': DependencyDefinition('hammer_1', 'total'),
            'in_2': DependencyDefinition('hammer_2', 'total'),
            'in_3': DependencyDefinition('hammer_3', 'total'),
            'in_4': DependencyDefinition('hammer_4', 'total'),
        },
    }

    return PipelineDefinition(
        name="thors_hammer",
        solids=solid_defs,
        dependencies=dependency_graph,
        mode_definitions=[ModeDefinition()],
    )
Ejemplo n.º 27
0
def define_pipeline():
    """Build the 'error_monster' pipeline with a configurable context.

    The solids form a line: 'start' (emit_num) -> 'middle' (num_to_str) ->
    'end' (str_to_num). A single context, 'errorable_context', takes a Bool
    config field 'throw_on_context_init', is created by context_init, and
    carries the 'errorable_resource' resource.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [emit_num, num_to_str, str_to_num]

    dependency_graph = {
        SolidInstance('emit_num', 'start'): {},
        SolidInstance('num_to_str', 'middle'): {'num': DependencyDefinition('start')},
        SolidInstance('str_to_num', 'end'): {'string': DependencyDefinition('middle')},
    }

    errorable_context = PipelineContextDefinition(
        config_field=Field(Dict({'throw_on_context_init': Field(Bool)})),
        context_fn=context_init,
        resources={'errorable_resource': define_errorable_resource()},
    )

    return PipelineDefinition(
        name="error_monster",
        solids=solid_defs,
        dependencies=dependency_graph,
        context_definitions={'errorable_context': errorable_context},
    )
Ejemplo n.º 28
0
def define_airline_demo_ingest_pipeline():
    """Build the 'airline_demo_ingest_pipeline' with modes and presets.

    's3_to_dw_table' is instantiated three times (coupon, market and ticket
    data) with no upstream dependencies. The pipeline offers the test, local
    and prod modes and two presets for the 'local' mode, 'local_fast' and
    'local_full', each layering an ingest-specific YAML file on top of
    environments/local_base.yaml.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [process_on_time_data, sfo_weather_data, s3_to_dw_table]

    dependency_graph = {
        SolidInstance('s3_to_dw_table', alias='process_q2_coupon_data'): {},
        SolidInstance('s3_to_dw_table', alias='process_q2_market_data'): {},
        SolidInstance('s3_to_dw_table', alias='process_q2_ticket_data'): {},
    }

    mode_defs = [test_mode, local_mode, prod_mode]

    local_fast_preset = PresetDefinition(
        name='local_fast',
        mode='local',
        environment_files=[
            file_relative_path(__file__, 'environments/local_base.yaml'),
            file_relative_path(__file__, 'environments/local_fast_ingest.yaml'),
        ],
    )
    local_full_preset = PresetDefinition(
        name='local_full',
        mode='local',
        environment_files=[
            file_relative_path(__file__, 'environments/local_base.yaml'),
            file_relative_path(__file__, 'environments/local_full_ingest.yaml'),
        ],
    )

    return PipelineDefinition(
        name="airline_demo_ingest_pipeline",
        solids=solid_defs,
        dependencies=dependency_graph,
        mode_definitions=mode_defs,
        preset_definitions=[local_fast_preset, local_full_preset],
    )
Ejemplo n.º 29
0
def define_composites_pipeline():
    """Build 'composites_pipeline' from nested composite solids.

    'add_two' chains two 'add_one' instances; 'add_four' chains two 'add_two'
    instances; 'div_four' chains two 'div_two' instances. Each composite maps
    its 'num' input to the first inner instance and its output from the
    second. The pipeline feeds the 'add_four' output into 'div_four'.

    Returns:
        The constructed PipelineDefinition.
    """

    @lambda_solid(inputs=[InputDefinition('num', Int)])
    def add_one(num):
        return num + 1

    @lambda_solid(inputs=[InputDefinition('num')])
    def div_two(num):
        return num / 2

    # add_one applied twice.
    add_two_composite = CompositeSolidDefinition(
        'add_two',
        solids=[add_one],
        dependencies={
            SolidInstance('add_one', 'adder_1'): {},
            SolidInstance('add_one', 'adder_2'): {'num': DependencyDefinition('adder_1')},
        },
        input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],
        output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],
    )

    # add_two applied twice.
    add_four_composite = CompositeSolidDefinition(
        'add_four',
        solids=[add_two_composite],
        dependencies={
            SolidInstance('add_two', 'adder_1'): {},
            SolidInstance('add_two', 'adder_2'): {'num': DependencyDefinition('adder_1')},
        },
        input_mappings=[InputDefinition('num', Int).mapping_to('adder_1', 'num')],
        output_mappings=[OutputDefinition(Int).mapping_from('adder_2')],
    )

    # div_two applied twice; the mapped output is declared as Float.
    div_four_composite = CompositeSolidDefinition(
        'div_four',
        solids=[div_two],
        dependencies={
            SolidInstance('div_two', 'div_1'): {},
            SolidInstance('div_two', 'div_2'): {'num': DependencyDefinition('div_1')},
        },
        input_mappings=[InputDefinition('num', Int).mapping_to('div_1', 'num')],
        output_mappings=[OutputDefinition(Float).mapping_from('div_2')],
    )

    top_level_dependencies = {
        'div_four': {'num': DependencyDefinition('add_four')},
    }

    return PipelineDefinition(
        name='composites_pipeline',
        solids=[add_four_composite, div_four_composite],
        dependencies=top_level_dependencies,
    )
Ejemplo n.º 30
0
def define_airline_demo_ingest_pipeline():
    """Build the Spark-based 'airline_demo_ingest_pipeline' definition.

    Eight solid definitions are reused under many aliases. As wired below:
    CSV sources are ingested; the three monthly on-time frames are unioned
    into one Q2 frame; the on-time, ticket, market and coupon data are
    subsampled; weather NA values are normalized; the master coordinate data
    is prefixed twice (dest/origin) and joined onto the on-time data; every
    data set has its column names canonicalized and is then loaded to the
    database. Contexts come from the module-level CONTEXT_DEFINITIONS.

    Returns:
        The constructed PipelineDefinition.
    """
    solid_defs = [
        canonicalize_column_names,
        ingest_csv_to_spark,
        join_spark_data_frames,
        load_data_to_database_from_spark,
        normalize_weather_na_values,
        prefix_column_names,
        subsample_spark_dataset,
        union_spark_data_frames,
    ]

    dependency_graph = {
        # --- Ingest: one instance per CSV source, no upstream dependencies. ---
        SolidInstance('ingest_csv_to_spark', alias='ingest_april_on_time_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_may_on_time_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_june_on_time_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_sfo_weather'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_coupon_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_market_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_ticket_data'): {},
        SolidInstance('ingest_csv_to_spark', alias='ingest_master_cord_data'): {},
        # --- Union: (april + may) + june. ---
        SolidInstance('union_spark_data_frames', alias='combine_april_may_on_time_data'): {
            'left_data_frame': DependencyDefinition('ingest_april_on_time_data'),
            'right_data_frame': DependencyDefinition('ingest_may_on_time_data'),
        },
        SolidInstance('union_spark_data_frames', alias='combine_q2_on_time_data'): {
            'left_data_frame': DependencyDefinition('combine_april_may_on_time_data'),
            'right_data_frame': DependencyDefinition('ingest_june_on_time_data'),
        },
        # --- Subsample. ---
        SolidInstance('subsample_spark_dataset', alias='subsample_q2_on_time_data'): {
            'data_frame': DependencyDefinition('combine_q2_on_time_data'),
        },
        SolidInstance('subsample_spark_dataset', alias='subsample_q2_ticket_data'): {
            'data_frame': DependencyDefinition('ingest_q2_ticket_data'),
        },
        SolidInstance('subsample_spark_dataset', alias='subsample_q2_market_data'): {
            'data_frame': DependencyDefinition('ingest_q2_market_data'),
        },
        SolidInstance('subsample_spark_dataset', alias='subsample_q2_coupon_data'): {
            'data_frame': DependencyDefinition('ingest_q2_coupon_data'),
        },
        # --- Weather normalization. ---
        SolidInstance('normalize_weather_na_values', alias='normalize_q2_weather_na_values'): {
            'data_frame': DependencyDefinition('ingest_q2_sfo_weather'),
        },
        # --- Coordinate data: prefixed once for dest, once for origin. ---
        SolidInstance('prefix_column_names', alias='prefix_dest_cord_data'): {
            'data_frame': DependencyDefinition('ingest_master_cord_data'),
        },
        SolidInstance('prefix_column_names', alias='prefix_origin_cord_data'): {
            'data_frame': DependencyDefinition('ingest_master_cord_data'),
        },
        # --- Join on-time data with dest, then origin, coordinate data. ---
        SolidInstance('join_spark_data_frames', alias='join_q2_on_time_data_to_dest_cord_data'): {
            'left_data_frame': DependencyDefinition('subsample_q2_on_time_data'),
            'right_data_frame': DependencyDefinition('prefix_dest_cord_data'),
        },
        SolidInstance('join_spark_data_frames', alias='join_q2_on_time_data_to_origin_cord_data'): {
            'left_data_frame': DependencyDefinition('join_q2_on_time_data_to_dest_cord_data'),
            'right_data_frame': DependencyDefinition('prefix_origin_cord_data'),
        },
        # --- Canonicalize column names. ---
        SolidInstance('canonicalize_column_names', alias='canonicalize_q2_on_time_data'): {
            'data_frame': DependencyDefinition('join_q2_on_time_data_to_origin_cord_data'),
        },
        SolidInstance('canonicalize_column_names', alias='canonicalize_q2_coupon_data'): {
            'data_frame': DependencyDefinition('subsample_q2_coupon_data'),
        },
        SolidInstance('canonicalize_column_names', alias='canonicalize_q2_market_data'): {
            'data_frame': DependencyDefinition('subsample_q2_market_data'),
        },
        SolidInstance('canonicalize_column_names', alias='canonicalize_q2_ticket_data'): {
            'data_frame': DependencyDefinition('subsample_q2_ticket_data'),
        },
        SolidInstance('canonicalize_column_names', alias='canonicalize_q2_sfo_weather'): {
            'data_frame': DependencyDefinition('normalize_q2_weather_na_values'),
        },
        # --- Load each canonicalized data set to the database. ---
        SolidInstance('load_data_to_database_from_spark', alias='load_q2_on_time_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_on_time_data'),
        },
        SolidInstance('load_data_to_database_from_spark', alias='load_q2_coupon_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_coupon_data'),
        },
        SolidInstance('load_data_to_database_from_spark', alias='load_q2_market_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_market_data'),
        },
        SolidInstance('load_data_to_database_from_spark', alias='load_q2_ticket_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_ticket_data'),
        },
        SolidInstance('load_data_to_database_from_spark', alias='load_q2_sfo_weather'): {
            'data_frame': DependencyDefinition('canonicalize_q2_sfo_weather'),
        },
    }

    return PipelineDefinition(
        name="airline_demo_ingest_pipeline",
        solids=solid_defs,
        dependencies=dependency_graph,
        context_definitions=CONTEXT_DEFINITIONS,
    )