Esempio n. 1
0
def define_airline_demo_ingest_pipeline():
    """Build the airline demo ingest pipeline definition.

    Wires aliased instances of a small set of reusable solids into a DAG:
    raw archives are downloaded from S3, unzipped, ingested into Spark,
    combined/subsampled/canonicalized, and finally loaded to a database.

    Returns:
        A ``PipelineDefinition`` named ``airline_demo_ingest_pipeline`` with
        test/local/prod modes and two ``local``-mode presets.

    NOTE(review): ``SolidInstance``, ``DependencyDefinition``, the solid
    definitions, the mode definitions, and ``file_relative_path`` are all
    defined elsewhere in this module — presumably Dagster imports; confirm.
    """
    # Reusable solid definitions; each is instantiated multiple times below
    # under distinct aliases.
    solids = [
        canonicalize_column_names,
        download_from_s3_to_bytes,
        ingest_csv_to_spark,
        load_data_to_database_from_spark,
        process_q2_data,
        process_sfo_weather_data,
        subsample_spark_dataset,
        unzip_file,
    ]
    dependencies = {
        # --- Stage 1: download raw archives from S3 (no upstream deps) ---
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_april_on_time_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_may_on_time_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_june_on_time_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_master_cord_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_q2_coupon_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_q2_market_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_q2_ticket_data'): {},
        SolidInstance('download_from_s3_to_bytes',
                      alias='download_q2_sfo_weather'): {},
        # --- Stage 2: unzip each downloaded archive (weather data is not
        # zipped, so it has no unzip step) ---
        SolidInstance('unzip_file', alias='unzip_april_on_time_data'): {
            'archive_file': DependencyDefinition('download_april_on_time_data')
        },
        SolidInstance('unzip_file', alias='unzip_may_on_time_data'): {
            'archive_file': DependencyDefinition('download_may_on_time_data')
        },
        SolidInstance('unzip_file', alias='unzip_june_on_time_data'): {
            'archive_file': DependencyDefinition('download_june_on_time_data')
        },
        SolidInstance('unzip_file', alias='unzip_master_cord_data'): {
            'archive_file': DependencyDefinition('download_master_cord_data')
        },
        SolidInstance('unzip_file', alias='unzip_q2_coupon_data'): {
            'archive_file': DependencyDefinition('download_q2_coupon_data')
        },
        SolidInstance('unzip_file', alias='unzip_q2_market_data'): {
            'archive_file': DependencyDefinition('download_q2_market_data')
        },
        SolidInstance('unzip_file', alias='unzip_q2_ticket_data'): {
            'archive_file': DependencyDefinition('download_q2_ticket_data')
        },
        # --- Stage 3: ingest CSVs into Spark data frames (weather ingests
        # straight from the download, all others from the unzipped file) ---
        SolidInstance('ingest_csv_to_spark', alias='ingest_april_on_time_data'):
        {
            'input_csv_file': DependencyDefinition('unzip_april_on_time_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_may_on_time_data'):
        {
            'input_csv_file': DependencyDefinition('unzip_may_on_time_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_june_on_time_data'):
        {
            'input_csv_file': DependencyDefinition('unzip_june_on_time_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_sfo_weather'): {
            'input_csv_file': DependencyDefinition('download_q2_sfo_weather')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_coupon_data'): {
            'input_csv_file': DependencyDefinition('unzip_q2_coupon_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_market_data'): {
            'input_csv_file': DependencyDefinition('unzip_q2_market_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_q2_ticket_data'): {
            'input_csv_file': DependencyDefinition('unzip_q2_ticket_data')
        },
        SolidInstance('ingest_csv_to_spark', alias='ingest_master_cord_data'):
        {
            'input_csv_file': DependencyDefinition('unzip_master_cord_data')
        },
        # --- Stage 4: combine the three monthly on-time datasets with the
        # master cord data into the Q2 dataset ---
        'process_q2_data': {
            'april_data': DependencyDefinition('ingest_april_on_time_data'),
            'may_data': DependencyDefinition('ingest_may_on_time_data'),
            'june_data': DependencyDefinition('ingest_june_on_time_data'),
            'master_cord_data':
            DependencyDefinition('ingest_master_cord_data'),
        },
        # --- Stage 4b: subsample the large ticket/market/coupon datasets ---
        SolidInstance('subsample_spark_dataset',
                      alias='subsample_q2_ticket_data'): {
            'data_frame': DependencyDefinition('ingest_q2_ticket_data')
        },
        SolidInstance('subsample_spark_dataset',
                      alias='subsample_q2_market_data'): {
            'data_frame': DependencyDefinition('ingest_q2_market_data')
        },
        SolidInstance('subsample_spark_dataset',
                      alias='subsample_q2_coupon_data'): {
            'data_frame': DependencyDefinition('ingest_q2_coupon_data')
        },
        'process_sfo_weather_data': {
            'sfo_weather_data': DependencyDefinition('ingest_q2_sfo_weather')
        },
        # --- Stage 5: canonicalize column names of the subsampled frames ---
        SolidInstance('canonicalize_column_names',
                      alias='canonicalize_q2_coupon_data'): {
            'data_frame': DependencyDefinition('subsample_q2_coupon_data')
        },
        SolidInstance('canonicalize_column_names',
                      alias='canonicalize_q2_market_data'): {
            'data_frame': DependencyDefinition('subsample_q2_market_data')
        },
        SolidInstance('canonicalize_column_names',
                      alias='canonicalize_q2_ticket_data'): {
            'data_frame': DependencyDefinition('subsample_q2_ticket_data')
        },
        # --- Stage 6: load each processed data frame into the database ---
        SolidInstance('load_data_to_database_from_spark',
                      alias='load_q2_on_time_data'): {
            'data_frame': DependencyDefinition('process_q2_data')
        },
        SolidInstance('load_data_to_database_from_spark',
                      alias='load_q2_coupon_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_coupon_data')
        },
        SolidInstance('load_data_to_database_from_spark',
                      alias='load_q2_market_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_market_data')
        },
        SolidInstance('load_data_to_database_from_spark',
                      alias='load_q2_ticket_data'): {
            'data_frame': DependencyDefinition('canonicalize_q2_ticket_data')
        },
        SolidInstance('load_data_to_database_from_spark',
                      alias='load_q2_sfo_weather'): {
            'data_frame': DependencyDefinition('process_sfo_weather_data')
        },
    }

    # Two presets both target 'local' mode; they differ only in whether the
    # fast (subsampled) or full ingest environment YAML is layered on top of
    # the shared base configuration.
    return PipelineDefinition(
        name="airline_demo_ingest_pipeline",
        solids=solids,
        dependencies=dependencies,
        mode_definitions=[test_mode, local_mode, prod_mode],
        preset_definitions=[
            PresetDefinition(
                name='local_fast',
                mode='local',
                environment_files=[
                    file_relative_path(__file__,
                                       'environments/local_base.yaml'),
                    file_relative_path(__file__,
                                       'environments/local_fast_ingest.yaml'),
                ],
            ),
            PresetDefinition(
                name='local_full',
                mode='local',
                environment_files=[
                    file_relative_path(__file__,
                                       'environments/local_base.yaml'),
                    file_relative_path(__file__,
                                       'environments/local_full_ingest.yaml'),
                ],
            ),
        ],
    )
Esempio n. 2
0
        return string + string

    return int(string)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='errorable_mode',
            resource_defs={'errorable_resource': define_errorable_resource()},
        )
    ],
    preset_defs=[
        PresetDefinition.from_files(
            'passing',
            environment_files=[file_relative_path(__file__, 'environments/error.yaml')],
            mode='errorable_mode',
        )
    ],
)
def error_monster():
    """Three-solid chain (start -> middle -> end) used to exercise error paths."""
    emitted = emit_num.alias('start')()
    stringified = num_to_str.alias('middle')(num=emitted)
    str_to_num.alias('end')(string=stringified)


if __name__ == '__main__':
    result = execute_pipeline(
        error_monster,
        {
Esempio n. 3
0
    return people.count()


# EMR mode: launch pyspark steps remotely via the EMR step launcher, with S3
# resources and S3-backed intermediate storage.
# NOTE(review): the resource/storage definitions referenced here come from
# elsewhere in this file — presumably dagster-aws/dagster-pyspark; confirm.
emr_mode = ModeDefinition(
    name='emr',
    resource_defs={
        'pyspark_step_launcher': emr_pyspark_step_launcher,
        'pyspark': pyspark_resource,
        's3': s3_resource,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)

# Preset binding the 'emr' mode to run config YAML shipped inside the
# 'emr_pyspark' package.
emr_preset = PresetDefinition.from_pkg_resources(
    name='emr',
    mode='emr',
    pkg_resource_defs=[('emr_pyspark', 'prod_resources.yaml'),
                       ('emr_pyspark', 's3_storage.yaml')],
)

# Local mode: run pyspark in-process (no remote step launcher, no S3).
local_mode = ModeDefinition(
    name='local',
    resource_defs={
        'pyspark_step_launcher': no_step_launcher,
        'pyspark': pyspark_resource
    },
)


@pipeline(
    mode_defs=[emr_mode, local_mode],
    preset_defs=[emr_preset],
Esempio n. 4
0
@pipeline(
    mode_defs=[
        ModeDefinition(name="prod", resource_defs={"db": postgres, "slack": slack_resource}),
        ModeDefinition(name="dev", resource_defs={"db": postgres, "slack": mock_slack_resource}),
    ],
    preset_defs=[
        PresetDefinition(
            name="dev",
            run_config={
                "solids": {
                    "download_file": {
                        "config": {"url": CEREALS_DATASET_URL, "target_path": "cereals.csv"}
                    },
                    "post_plot_to_slack": {"config": {"channels": ["foo_channel"]}},
                },
                "resources": {
                    "db": {
                        "config": {
                            "db_url": "postgresql://*****:*****@localhost:5432/dbt_example"
                        }
                    },
                    "slack": {"config": {"token": "nonce"}},
                },
            },
            mode="dev",
        ),
        PresetDefinition(
            name="prod",
            run_config={
                "solids": {
                    "download_file": {
                        "config": {"url": CEREALS_DATASET_URL, "target_path": "cereals.csv"}
Esempio n. 5
0
    sum_df['sum'] = sum_df['num1'] + sum_df['num2']
    return sum_df


@solid
def sum_sq_solid(_, sum_df: DataFrame) -> DataFrame:
    """Return a copy of *sum_df* with a 'sum_sq' column: the square of 'sum'."""
    result = sum_df.copy()
    result['sum_sq'] = result['sum'] ** 2
    return result


@pipeline(
    preset_defs=[
        PresetDefinition.from_files(
            'test',
            environment_files=[
                file_relative_path(__file__, 'environments/pandas_hello_world_test.yaml')
            ],
        ),
        PresetDefinition.from_files(
            'prod',
            environment_files=[
                file_relative_path(__file__, 'environments/pandas_hello_world_prod.yaml')
            ],
        ),
    ]
)
def pandas_hello_world_pipeline():
    """Sum the input columns, then square the resulting sum column."""
    summed = sum_solid()
    return sum_sq_solid(summed)

Esempio n. 6
0
    input_defs=[InputDefinition("sum_sq_solid", dagster_pd.DataFrame)],
    output_def=OutputDefinition(dagster_pd.DataFrame),
)
def always_fails_solid(**_kwargs):
    raise Exception("I am a programmer and I make error")


@pipeline
def pandas_hello_world_fails():
    """Run the sum/sum-sq chain into a solid that always raises (failure demo)."""
    summed = sum_solid()
    squared = sum_sq_solid(sum_df=summed)
    always_fails_solid(sum_sq_solid=squared)


@pipeline(
    preset_defs=[
        PresetDefinition.from_files(
            "test",
            config_files=[
                file_relative_path(__file__, "environments/pandas_hello_world_test.yaml")
            ],
        ),
        PresetDefinition.from_files(
            "prod",
            config_files=[
                file_relative_path(__file__, "environments/pandas_hello_world_prod.yaml")
            ],
        ),
    ]
)
def pandas_hello_world():
    """Sum the input columns, then square the resulting sum column."""
    summed = sum_solid()
    sum_sq_solid(summed)
Esempio n. 7
0
                        Field(Int),
                        'field_six_nullable_int_list':
                        Field(List[Optional[Int]], is_optional=True),
                    })),
            })),
    )
    def a_solid_with_multilayered_config(_):
        return None

    return a_solid_with_multilayered_config()


@pipeline(preset_defs=[
    PresetDefinition.from_files(
        name='prod',
        environment_files=[
            script_relative_path('../environments/csv_hello_world_prod.yaml')
        ],
    ),
    PresetDefinition.from_files(
        name='test',
        environment_files=[
            script_relative_path('../environments/csv_hello_world_test.yaml')
        ],
    ),
    PresetDefinition(
        name='test_inline',
        environment_dict={
            'solids': {
                'sum_solid': {
                    'inputs': {
                        'num': script_relative_path("../data/num.csv")
Esempio n. 8
0
@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={
                's3': s3_resource,
                'snowflake': snowflake_resource,
                'spark': spark_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'default',
            pkg_resource_defs=[
                ('dagster_examples.event_pipeline_demo.environments', 'default.yaml'),
            ],
        )
    ],
)
def event_ingest_pipeline():
    event_ingest = create_spark_solid(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    @solid(input_defs=[InputDefinition('start', Nothing)], required_resource_keys={'snowflake'})
    def snowflake_load(context):
        # TODO: express dependency of this solid on event_ingest
        context.resources.snowflake.load_table_from_local_parquet(
Esempio n. 9
0
@pipeline(
    mode_defs=[
        ModeDefinition(
            name='local',
            resource_defs={'transporter': local_transporter, 'volume': temporary_directory_mount},
        ),
        ModeDefinition(
            name='production',
            resource_defs={'transporter': production_transporter, 'volume': mount},
        ),
    ],
    preset_defs=[
        PresetDefinition.from_files(
            'dev',
            mode='local',
            environment_files=[
                file_relative_path(__file__, 'environments/bay_bike_pipeline_base.yaml'),
                file_relative_path(__file__, 'environments/bay_bike_pipeline_dev.yaml'),
            ],
        ),
        PresetDefinition.from_files(
            'production',
            mode='production',
            environment_files=[
                file_relative_path(__file__, 'environments/bay_bike_pipeline_base.yaml'),
                file_relative_path(__file__, 'environments/bay_bike_pipeline_production.yaml'),
            ],
        ),
    ],
)
def extract_monthly_bay_bike_pipeline():
    upload_consolidated_csv = upload_file_to_bucket.alias('upload_consolidated_csv')
Esempio n. 10
0
                    resource_defs={"ge_data_context": ge_data_context})
 ],
 preset_defs=[
     PresetDefinition(
         "sample_preset_success",
         mode="basic",
         run_config={
             "resources": {
                 "ge_data_context": {
                     "config": {
                         "ge_root_dir":
                         file_relative_path(__file__,
                                            "./great_expectations")
                     }
                 }
             },
             "solids": {
                 "read_in_datafile": {
                     "inputs": {
                         "csv_path": {
                             "value":
                             file_relative_path(__file__, "./succeed.csv")
                         }
                     }
                 }
             },
         },
     ),
     PresetDefinition(
         "sample_preset_fail",
         mode="basic",
Esempio n. 11
0
    with open(file_relative_path(__file__, 'sql/explore_visits_by_hour.sql'),
              'r') as f:
        query = f.read()

    return bq_solid_for_queries(
        [query]).alias('explore_visits_by_hour_internal')(start=start)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={
                'bigquery': bigquery_resource,
                'dataproc': dataproc_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_files(
            name='default',
            mode='default',
            environment_files=[file_relative_path(__file__, 'environments/default.yaml')],
        )
    ],
)
def gcp_pipeline():
    """Dataproc event ingest -> BigQuery load -> exploratory hourly query."""
    events = events_dataproc()
    loaded = bq_load_events(events)
    return explore_visits_by_hour(loaded)
Esempio n. 12
0
    if context.solid_config['return_wrong_type']:
        return string + string

    return int(string)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='errorable_mode',
            resource_defs={
                'errorable_resource': define_errorable_resource(),
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'passing',
            pkg_resource_defs=[('dagster_examples.toys.environments', 'error.yaml')],
            mode='errorable_mode',
        )
    ],
)
def error_monster():
    """Three-solid chain (start -> middle -> end) used to exercise error paths."""
    num = emit_num.alias('start')()
    text = num_to_str.alias('middle')(num=num)
    str_to_num.alias('end')(string=text)


if __name__ == '__main__':
    result = execute_pipeline(
        error_monster,
        {
            'solids': {
Esempio n. 13
0
     ),
     ModeDefinition(
         name="dev",
         resource_defs={
             "warehouse": sqlalchemy_postgres_warehouse_resource
         },
     ),
 ],
 preset_defs=[
     PresetDefinition(
         "unittest",
         run_config={
             "resources": {
                 "warehouse": {
                     "config": {
                         "conn_str": ":memory:"
                     }
                 }
             }
         },
         mode="unittest",
     ),
     PresetDefinition.from_files(
         "dev",
         config_files=[
             file_relative_path(__file__, "presets_dev_warehouse.yaml"),
             file_relative_path(__file__, "presets_csv.yaml"),
         ],
         mode="dev",
     ),
 ],
Esempio n. 14
0
        "tempfile": tempfile_resource,
        "file_cache": s3_file_cache,
        "file_manager": s3_file_manager,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)


@pipeline(
    # ordered so the local is first and therefore the default
    mode_defs=[local_mode, test_mode, prod_mode],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name="local_fast",
            mode="local",
            pkg_resource_defs=[
                ("airline_demo.environments", "local_base.yaml"),
                ("airline_demo.environments", "local_fast_ingest.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name="local_full",
            mode="local",
            pkg_resource_defs=[
                ("airline_demo.environments", "local_base.yaml"),
                ("airline_demo.environments", "local_full_ingest.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name="prod_fast",
            mode="prod",
            pkg_resource_defs=[
Esempio n. 15
0
        'tempfile': tempfile_resource,
        'file_cache': s3_file_cache,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)


@pipeline(
    # ordered so the local is first and therefore the default
    mode_defs=[local_mode, test_mode, prod_mode],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name='local_fast',
            mode='local',
            pkg_resource_defs=[
                ('dagster_examples.airline_demo.environments',
                 'local_base.yaml'),
                ('dagster_examples.airline_demo.environments',
                 'local_fast_ingest.yaml'),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name='local_full',
            mode='local',
            pkg_resource_defs=[
                ('dagster_examples.airline_demo.environments',
                 'local_base.yaml'),
                ('dagster_examples.airline_demo.environments',
                 'local_full_ingest.yaml'),
            ],
        ),
        PresetDefinition.from_pkg_resources(
def test_presets():
    """Exercise PresetDefinition construction and preset-driven execution.

    Covers: file-based and dict-based presets, additional-config overrides,
    solid subsets, invalid preset files (missing / non-YAML), missing preset
    names, and environment_dict/preset agreement checks.
    """

    @solid(config={'error': Bool})
    def can_fail(context):
        # Raises iff the preset/environment config sets 'error' to True.
        if context.solid_config['error']:
            raise Exception('I did an error')
        return 'cool'

    @lambda_solid
    def always_fail():
        raise Exception('I always do this')

    pipeline = PipelineDefinition(
        name='simple',
        solid_defs=[can_fail, always_fail],
        preset_defs=[
            PresetDefinition.from_files(
                'passing',
                environment_files=[
                    file_relative_path(__file__, 'pass_env.yaml')
                ],
                solid_subset=['can_fail'],
            ),
            # Same files as 'passing', but overridden to force a failure.
            PresetDefinition.from_files(
                'passing_overide_to_fail',
                environment_files=[
                    file_relative_path(__file__, 'pass_env.yaml')
                ],
                solid_subset=['can_fail'],
            ).with_additional_config(
                {'solids': {
                    'can_fail': {
                        'config': {
                            'error': True
                        }
                    }
                }}),
            PresetDefinition(
                'passing_direct_dict',
                environment_dict={
                    'solids': {
                        'can_fail': {
                            'config': {
                                'error': False
                            }
                        }
                    }
                },
                solid_subset=['can_fail'],
            ),
            PresetDefinition.from_files(
                'failing_1',
                environment_files=[
                    file_relative_path(__file__, 'fail_env.yaml')
                ],
                solid_subset=['can_fail'],
            ),
            # No solid_subset: always_fail runs too, so execution fails.
            PresetDefinition.from_files('failing_2',
                                        environment_files=[
                                            file_relative_path(
                                                __file__, 'pass_env.yaml')
                                        ]),
            # Subset only; environment must be supplied at execution time.
            PresetDefinition(
                'subset',
                solid_subset=['can_fail'],
            ),
        ],
    )

    # Presets referencing a missing file fail at definition time.
    with pytest.raises(DagsterInvalidDefinitionError):
        PresetDefinition.from_files('invalid_1',
                                    environment_files=[
                                        file_relative_path(
                                            __file__, 'not_a_file.yaml')
                                    ])

    # Presets referencing a non-YAML file fail at definition time.
    with pytest.raises(DagsterInvariantViolationError):
        PresetDefinition.from_files(
            'invalid_2',
            environment_files=[
                file_relative_path(__file__, 'test_repository_definition.py')
            ],
        )

    assert execute_pipeline(pipeline, preset='passing').success

    assert execute_pipeline(pipeline, preset='passing_direct_dict').success

    # Idiomatic negation (`not x.success`) instead of `x.success == False`.
    assert not execute_pipeline(pipeline, preset='failing_1',
                                raise_on_error=False).success

    assert not execute_pipeline(pipeline, preset='failing_2',
                                raise_on_error=False).success

    with pytest.raises(DagsterInvariantViolationError,
                       match='Could not find preset'):
        execute_pipeline(pipeline, preset='not_failing', raise_on_error=False)

    assert not execute_pipeline(pipeline,
                                preset='passing_overide_to_fail',
                                raise_on_error=False).success

    # An environment_dict that agrees with the preset's environment is allowed.
    assert execute_pipeline(
        pipeline,
        preset='passing',
        environment_dict={
            'solids': {
                'can_fail': {
                    'config': {
                        'error': False
                    }
                }
            }
        },
    ).success

    # ...but a conflicting environment_dict is rejected.
    with pytest.raises(
            check.CheckError,
            match=re.escape(
                'The environment set in preset \'passing\' does not agree with the environment passed '
                'in the `environment_dict` argument.'),
    ):
        execute_pipeline(
            pipeline,
            preset='passing',
            environment_dict={
                'solids': {
                    'can_fail': {
                        'config': {
                            'error': True
                        }
                    }
                }
            },
        )

    # The 'subset' preset has no environment, so one must be passed here.
    assert execute_pipeline(
        pipeline,
        preset='subset',
        environment_dict={
            'solids': {
                'can_fail': {
                    'config': {
                        'error': False
                    }
                }
            }
        },
    ).success
Esempio n. 17
0
def test_empty_preset():
    """A PresetDefinition built with only a name has no run config and
    serializes to an empty YAML mapping."""
    empty_preset = PresetDefinition("empty")
    # `is None`, not `== None`: identity comparison is the correct idiom
    # for None checks (PEP 8).
    assert empty_preset.run_config is None
    assert empty_preset.get_environment_yaml() == "{}\n"
Esempio n. 18
0
@solid
def save_metrics(context, data_path):
    """Log the destination path for metrics output (placeholder solid)."""
    message = "Saving metrics to path {data_path}".format(data_path=data_path)
    context.log.info(message)


@pipeline(
    preset_defs=[
        PresetDefinition(
            name="test",
            environment_dict={
                "solids": {
                    "save_metrics": {
                        "inputs": {
                            "data_path": {"value": "s3://bucket-name/test_data"}
                        }
                    }
                }
            },
        ),
    ],
)
def metrics_pipeline():
    """Single-solid pipeline whose 'test' preset points at a fixed S3 path."""
    save_metrics()


@solid
def rollup_data(context, data_path):
    """Log the source path for the rollup step (placeholder solid)."""
    message = "Rolling up data from path {data_path}".format(data_path=data_path)
    context.log.info(message)
Esempio n. 19
0

@pipeline(
    description=
    ("Demo fork-shaped pipeline that has two-path parallel structure of solids."
     ),
    preset_defs=[
        PresetDefinition(
            "sleep_failed",
            {
                "intermediate_storage": {
                    "filesystem": {}
                },
                "execution": {
                    "multiprocess": {}
                },
                "solids": {
                    "root": {
                        "config": {
                            "sleep_secs": [-10, 30]
                        }
                    }
                },
            },
        ),
        PresetDefinition(
            "sleep",
            {
                "intermediate_storage": {
                    "filesystem": {}
                },
                "execution": {
Esempio n. 20
0
        'tempfile': tempfile_resource,
        'file_cache': s3_file_cache,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)


@pipeline(
    # ordered so the local is first and therefore the default
    mode_defs=[local_mode, test_mode, prod_mode],
    preset_defs=[
        PresetDefinition(
            name='local_fast',
            mode='local',
            environment_files=[
                file_relative_path(__file__, 'environments/local_base.yaml'),
                file_relative_path(__file__,
                                   'environments/local_fast_ingest.yaml'),
            ],
        ),
        PresetDefinition(
            name='local_full',
            mode='local',
            environment_files=[
                file_relative_path(__file__, 'environments/local_base.yaml'),
                file_relative_path(__file__,
                                   'environments/local_full_ingest.yaml'),
            ],
        ),
    ],
)
Esempio n. 21
0
    time.sleep(0.1)
    if (context.retry_number + 1) >= context.solid_config["work_on_attempt"]:
        return "success"
    else:
        raise RetryRequested(
            max_retries=context.solid_config["max_retries"],
            seconds_to_wait=context.solid_config["delay"],
        )


@pipeline(
    preset_defs=[
        PresetDefinition(
            name="pass_after_retry",
            run_config={
                "solids": {
                    "retry_solid": {
                        "config": {
                            "delay": 0.2,
                            "work_on_attempt": 2,
                            "max_retries": 1,
                        }
                    }
                }
            },
        )
    ]
)
def retry_pipeline():
    """Run retry_solid (configured by the preset to succeed on retry), then echo."""
    attempt = retry_solid()
    echo(attempt)
Esempio n. 22
0
@pipeline(
    mode_defs=[
        ModeDefinition(
            intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
            resource_defs={"s3": s3_resource},
            executor_defs=default_executors + [celery_k8s_job_executor],
        )
    ],
    preset_defs=[
        PresetDefinition.from_files(
            "example",
            config_files=[
                file_relative_path(
                    __file__,
                    os.path.join("..", "run_config", "celery_k8s.yaml"),
                ),
                file_relative_path(
                    __file__,
                    os.path.join("..", "run_config", "pipeline.yaml"),
                ),
            ],
            mode="default",
        ),
    ],
)
def example_pipe():
    """Multiply the input word, then count letters in the result."""
    multiplied = multiply_the_word()
    count_letters(multiplied)


@repository
def example_repo():
    """Expose example_pipe as this repository's only definition."""
    pipelines = [example_pipe]
    return pipelines
Esempio n. 23
0
def test_presets():
    """Exercise PresetDefinition construction and use with execute_pipeline.

    Covers presets built from YAML files, from inline run_config dicts, with
    solid selections, with additional-config overrides, invalid config files
    (which raise at construction time), and the interaction between a preset
    and an explicit ``run_config`` argument.
    """

    @solid(config_schema={"error": Bool})
    def can_fail(context):
        if context.solid_config["error"]:
            raise Exception("I did an error")
        return "cool"

    @lambda_solid
    def always_fail():
        raise Exception("I always do this")

    pipe = PipelineDefinition(
        name="simple",
        solid_defs=[can_fail, always_fail],
        preset_defs=[
            PresetDefinition.from_files(
                "passing",
                config_files=[file_relative_path(__file__, "pass_env.yaml")],
                solid_selection=["can_fail"],
            ),
            # Starts from passing config, then overrides error -> True.
            PresetDefinition.from_files(
                "passing_overide_to_fail",
                config_files=[file_relative_path(__file__, "pass_env.yaml")],
                solid_selection=["can_fail"],
            ).with_additional_config(
                {"solids": {
                    "can_fail": {
                        "config": {
                            "error": True
                        }
                    }
                }}),
            PresetDefinition(
                "passing_direct_dict",
                run_config={
                    "solids": {
                        "can_fail": {
                            "config": {
                                "error": False
                            }
                        }
                    }
                },
                solid_selection=["can_fail"],
            ),
            PresetDefinition.from_files(
                "failing_1",
                config_files=[file_relative_path(__file__, "fail_env.yaml")],
                solid_selection=["can_fail"],
            ),
            PresetDefinition.from_files(
                "failing_2",
                config_files=[file_relative_path(__file__, "pass_env.yaml")]),
            # No run_config of its own; must be supplied at execution time.
            PresetDefinition(
                "subset",
                solid_selection=["can_fail"],
            ),
        ],
    )

    # Presets pointing at a missing file, or at a non-YAML file, should
    # fail at construction time.
    with pytest.raises(DagsterInvariantViolationError):
        PresetDefinition.from_files(
            "invalid_1",
            config_files=[file_relative_path(__file__, "not_a_file.yaml")])

    with pytest.raises(DagsterInvariantViolationError):
        PresetDefinition.from_files(
            "invalid_2",
            config_files=[
                file_relative_path(__file__, "test_repository_definition.py")
            ],
        )

    assert execute_pipeline(pipe, preset="passing").success

    assert execute_pipeline(pipe, preset="passing_direct_dict").success
    # Idiom: use `not ...success` rather than `...success == False` (PEP 8).
    assert not execute_pipeline(pipe, preset="failing_1",
                                raise_on_error=False).success

    assert not execute_pipeline(pipe, preset="failing_2",
                                raise_on_error=False).success

    with pytest.raises(DagsterInvariantViolationError,
                       match="Could not find preset"):
        execute_pipeline(pipe, preset="not_failing", raise_on_error=False)

    assert not execute_pipeline(pipe,
                                preset="passing_overide_to_fail",
                                raise_on_error=False).success

    # A run_config that agrees with the preset's environment is accepted.
    assert execute_pipeline(
        pipe,
        preset="passing",
        run_config={
            "solids": {
                "can_fail": {
                    "config": {
                        "error": False
                    }
                }
            }
        },
    ).success

    # A run_config that conflicts with the preset's environment is rejected.
    with pytest.raises(
            check.CheckError,
            match=re.escape(
                "The environment set in preset 'passing' does not agree with the environment passed "
                "in the `run_config` argument."),
    ):
        execute_pipeline(
            pipe,
            preset="passing",
            run_config={"solids": {
                "can_fail": {
                    "config": {
                        "error": True
                    }
                }
            }},
        )

    # The "subset" preset carries no config, so an explicit run_config
    # fills it in.
    assert execute_pipeline(
        pipe,
        preset="subset",
        run_config={
            "solids": {
                "can_fail": {
                    "config": {
                        "error": False
                    }
                }
            }
        },
    ).success
Esempio n. 24
0

@pipeline(
    description=(
        "Demo pipeline that enables configurable types of errors thrown during pipeline execution, "
        "including solid execution errors, type errors, and resource initialization errors."
    ),
    mode_defs=[
        ModeDefinition(
            name="errorable_mode",
            resource_defs={"errorable_resource": define_errorable_resource()},
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "passing",
            pkg_resource_defs=[
                ("dagster_test.toys.environments", "error.yaml"),
            ],
            mode="errorable_mode",
        )
    ],
)
def error_monster():
    """Chain three aliased solids: start -> middle -> end."""
    number = emit_num.alias("start")()
    text = num_to_str.alias("middle")(num=number)
    str_to_num.alias("end")(string=text)


if __name__ == "__main__":
    result = execute_pipeline(
        error_monster,
        {
            "solids": {
Esempio n. 25
0
                'field_five_int': Int,
                'field_six_nullable_int_list': Field([Noneable(int)], is_required=False),
            },
        },
    )
    def a_solid_with_multilayered_config(_):
        return None

    return a_solid_with_multilayered_config()


@pipeline(
    preset_defs=[
        PresetDefinition.from_files(
            name='prod',
            environment_files=[
                file_relative_path(__file__, '../environments/csv_hello_world_prod.yaml')
            ],
        ),
        PresetDefinition.from_files(
            name='test',
            environment_files=[
                file_relative_path(__file__, '../environments/csv_hello_world_test.yaml')
            ],
        ),
        PresetDefinition(
            name='test_inline',
            run_config={
                'solids': {
                    'sum_solid': {
                        'inputs': {'num': file_relative_path(__file__, '../data/num.csv')}
                    }
Esempio n. 26
0
from dagster import ModeDefinition, PresetDefinition, RepositoryDefinition, pipeline, solid

# Production mode: runs PySpark solids via the EMR step launcher and uses
# S3-backed system storage alongside the default storage defs.
mode = ModeDefinition(
    name='prod',
    resource_defs={
        'pyspark_step_launcher': emr_pyspark_step_launcher,
        'pyspark': pyspark_resource,
        's3': s3_resource,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)

# Preset that selects the 'prod' mode and supplies its resource and storage
# configuration from the two YAML files (resolved relative to the CWD —
# NOTE(review): not wrapped in file_relative_path; confirm this is intended).
preset = PresetDefinition.from_files(
    name='prod',
    mode='prod',
    environment_files=['prod_resources.yaml', 's3_storage.yaml'],
)


@solid(required_resource_keys={'pyspark_step_launcher'})
def hello(_):
    """Trivial solid that returns 1; requires the pyspark_step_launcher resource."""
    value = 1
    return value


@pipeline(mode_defs=[mode], preset_defs=[preset])
def my_pipeline():
    """Single-solid pipeline that invokes hello in the 'prod' mode."""
    hello()
Esempio n. 27
0
def define_multi_mode_with_resources_pipeline():
    """Build a pipeline with three modes, each binding a different 'op' resource."""

    # API red alert. One has to wrap a type in Field because it is callable
    @resource(config=Int)
    def adder_resource(init_context):
        # resource_config is fixed for the lifetime of the init_context,
        # so it is safe to read it once up front.
        amount = init_context.resource_config

        def _add(x):
            return x + amount

        return _add

    @resource(config=Int)
    def multer_resource(init_context):
        factor = init_context.resource_config

        def _mult(x):
            return x * factor

        return _mult

    @resource(config={'num_one': Int, 'num_two': Int})
    def double_adder_resource(init_context):
        cfg = init_context.resource_config

        def _double_add(x):
            return x + cfg['num_one'] + cfg['num_two']

        return _double_add

    @solid(required_resource_keys={'op'})
    def apply_to_three(context):
        # Applies whichever 'op' resource the active mode provides.
        return context.resources.op(3)

    mode_defs = [
        ModeDefinition(name='add_mode', resource_defs={'op': adder_resource}),
        ModeDefinition(name='mult_mode', resource_defs={'op': multer_resource}),
        ModeDefinition(
            name='double_adder_mode',
            resource_defs={'op': double_adder_resource},
            description='Mode that adds two numbers to thing',
        ),
    ]

    preset_defs = [
        PresetDefinition.from_files(
            'add',
            mode='add_mode',
            environment_files=[
                file_relative_path(
                    __file__,
                    './environments/multi_mode_with_resources/add_mode.yaml'
                )
            ],
        ),
        PresetDefinition(
            'multiproc',
            mode='add_mode',
            environment_dict={
                'resources': {'op': {'config': 2}},
                'execution': {'multiprocess': {}},
                'storage': {'filesystem': {}},
            },
        ),
    ]

    return PipelineDefinition(
        name='multi_mode_with_resources',
        solid_defs=[apply_to_three],
        mode_defs=mode_defs,
        preset_defs=preset_defs,
    )
Esempio n. 28
0

@lambda_solid
def do_something():
    """Emit the constant 1."""
    return 1


@lambda_solid
def do_input(x):
    """Return the given input unchanged."""
    return x


@pipeline(
    name="foo",
    preset_defs=[PresetDefinition(name="test", tags={"foo": "bar"})],
)
def foo_pipeline():
    """Feed the output of do_something into do_input."""
    upstream = do_something()
    do_input(upstream)


def define_foo_pipeline():
    """Return the foo pipeline definition."""
    return foo_pipeline


@pipeline(name="baz", description="Not much tbh")
def baz_pipeline():
    do_input()

Esempio n. 29
0
                "field_six_nullable_int_list":
                Field([Noneable(int)], is_required=False),
            },
        },
    )
    def a_solid_with_multilayered_config(_):
        return None

    a_solid_with_multilayered_config()


@pipeline(preset_defs=[
    PresetDefinition.from_files(
        name="prod",
        config_files=[
            file_relative_path(__file__,
                               "../environments/csv_hello_world_prod.yaml")
        ],
    ),
    PresetDefinition.from_files(
        name="test",
        config_files=[
            file_relative_path(__file__,
                               "../environments/csv_hello_world_test.yaml")
        ],
    ),
    PresetDefinition(
        name="test_inline",
        run_config={
            "solids": {
                "sum_solid": {
Esempio n. 30
0
def define_airline_demo_warehouse_pipeline():
    """Assemble the airline demo warehouse pipeline.

    Wires the analysis solids together and uploads the resulting PDF plots
    to S3 via three aliased `put_object_to_s3_bytes` instances. Uses the
    legacy PipelineDefinition keywords (`solids`, `mode_definitions`,
    `preset_definitions`).
    """
    return PipelineDefinition(
        name="airline_demo_warehouse_pipeline",
        solids=[
            average_sfo_outbound_avg_delays_by_destination,
            delays_by_geography,
            delays_vs_fares,
            delays_vs_fares_nb,
            eastbound_delays,
            q2_sfo_outbound_flights,
            sfo_delays_by_destination,
            tickets_with_destination,
            put_object_to_s3_bytes,
            westbound_delays,
        ],
        dependencies={
            # Root solids with no upstream dependencies.
            'q2_sfo_outbound_flights': {},
            'tickets_with_destination': {},
            'westbound_delays': {},
            'eastbound_delays': {},
            'average_sfo_outbound_avg_delays_by_destination': {
                'q2_sfo_outbound_flights':
                DependencyDefinition('q2_sfo_outbound_flights')
            },
            'delays_vs_fares': {
                'tickets_with_destination':
                DependencyDefinition('tickets_with_destination'),
                'average_sfo_outbound_avg_delays_by_destination':
                DependencyDefinition(
                    'average_sfo_outbound_avg_delays_by_destination'),
            },
            # NOTE(review): no solid literally named 'fares_vs_delays' appears
            # in the solids list above — presumably `delays_vs_fares_nb`
            # registers under this name; verify against its definition.
            'fares_vs_delays': {
                'table_name': DependencyDefinition('delays_vs_fares')
            },
            'sfo_delays_by_destination': {
                'table_name':
                DependencyDefinition(
                    'average_sfo_outbound_avg_delays_by_destination')
            },
            'delays_by_geography': {
                'eastbound_delays': DependencyDefinition('eastbound_delays'),
                'westbound_delays': DependencyDefinition('westbound_delays'),
            },
            # Each plot output is uploaded by its own aliased S3 upload solid.
            SolidInstance('put_object_to_s3_bytes',
                          alias='upload_outbound_avg_delay_pdf_plots'): {
                'file_obj': DependencyDefinition('sfo_delays_by_destination')
            },
            SolidInstance('put_object_to_s3_bytes',
                          alias='upload_delays_vs_fares_pdf_plots'): {
                'file_obj': DependencyDefinition('fares_vs_delays')
            },
            SolidInstance('put_object_to_s3_bytes',
                          alias='upload_delays_by_geography_pdf_plots'): {
                'file_obj': DependencyDefinition('delays_by_geography')
            },
        },
        mode_definitions=[test_mode, local_mode, prod_mode],
        preset_definitions=[
            # Single preset: run locally with the warehouse environment files.
            PresetDefinition(
                name='local',
                mode='local',
                environment_files=[
                    file_relative_path(__file__,
                                       'environments/local_base.yaml'),
                    file_relative_path(__file__,
                                       'environments/local_warehouse.yaml'),
                ],
            )
        ],
    )