Python examples of PresetDefinition.from_pkg_resources

Example #1
def test_from_pkg_resources():
    good = ("dagster_tests.core_tests.definitions_tests", "pass_env.yaml")
    res = PresetDefinition.from_pkg_resources("this_should_pass", [good])
    assert res.run_config == {
        "solids": {
            "can_fail": {
                "config": {
                    "error": False
                }
            }
        }
    }

    bad_defs = [
        ("dagster_tests.core_tests.definitions_tests", "does_not_exist.yaml"),
        ("dagster_tests.core_tests.definitions_tests", "bad_file_binary.yaml"),
        ("dagster_tests.core_tests.does_not_exist", "some_file.yaml"),
    ]

    for bad_def in bad_defs:
        with pytest.raises(
                DagsterInvariantViolationError,
                match="Encountered error attempting to parse yaml",
        ):
            PresetDefinition.from_pkg_resources("bad_def", [bad_def])
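For context on why the bad_defs above raise: from_pkg_resources reads each (package, relative_path) tuple out of the installed package, parses it as YAML, and merges the parsed dicts into a single run_config, wrapping any load or parse failure in the DagsterInvariantViolationError matched by the test. The following is a minimal conceptual sketch of that loading and merging, not Dagster's actual implementation; the helper names load_run_config and deep_merge are illustrative only.

import pkg_resources
import yaml


def deep_merge(onto, values):
    # Recursively merge `values` into `onto`; later files win on conflicts.
    for key, value in values.items():
        if isinstance(value, dict) and isinstance(onto.get(key), dict):
            deep_merge(onto[key], value)
        else:
            onto[key] = value
    return onto


def load_run_config(pkg_resource_defs):
    # pkg_resource_defs is a list of (package, relative_path) tuples,
    # as in the examples on this page.
    run_config = {}
    for package, relative_path in pkg_resource_defs:
        # resource_string raises if the package or file does not exist;
        # Dagster surfaces such failures as DagsterInvariantViolationError.
        raw = pkg_resources.resource_string(package, relative_path)
        deep_merge(run_config, yaml.safe_load(raw.decode("utf-8")) or {})
    return run_config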
Example #2
def test_from_pkg_resources():
    good = ('dagster_tests.core_tests.definitions_tests', 'pass_env.yaml')
    res = PresetDefinition.from_pkg_resources('pass', [good])
    assert res.run_config == {
        'solids': {
            'can_fail': {
                'config': {
                    'error': False
                }
            }
        }
    }

    bad_defs = [
        ('dagster_tests.core_tests.definitions_tests', 'does_not_exist.yaml'),
        ('dagster_tests.core_tests.definitions_tests', 'bad_file_binary.yaml'),
        ('dagster_tests.core_tests.does_not_exist', 'some_file.yaml'),
    ]

    for bad_def in bad_defs:
        with pytest.raises(
                DagsterInvariantViolationError,
                match='Encountered error attempting to parse yaml',
        ):
            PresetDefinition.from_pkg_resources('bad_def', [bad_def])
Example #3
    with open(file_relative_path(__file__, 'sql/explore_visits_by_hour.sql'),
              'r') as f:
        query = f.read()

    return bq_solid_for_queries(
        [query]).alias('explore_visits_by_hour_internal')(start=start)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={
                'bigquery': bigquery_resource,
                'dataproc': dataproc_resource
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'default',
            pkg_resource_defs=[
                ('dagster_examples.gcp_data_platform.environments',
                 'default.yaml'),
            ],
        )
    ],
)
def gcp_pipeline():
    return explore_visits_by_hour(bq_load_events(events_dataproc()))
Example #4
        "file_manager": s3_file_manager,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)


# start_pipelines_marker_0
@pipeline(
    # ordered so the local is first and therefore the default
    mode_defs=[local_mode, test_mode, prod_mode],
    # end_pipelines_marker_0
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name="local_fast",
            mode="local",
            pkg_resource_defs=[
                ("airline_demo.environments", "local_base.yaml"),
                ("airline_demo.environments", "local_fast_ingest.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name="local_full",
            mode="local",
            pkg_resource_defs=[
                ("airline_demo.environments", "local_base.yaml"),
                ("airline_demo.environments", "local_full_ingest.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name="prod_fast",
            mode="prod",
            pkg_resource_defs=[
Example #5
        'tempfile': tempfile_resource,
        'file_cache': s3_file_cache,
        'file_manager': s3_file_manager,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)


@pipeline(
    # ordered so the local is first and therefore the default
    mode_defs=[local_mode, test_mode, prod_mode],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name='local_fast',
            mode='local',
            pkg_resource_defs=[
                ('airline_demo.environments', 'local_base.yaml'),
                ('airline_demo.environments', 'local_fast_ingest.yaml'),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name='local_full',
            mode='local',
            pkg_resource_defs=[
                ('airline_demo.environments', 'local_base.yaml'),
                ('airline_demo.environments', 'local_full_ingest.yaml'),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name='prod_fast',
            mode='prod',
            pkg_resource_defs=[
Example #6
            'gcs_client': gcs_client,
            'credentials_vault': credentials_vault,
            'volume': mount,
        },
        description='Mode to be used on a remote production server',
    ),
]

WEATHER_INGEST_PRESETS = [
    PresetDefinition.from_pkg_resources(
        'dev_weather_etl',
        mode='development',
        pkg_resource_defs=[
            ('dagster_examples.bay_bikes.environments',
             'dev_credentials_vault.yaml'),
            ('dagster_examples.bay_bikes.environments',
             'dev_database_resources.yaml'),
            ('dagster_examples.bay_bikes.environments',
             'file_system_resources.yaml'),
            ('dagster_examples.bay_bikes.environments', 'weather.yaml'),
        ],
        solid_selection=['weather_etl'],
    ),
    PresetDefinition.from_pkg_resources(
        'prod_weather_etl',
        mode='production',
        pkg_resource_defs=[
            ('dagster_examples.bay_bikes.environments',
             'prod_credentials_vault.yaml'),
            ('dagster_examples.bay_bikes.environments',
             'prod_database_resources.yaml'),
            ('dagster_examples.bay_bikes.environments',
Example #7
    return people.count()


emr_mode = ModeDefinition(
    name="emr",
    resource_defs={
        "pyspark_step_launcher": emr_pyspark_step_launcher,
        "pyspark": pyspark_resource,
        "s3": s3_resource,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)

emr_preset = PresetDefinition.from_pkg_resources(
    name="emr",
    mode="emr",
    pkg_resource_defs=[("emr_pyspark", "prod_resources.yaml"), ("emr_pyspark", "s3_storage.yaml")],
)


local_mode = ModeDefinition(
    name="local",
    resource_defs={"pyspark_step_launcher": no_step_launcher, "pyspark": pyspark_resource},
)


@pipeline(
    mode_defs=[emr_mode, local_mode], preset_defs=[emr_preset],
)
def my_pipeline():
    count_people(filter_over_50(make_people()))
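A preset defined this way is selected by name at execution time. Here is a brief usage sketch for the pipeline above, assuming the legacy execute_pipeline API; the local-mode call is illustrative and would still need whatever run config the solids require.

from dagster import execute_pipeline

# Run with the "emr" preset: mode "emr" plus the run config merged from
# the two packaged YAML files.
result = execute_pipeline(my_pipeline, preset="emr")

# Or select the "local" mode directly, supplying run config by hand.
result = execute_pipeline(my_pipeline, mode="local")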
Example #8
        "pyspark_step_launcher": databricks_pyspark_step_launcher,
        "pyspark": pyspark_resource,
        "s3": s3_resource,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)


@pipeline(
    mode_defs=[local_mode, prod_emr_mode, prod_databricks_mode],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name="local",
            mode="local",
            pkg_resource_defs=[
                ("dagster_examples.simple_pyspark.environments", "local.yaml"),
                ("dagster_examples.simple_pyspark.environments",
                 "filesystem_storage.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name="prod_emr",
            mode="prod_emr",
            pkg_resource_defs=[
                ("dagster_examples.simple_pyspark.environments",
                 "prod_emr.yaml"),
                ("dagster_examples.simple_pyspark.environments",
                 "s3_storage.yaml"),
            ],
        ),
        PresetDefinition.from_pkg_resources(
Example #9

@pipeline(
    description=(
        'Demo pipeline that enables configurable types of errors thrown during pipeline execution, '
        'including solid execution errors, type errors, and resource initialization errors.'
    ),
    mode_defs=[
        ModeDefinition(
            name='errorable_mode', resource_defs={'errorable_resource': define_errorable_resource()}
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'passing',
            pkg_resource_defs=[('dagster_examples.toys.environments', 'error.yaml')],
            mode='errorable_mode',
        )
    ],
)
def error_monster():
    start = emit_num.alias('start')()
    middle = num_to_str.alias('middle')(num=start)
    str_to_num.alias('end')(string=middle)


if __name__ == '__main__':
    result = execute_pipeline(
        error_monster,
        {
            'solids': {
Example #10
    ("Demo pipeline that enables configurable types of errors thrown during pipeline execution, "
     "including solid execution errors, type errors, and resource initialization errors."
     ),
    mode_defs=[
        ModeDefinition(
            name="errorable_mode",
            resource_defs={
                "errorable_resource": define_errorable_resource(),
                "io_manager": errorable_io_manager,
            },
        ),
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "passing",
            pkg_resource_defs=[("dagster_test.toys.environments", "error.yaml")
                               ],
            mode="errorable_mode",
        )
    ],
    tags={"monster": "error"},
)
def error_monster():
    start = emit_num.alias("start")()
    middle = num_to_str.alias("middle")(num=start)
    str_to_num.alias("end")(string=middle)


if __name__ == "__main__":
    result = execute_pipeline(
        error_monster,
        {
Example #11
@pipeline(
    mode_defs=[
        ModeDefinition(
            name="default",
            resource_defs={
                "s3": s3_resource,
                "snowflake": snowflake_resource,
                "spark": spark_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "default",
            pkg_resource_defs=[
                ("dagster_examples.event_pipeline_demo.environments",
                 "default.yaml"),
            ],
        )
    ],
)
def event_ingest_pipeline():
    event_ingest = create_spark_solid(
        name="event_ingest",
        main_class="io.dagster.events.EventPipeline",
        description="Ingest events from JSON to Parquet",
    )

    @solid(input_defs=[InputDefinition("start", Nothing)],
           required_resource_keys={"snowflake"})
    def snowflake_load(context):
Example #12
    return people.count()


emr_mode = ModeDefinition(
    name='emr',
    resource_defs={
        'pyspark_step_launcher': emr_pyspark_step_launcher,
        'pyspark': pyspark_resource,
        's3': s3_resource,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)

emr_preset = PresetDefinition.from_pkg_resources(
    name='emr',
    mode='emr',
    pkg_resource_defs=[('emr_pyspark', 'prod_resources.yaml'),
                       ('emr_pyspark', 's3_storage.yaml')],
)

local_mode = ModeDefinition(
    name='local',
    resource_defs={
        'pyspark_step_launcher': no_step_launcher,
        'pyspark': pyspark_resource
    },
)


@pipeline(
    mode_defs=[emr_mode, local_mode],
    preset_defs=[emr_preset],
Example #13
    return mult_df


@solid
def sum_sq_solid(_, sum_df: DataFrame, mult_df: DataFrame) -> DataFrame:
    sum_sq_df = sum_df.copy()
    sum_sq_df['sum_sq'] = sum_df['sum'] ** 2
    sum_sq_df['sum_mult_sq'] = sum_df['sum'] * mult_df['mult']
    return sum_sq_df


@pipeline(
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'test',
            pkg_resource_defs=[
                ('dagster_examples.toys.environments', 'pandas_hello_world_test.yaml')
            ],
        ),
        PresetDefinition.from_pkg_resources(
            'prod',
            pkg_resource_defs=[
                ('dagster_examples.toys.environments', 'pandas_hello_world_prod.yaml')
            ],
        ),
    ]
)
def pandas_hello_world_pipeline():
    return sum_sq_solid(sum_df=sum_solid(), mult_df=mult_solid())


@pipeline
Example #14
@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={
                's3': s3_resource,
                'snowflake': snowflake_resource,
                'spark': spark_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'default',
            pkg_resource_defs=[
                ('dagster_examples.event_pipeline_demo.environments',
                 'default.yaml'),
            ],
        )
    ],
)
def event_ingest_pipeline():
    event_ingest = create_spark_solid(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    @solid(input_defs=[InputDefinition('start', Nothing)],
           required_resource_keys={'snowflake'})
    def snowflake_load(context):
Example #15
    resource_defs={
        'pyspark_step_launcher': emr_pyspark_step_launcher,
        'pyspark': pyspark_resource,
        's3': s3_resource,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)


@pipeline(
    mode_defs=[local_mode, prod_mode],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            name='local',
            mode='local',
            pkg_resource_defs=[
                ('dagster_examples.simple_pyspark.environments', 'local.yaml'),
                ('dagster_examples.simple_pyspark.environments', 'filesystem_storage.yaml'),
            ],
        ),
        PresetDefinition.from_pkg_resources(
            name='prod',
            mode='prod',
            pkg_resource_defs=[
                ('dagster_examples.simple_pyspark.environments', 'prod.yaml'),
                ('dagster_examples.simple_pyspark.environments', 's3_storage.yaml'),
            ],
        ),
    ],
)
def simple_pyspark_sfo_weather_pipeline():
    '''Computes some basic statistics over weather data from SFO airport'''
Example #16
            "gcs_client": gcs_client,
            "credentials_vault": credentials_vault,
            "volume": mount,
        },
        description="Mode to be used on a remote production server",
    ),
]

WEATHER_INGEST_PRESETS = [
    PresetDefinition.from_pkg_resources(
        "dev_weather_etl",
        mode="development",
        pkg_resource_defs=[
            ("dagster_examples.bay_bikes.environments",
             "dev_credentials_vault.yaml"),
            ("dagster_examples.bay_bikes.environments",
             "dev_database_resources.yaml"),
            ("dagster_examples.bay_bikes.environments",
             "file_system_resources.yaml"),
            ("dagster_examples.bay_bikes.environments", "weather.yaml"),
        ],
        solid_selection=["weather_etl"],
    ),
    PresetDefinition.from_pkg_resources(
        "prod_weather_etl",
        mode="production",
        pkg_resource_defs=[
            ("dagster_examples.bay_bikes.environments",
             "prod_credentials_vault.yaml"),
            ("dagster_examples.bay_bikes.environments",
             "prod_database_resources.yaml"),
            ("dagster_examples.bay_bikes.environments",
Example #17
                    'read_csv': {
                        'inputs': {'csv_path': {'value': '../../cereal.csv'}}
                    }
                },
                'resources': {
                    'warehouse': {'config': {'conn_str': ':memory:'}}
                },
            },
            mode='unittest',
        ),
        PresetDefinition.from_pkg_resources(
            'dev',
            pkg_resource_defs=[
                (
                    'dagster_examples.intro_tutorial',
                    'presets_dev_warehouse.yaml',
                ),
                ('dagster_examples.intro_tutorial', 'presets_csv.yaml'),
            ],
            mode='dev',
        ),
    ],
)
def presets_pipeline():
    normalize_calories(read_csv())


if __name__ == '__main__':
    result = execute_pipeline(presets_pipeline, preset='unittest')
    assert result.success
Example #18
    config_fn=explore_visits_by_hour_fn,
    config_schema={"table": str},
    input_defs=[InputDefinition("start", Nothing)],
    output_defs=[OutputDefinition(List[DataFrame])],
)
def explore_visits_by_hour(start):
    with open(file_relative_path(__file__, "sql/explore_visits_by_hour.sql"), "r") as f:
        query = f.read()

    return bq_solid_for_queries([query]).alias("explore_visits_by_hour_internal")(start=start)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name="default",
            resource_defs={"bigquery": bigquery_resource, "dataproc": dataproc_resource},
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "default",
            pkg_resource_defs=[
                ("dagster_examples.gcp_data_platform.environments", "default.yaml"),
            ],
        )
    ],
)
def gcp_pipeline():
    return explore_visits_by_hour(bq_load_events(events_dataproc()))