Example #1
def define_event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    snowflake_load = SnowflakeLoadSolidDefinition(
        'snowflake_load',
        # TODO: need to pull this out to a config
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
        table='events',
    )

    return PipelineDefinition(
        name='event_ingest_pipeline',
        solids=[download_from_s3_to_file, gunzipper, event_ingest, snowflake_load],
        dependencies={
            SolidInstance('gunzipper'): {
                'gzip_file': DependencyDefinition('download_from_s3_to_file')
            },
            SolidInstance('event_ingest'): {'spark_inputs': DependencyDefinition('gunzipper')},
            # 'start' depends on the 'paths' output of event_ingest, so the
            # Snowflake load runs only after the Spark job completes.
            SolidInstance('snowflake_load'): {
                'start': DependencyDefinition('event_ingest', 'paths')
            },
        },
        mode_definitions=[
            ModeDefinition(resources={'s3': s3_resource, 'snowflake': snowflake_resource})
        ],
    )
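
A quick usage sketch for Example #1: build the pipeline and run it with execute_pipeline, feeding it a YAML environment dict. The file name environment.yaml and its contents are assumptions; the actual config schema is defined by the s3/snowflake resources and solids referenced above, which this listing does not show.

import yaml
from dagster import execute_pipeline

# Hypothetical run config file; its schema comes from the resource and solid
# definitions referenced above (not shown in this listing).
with open('environment.yaml') as f:
    environment_dict = yaml.safe_load(f)

result = execute_pipeline(define_event_ingest_pipeline(), environment_dict)
assert result.success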
Example #2
def test_run_invalid_jar():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solids=[spark_solid])
    environment_dict = yaml.safe_load(
        CONFIG_FILE.format(path=script_relative_path('.')))
    with pytest.raises(SparkSolidError,
                       match='Spark job failed. Please consult your logs.'):
        execute_pipeline(pipeline, environment_dict)
Example #3
def test_jar_not_found():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    # guid guaranteed to not exist
    environment_dict = yaml.safe_load(CONFIG_FILE.format(path=str(uuid.uuid4())))
    with pytest.raises(
        SparkSolidError,
        match='does not exist. A valid jar must be built before running this solid.',
    ):
        execute_pipeline(pipeline, environment_dict)
Example #4
@pipeline(mode_defs=[ModeDefinition(resource_defs={'s3': s3_resource, 'snowflake': snowflake_resource})])
def event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    snowflake_load = snowflake_load_parquet_solid_for_table(
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet', table='events'
    )
    # pylint: disable=no-value-for-parameter
    snowflake_load(event_ingest(spark_inputs=gunzipper(gzip_file=download_from_s3_to_file())))
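
The snowflake_load_parquet_solid_for_table factory is not part of this listing. A plausible sketch of it, modeled directly on the inlined solid in Example #8 below; the factory name and the closure over src/table are the only assumptions:

from dagster import InputDefinition, Nothing, solid

# Hypothetical factory; the original definition is not shown in this listing.
def snowflake_load_parquet_solid_for_table(src, table):
    # The Nothing-typed 'start' input carries no data; it only sequences this
    # solid after its upstream dependency (the Spark ingest step).
    @solid(input_defs=[InputDefinition('start', Nothing)], required_resource_keys={'snowflake'})
    def snowflake_load(context):
        context.resources.snowflake.load_table_from_local_parquet(
            src=src, table=table, logger=context.log
        )

    return snowflake_load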
Example #5
def test_no_spark_home():
    if 'SPARK_HOME' in os.environ:
        del os.environ['SPARK_HOME']

    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    environment_dict = yaml.safe_load(
        NO_SPARK_HOME_CONFIG_FILE.format(path=script_relative_path('.')))

    with pytest.raises(SparkSolidError) as exc_info:
        execute_pipeline(pipeline, environment_dict)

    assert str(exc_info.value) == (
        'No spark home set. You must either pass spark_home in config or set '
        '$SPARK_HOME in your environment (got None).')
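
Deleting SPARK_HOME directly from os.environ leaks into every test that runs afterwards. Since this suite already uses pytest, the monkeypatch fixture scopes the change to the test; a minimal sketch of the same setup:

def test_no_spark_home(monkeypatch):
    # Removed only for the duration of this test; raising=False tolerates
    # the variable being absent already.
    monkeypatch.delenv('SPARK_HOME', raising=False)

    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    environment_dict = yaml.safe_load(
        NO_SPARK_HOME_CONFIG_FILE.format(path=script_relative_path('.')))

    with pytest.raises(SparkSolidError):
        execute_pipeline(pipeline, environment_dict)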
Example #6
def test_step_metadata():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    environment_dict = yaml.safe_load(
        CONFIG_FILE.format(path=script_relative_path('fake.jar')))
    execution_plan = create_execution_plan(pipeline, environment_dict)

    step = execution_plan.get_step_by_key('spark_solid.compute')
    assert step.metadata == {
        'spark_submit_command': (
            '/your/spark_home/bin/spark-submit --class something '
            '--master local[*] --deploy-mode client --conf spark.app.name=test_app '
            + script_relative_path('fake.jar')
            + ' --local-path /tmp/dagster/events/data '
            '--date 2019-01-01'
        )
    }
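
The asserted spark_submit_command pins down what CONFIG_FILE must contain. A plausible reconstruction follows; the key names under solids.spark_solid.config (spark_home, master_url, deploy_mode, spark_conf, application_jar, application_arguments) are assumptions inferred from that command, not confirmed by the listing:

# NOTE: key names below are inferred from the asserted command above;
# treat this as a sketch, not the actual test fixture.
CONFIG_FILE = '''
solids:
  spark_solid:
    config:
      spark_home: /your/spark_home
      master_url: "local[*]"
      deploy_mode: client
      spark_conf:
        spark:
          app:
            name: test_app
      application_jar: "{path}"
      application_arguments: "--local-path /tmp/dagster/events/data --date 2019-01-01"
'''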
Example #7
def test_jar_not_found():
    spark_solid = SparkSolidDefinition(
        'spark_solid',
        main_class='something',
        spark_outputs=["/tmp/dagster/events/data"])
    # guid guaranteed to not exist
    environment_dict = yaml.safe_load(
        CONFIG_FILE.format(path=str(uuid.uuid4())))

    result = execute_solid(spark_solid,
                           environment_dict=environment_dict,
                           raise_on_error=False)
    assert result.failure_data
    assert (
        'does not exist. A valid jar must be built before running this solid.'
        in result.failure_data.error.message)
Example #8
@pipeline(mode_defs=[ModeDefinition(resource_defs={'s3': s3_resource, 'snowflake': snowflake_resource})])
def event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    @solid(input_defs=[InputDefinition('start', Nothing)], required_resource_keys={'snowflake'})
    def snowflake_load(context):
        # TODO: express dependency of this solid on event_ingest
        context.resources.snowflake.load_table_from_local_parquet(
            src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
            table='events',
            logger=context.log,
        )

    # pylint: disable=no-value-for-parameter
    snowflake_load(event_ingest(spark_inputs=gunzipper(gzip_file=download_from_s3_to_file())))
Example #9
def test_no_spark_home():
    if 'SPARK_HOME' in os.environ:
        del os.environ['SPARK_HOME']

    spark_solid = SparkSolidDefinition(
        'spark_solid',
        main_class='something',
        spark_outputs=["/tmp/dagster/events/data"])
    environment_dict = yaml.safe_load(
        NO_SPARK_HOME_CONFIG_FILE.format(path=script_relative_path('.')))

    result = execute_solid(spark_solid,
                           environment_dict=environment_dict,
                           raise_on_error=False)
    assert result.failure_data
    assert (
        'No spark home set. You must either pass spark_home in config or set '
        '$SPARK_HOME in your environment (got None).'
        in result.failure_data.error.message)