def define_event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    snowflake_load = SnowflakeLoadSolidDefinition(
        'snowflake_load',
        # TODO: need to pull this out to a config
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
        table='events',
    )

    return PipelineDefinition(
        name='event_ingest_pipeline',
        solids=[download_from_s3_to_file, gunzipper, event_ingest, snowflake_load],
        dependencies={
            SolidInstance('gunzipper'): {
                'gzip_file': DependencyDefinition('download_from_s3_to_file')
            },
            SolidInstance('event_ingest'): {'spark_inputs': DependencyDefinition('gunzipper')},
            SolidInstance('snowflake_load'): {
                'start': DependencyDefinition('event_ingest', 'paths')
            },
        },
        mode_definitions=[
            ModeDefinition(resources={'s3': s3_resource, 'snowflake': snowflake_resource})
        ],
    )
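# A minimal usage sketch for the pipeline above. Executing with an environment_dict
# matches the usage in the tests below; the 'environment.yaml' file name and its
# contents are assumptions, not part of the original code.
import yaml

from dagster import execute_pipeline

with open('environment.yaml') as f:  # assumed config file
    environment_dict = yaml.safe_load(f)

result = execute_pipeline(define_event_ingest_pipeline(), environment_dict)
assert result.success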
def test_run_invalid_jar():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solids=[spark_solid])
    environment_dict = yaml.load(CONFIG_FILE.format(path=script_relative_path('.')))
    with pytest.raises(SparkSolidError, match='Spark job failed. Please consult your logs.'):
        execute_pipeline(pipeline, environment_dict)
def test_jar_not_found():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    # guid guaranteed to not exist
    environment_dict = yaml.load(CONFIG_FILE.format(path=str(uuid.uuid4())))
    with pytest.raises(
        SparkSolidError,
        match='does not exist. A valid jar must be built before running this solid.',
    ):
        execute_pipeline(pipeline, environment_dict)
def event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    # TODO: express dependency of this solid on event_ingest
    snowflake_load = snowflake_load_parquet_solid_for_table(
        src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet', table='events'
    )

    # pylint: disable=no-value-for-parameter
    snowflake_load(event_ingest(spark_inputs=gunzipper(gzip_file=download_from_s3_to_file())))
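# A sketch of what a solid factory like snowflake_load_parquet_solid_for_table could
# look like, modeled on the inline snowflake_load solid shown further below. The exact
# signature and body here are assumptions for illustration, not the library's implementation.
from dagster import InputDefinition, Nothing, solid


def snowflake_load_parquet_solid_for_table(src, table):
    @solid(
        name='snowflake_load',
        input_defs=[InputDefinition('start', Nothing)],
        required_resource_keys={'snowflake'},
    )
    def snowflake_load(context):
        # Delegate the Parquet load to the snowflake resource, as in the inline variant below.
        context.resources.snowflake.load_table_from_local_parquet(
            src=src, table=table, logger=context.log
        )

    return snowflake_load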
def test_no_spark_home():
    if 'SPARK_HOME' in os.environ:
        del os.environ['SPARK_HOME']

    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    environment_dict = yaml.load(NO_SPARK_HOME_CONFIG_FILE.format(path=script_relative_path('.')))
    with pytest.raises(SparkSolidError) as exc_info:
        execute_pipeline(pipeline, environment_dict)

    assert str(exc_info.value) == (
        'No spark home set. You must either pass spark_home in config or set '
        '$SPARK_HOME in your environment (got None).'
    )
def test_step_metadata():
    spark_solid = SparkSolidDefinition('spark_solid', main_class='something')
    pipeline = PipelineDefinition(solid_defs=[spark_solid])
    environment_dict = yaml.load(CONFIG_FILE.format(path=script_relative_path('fake.jar')))
    execution_plan = create_execution_plan(pipeline, environment_dict)
    step = execution_plan.get_step_by_key('spark_solid.compute')
    assert step.metadata == {
        'spark_submit_command': (
            '/your/spark_home/bin/spark-submit --class something '
            '--master local[*] --deploy-mode client --conf spark.app.name=test_app '
            + script_relative_path('fake.jar')
            + ' --local-path /tmp/dagster/events/data '
            '--date 2019-01-01'
        )
    }
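# A sketch of the CONFIG_FILE template these tests format, reconstructed from the
# spark-submit command asserted above. The config field names are assumptions about
# the SparkSolidDefinition config schema, not copied from the library.
CONFIG_FILE = '''
solids:
  spark_solid:
    config:
      spark_home: /your/spark_home
      application_jar: "{path}"
      deploy_mode: client
      master_url: "local[*]"
      spark_conf:
        spark:
          app:
            name: test_app
      application_arguments: "--local-path /tmp/dagster/events/data --date 2019-01-01"
'''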
def test_jar_not_found():
    spark_solid = SparkSolidDefinition(
        'spark_solid', main_class='something', spark_outputs=["/tmp/dagster/events/data"]
    )
    # guid guaranteed to not exist
    environment_dict = yaml.safe_load(CONFIG_FILE.format(path=str(uuid.uuid4())))
    result = execute_solid(spark_solid, environment_dict=environment_dict, raise_on_error=False)

    assert result.failure_data
    assert (
        'does not exist. A valid jar must be built before running this solid.'
        in result.failure_data.error.message
    )
def event_ingest_pipeline():
    event_ingest = SparkSolidDefinition(
        name='event_ingest',
        main_class='io.dagster.events.EventPipeline',
        description='Ingest events from JSON to Parquet',
    )

    @solid(input_defs=[InputDefinition('start', Nothing)], required_resource_keys={'snowflake'})
    def snowflake_load(context):
        # TODO: express dependency of this solid on event_ingest
        context.resources.snowflake.load_table_from_local_parquet(
            src='file:///tmp/dagster/events/data/output/2019/01/01/*.parquet',
            table='events',
            logger=context.log,
        )

    # pylint: disable=no-value-for-parameter
    snowflake_load(event_ingest(spark_inputs=gunzipper(gzip_file=download_from_s3_to_file())))
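# For the composition-style pipeline above to find its 's3' and 'snowflake' resources,
# it needs a mode supplying them, mirroring the ModeDefinition in the earlier
# define_event_ingest_pipeline. A sketch assuming the mode_defs / resource_defs keyword
# names (these varied across Dagster releases); the ellipsis stands in for the pipeline
# body shown above.
from dagster import ModeDefinition, pipeline


@pipeline(
    mode_defs=[
        ModeDefinition(resource_defs={'s3': s3_resource, 'snowflake': snowflake_resource})
    ]
)
def event_ingest_pipeline():
    ...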
def test_no_spark_home():
    if 'SPARK_HOME' in os.environ:
        del os.environ['SPARK_HOME']

    spark_solid = SparkSolidDefinition(
        'spark_solid', main_class='something', spark_outputs=["/tmp/dagster/events/data"]
    )
    environment_dict = yaml.safe_load(
        NO_SPARK_HOME_CONFIG_FILE.format(path=script_relative_path('.'))
    )
    result = execute_solid(spark_solid, environment_dict=environment_dict, raise_on_error=False)

    assert result.failure_data
    assert (
        'No spark home set. You must either pass spark_home in config or set '
        '$SPARK_HOME in your environment (got None).' in result.failure_data.error.message
    )
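# A sketch of the NO_SPARK_HOME_CONFIG_FILE template used by the no-spark-home tests:
# presumably the same shape as CONFIG_FILE but with the spark_home entry omitted, so the
# solid falls back to $SPARK_HOME. Field names are assumptions, as in the CONFIG_FILE sketch.
NO_SPARK_HOME_CONFIG_FILE = '''
solids:
  spark_solid:
    config:
      application_jar: "{path}"
      deploy_mode: client
      master_url: "local[*]"
      spark_conf:
        spark:
          app:
            name: test_app
      application_arguments: "--local-path /tmp/dagster/events/data --date 2019-01-01"
'''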