def test_error_monster_wrong_mode():
    """Requesting a mode the pipeline never declared raises, even with valid config."""
    # All three solids share the same benign (non-failing) configuration.
    benign_solid_config = {'config': {'throw_in_solid': False, 'return_wrong_type': False}}
    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline_with_mode(
            pipeline=error_monster,
            mode='nope',
            environment_dict={
                'solids': {name: benign_solid_config for name in ('start', 'middle', 'end')},
                'resources': {
                    'errorable_resource': {'config': {'throw_on_resource_init': False}}
                },
            },
        )
def test_execute_multi_mode_with_resources():
    """The 'op' resource config selects the operand applied to three in each mode."""
    pipeline_def = define_multi_mode_with_resources_pipeline()
    # (mode, resource operand, expected result of apply_to_three)
    cases = [('add_mode', 2, 5), ('mult_mode', 3, 9)]
    for mode_name, operand, expected in cases:
        run_result = execute_pipeline_with_mode(
            pipeline=pipeline_def,
            mode=mode_name,
            environment_dict={'resources': {'op': {'config': operand}}},
        )
        assert run_result.result_for_solid('apply_to_three').output_value() == expected
def test_execute_multi_mode_loggers_with_multiple_loggers_single_config():
    """Configuring only 'foo' in a two-logger mode routes output through foo alone."""
    pipeline_def, foo_captured, bar_captured = define_multi_mode_with_loggers_pipeline()
    execute_pipeline_with_mode(
        pipeline_def,
        mode='foo_bar_mode',
        environment_dict={'loggers': {'foo': {'config': {'log_level': 'DEBUG'}}}},
    )
    foo_messages = parse_captured_results(foo_captured)
    # Exactly one captured foo message carries the pipeline's log line.
    assert sum(1 for message in foo_messages if 'Here we are' in message) == 1
    # The unconfigured bar logger must have captured nothing.
    assert not bar_captured
def test_execute_multi_mode_errors():
    """A multi-mode pipeline rejects both an unspecified mode and an unknown one."""
    multi_mode_pipeline = define_multi_mode_pipeline()
    # execute_pipeline cannot choose among multiple modes on its own.
    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline(multi_mode_pipeline)
    # An explicitly requested but undeclared mode is also rejected.
    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline_with_mode(pipeline=multi_mode_pipeline, mode='wrong_mode')
def test_execute_pipeline_with_non_existant_mode():
    """A mode name the pipeline does not define triggers an invariant violation."""
    valid_environment = {
        'solids': {
            'solid_that_uses_adder_resource': {'inputs': {'number': {'value': 4}}}
        }
    }
    # The environment itself is valid; only the mode name is bad.
    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline_with_mode(
            pipeline_with_mode, 'BAD', environment_dict=valid_environment)
def test_execute_multi_mode():
    """Both declared modes run the shared solid and yield the same value."""
    multi_mode_pipeline = define_multi_mode_pipeline()
    for mode_name in ('mode_one', 'mode_two'):
        run_result = execute_pipeline_with_mode(
            pipeline=multi_mode_pipeline, mode=mode_name)
        assert run_result.result_for_solid('return_three').output_value() == 3
def test_local():
    """The example pipeline succeeds in 'local' mode with a minimal solid config."""
    blah_config = {'config': {'foo': 'a string', 'bar': 123}}
    result = execute_pipeline_with_mode(
        pipeline=example_pipe,
        mode='local',
        environment_dict={'solids': {'blah': blah_config}},
    )
    assert result.success
def test_warehouse_resource(postgres):
    """Run the modes pipeline in 'dev' mode against a Postgres-backed warehouse.

    Args:
        postgres: fixture supplying a Postgres connection string (presumably it
            also provisions the database — confirm against conftest).
    """
    environment_dict = {
        'solids': {
            'read_csv': {
                'inputs': {
                    'csv_path': {
                        'value': 'cereal.csv'
                    }
                }
            }
        },
        'resources': {
            'warehouse': {
                'config': {
                    'conn_str': postgres
                }
            }
        },
    }
    # cereal.csv is resolved relative to the tutorial directory, so chdir there.
    with pushd(script_relative_path('../../dagster_examples/intro_tutorial/')):
        result = execute_pipeline_with_mode(
            pipeline=modes_pipeline,
            mode='dev',
            environment_dict=environment_dict,
        )
        assert result.success

    # The preset-based variant is skipped on Buildkite CI.
    if not BUILDKITE:
        with pushd(
                script_relative_path(
                    '../../dagster_examples/intro_tutorial/')):
            result = execute_pipeline_with_preset(presets_pipeline, preset_name='dev')
            assert result.success
def test_default_pyspark_decorator():
    """Two pyspark solids bound to distinct resource keys coexist in one mode.

    Each solid parallelizes a small dataset on its own named pyspark resource
    and prints a couple of rows; the test only asserts the run succeeds.
    """

    @pyspark_solid(pyspark_resource_key='first_pyspark')
    def first_pyspark_job(context):
        list_p = [('Michelle', 19), ('Austin', 29), ('Lydia', 35)]
        rdd = context.resources.first_pyspark.spark_context.parallelize(list_p)
        res = rdd.take(2)
        for name, age in res:
            # f-string replaces the dated %-formatting; output is identical for ints.
            print(f'{name}: {age}')

    @pyspark_solid(pyspark_resource_key='last_pyspark')
    def last_pyspark_job(context):
        list_p = [('John', 19), ('Jennifer', 29), ('Adam', 35), ('Henry', 50)]
        rdd = context.resources.last_pyspark.spark_context.parallelize(list_p)
        res = rdd.take(2)
        for name, age in res:
            print(f'{name}: {age}')

    @pipeline(mode_defs=[
        ModeDefinition(
            'default',
            resource_defs={
                'first_pyspark': pyspark_resource,
                'last_pyspark': pyspark_resource
            },
        )
    ])
    def pipe():
        first_pyspark_job()
        last_pyspark_job()

    assert execute_pipeline_with_mode(pipeline=pipe, mode='default').success
def test_named_pyspark_decorator():
    """@pyspark_solid honors an explicit name, description, and config schema."""

    @pyspark_solid(name='blah', description='foo bar', config={'foo': Field(str)})
    def pyspark_job(context):
        rdd = context.resources.pyspark.spark_context.parallelize(range(10))
        for item in rdd.collect():
            print(item)

    @pipeline(mode_defs=[
        ModeDefinition('default', resource_defs={'pyspark': pyspark_resource})
    ])
    def pipe():
        pyspark_job()

    # Config is keyed by the overridden solid name 'blah', not the function name.
    assert execute_pipeline_with_mode(
        pipeline=pipe,
        mode='default',
        environment_dict={
            'solids': {
                'blah': {
                    'config': {
                        'foo': 'baz'
                    }
                }
            }
        },
    ).success
def test_execute_single_mode():
    """A single-mode pipeline runs with or without naming its only mode."""
    single_mode_pipeline = define_single_mode_pipeline()
    assert single_mode_pipeline.is_single_mode is True

    # Implicit: no mode argument needed when only one mode exists.
    implicit_result = execute_pipeline(single_mode_pipeline)
    assert implicit_result.result_for_solid('return_two').output_value() == 2

    # Explicit: naming the single mode works identically.
    explicit_result = execute_pipeline_with_mode(single_mode_pipeline, mode='the_mode')
    assert explicit_result.result_for_solid('return_two').output_value() == 2
def test_airline_pipeline_1_warehouse(postgres, pg_hostname):
    """Run the warehouse pipeline in 'local' mode from layered YAML config.

    Args:
        postgres: fixture presumably provisioning the Postgres database the
            YAML config points at — confirm against conftest.
        pg_hostname: fixture supplying the Postgres hostname.
    """
    # local_warehouse.yaml overlays the shared test_base.yaml settings.
    warehouse_config_object = load_yaml_from_globs(
        config_path('test_base.yaml'), config_path('local_warehouse.yaml'))
    result_warehouse = execute_pipeline_with_mode(
        pipeline=warehouse_pipeline_def,
        mode='local',
        environment_dict=warehouse_config_object,
        # Ephemeral instance keeps run storage out of the developer's home dir.
        instance=DagsterInstance.local_temp(),
    )
    assert result_warehouse.success
def test_ingest_pipeline_fast(postgres, pg_hostname):
    """Run the ingest pipeline in 'local' mode with the fast-ingest YAML overlay.

    Args:
        postgres: fixture presumably provisioning the Postgres database the
            YAML config points at — confirm against conftest.
        pg_hostname: fixture supplying the Postgres hostname.
    """
    # local_fast_ingest.yaml overlays the shared test_base.yaml settings.
    ingest_config_dict = load_yaml_from_globs(
        config_path('test_base.yaml'), config_path('local_fast_ingest.yaml'))
    result_ingest = execute_pipeline_with_mode(
        pipeline=ingest_pipeline_def,
        mode='local',
        environment_dict=ingest_config_dict,
        # Ephemeral instance keeps run storage out of the developer's home dir.
        instance=DagsterInstance.local_temp(),
    )
    assert result_ingest.success
def test_execute_pipeline_with_mode():
    """Each mode's adder resource shifts the same input by a different amount."""
    environment_dict = {
        'solids': {
            'solid_that_uses_adder_resource': {'inputs': {'number': {'value': 4}}}
        }
    }
    # (mode, expected output for input 4)
    for mode_name, expected in (('add_one', 5), ('add_two', 6)):
        pipeline_result = execute_pipeline_with_mode(
            pipeline_with_mode,
            environment_dict=environment_dict,
            mode=mode_name,
        )
        assert pipeline_result.success
        assert pipeline_result.result_for_solid(
            'solid_that_uses_adder_resource').output_value() == expected
def test_execute_multi_mode_loggers_with_single_logger_extra_config():
    """Supplying config for a logger the mode does not define is a config error."""
    pipeline_def, _, __ = define_multi_mode_with_loggers_pipeline()
    with pytest.raises(DagsterInvalidConfigError):
        execute_pipeline_with_mode(
            pipeline=pipeline_def,
            mode='foo_mode',
            environment_dict={
                'loggers': {
                    # 'foo' belongs to foo_mode; 'bar' does not and must be rejected.
                    'foo': {
                        'config': {
                            'log_level': 'DEBUG'
                        }
                    },
                    'bar': {
                        'config': {
                            'log_level': 'DEBUG'
                        }
                    },
                }
            },
        )
def test_bad_requirements_txt():
    """Pointing the pyspark resource at a missing requirements file fails fast."""
    pyspark_config = {
        'requirements_file_path': 'DOES_NOT_EXIST',
        'pipeline_file': __file__,
        'pipeline_fn_name': 'example_pipe',
        'cluster_id': 'some_cluster_id',
        'staging_bucket': 'dagster-scratch-80542c2',
        'region_name': 'us-west-1',
    }
    with pytest.raises(DagsterInvalidDefinitionError) as exc_info:
        execute_pipeline_with_mode(
            pipeline=example_pipe,
            mode='prod',
            environment_dict={
                'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
                'resources': {'pyspark': {'config': pyspark_config}},
            },
        )
    assert 'The requirements.txt file that was specified does not exist' in str(exc_info.value)
def test_simple_pyspark_decorator():
    """A bare @pyspark_solid works with the default 'pyspark' resource key."""

    @pyspark_solid
    def pyspark_job(context):
        rdd = context.resources.pyspark.spark_context.parallelize(range(10))
        for item in rdd.collect():
            print(item)

    @pipeline(mode_defs=[
        ModeDefinition('default', resource_defs={'pyspark': pyspark_resource})
    ])
    def pipe():
        pyspark_job()

    assert execute_pipeline_with_mode(pipeline=pipe, mode='default').success
def test_pyspark_decorator_with_arguments():
    """A pyspark solid can accept an upstream solid's output as a typed input."""

    @solid
    def produce_number(_):
        return 10

    @pyspark_solid(input_defs=[InputDefinition('count', int)])
    def pyspark_job(context, count):
        # 'count' arrives from produce_number via the pipeline wiring below.
        rdd = context.resources.pyspark.spark_context.parallelize(range(count))
        for item in rdd.collect():
            print(item)

    @pipeline(mode_defs=[
        ModeDefinition('default', resource_defs={'pyspark': pyspark_resource})
    ])
    def pipe():
        pyspark_job(produce_number())

    assert execute_pipeline_with_mode(pipeline=pipe, mode='default').success
def test_pyspark_emr(mock_wait):
    """Launch an EMR job flow and run the example pipeline on it in 'prod' mode.

    Args:
        mock_wait: mocked EMR waiter; asserted to have been invoked exactly once.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )
    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context, run_job_flow_args)
    result = execute_pipeline_with_mode(
        pipeline=example_pipe,
        mode='prod',
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'cluster_id': cluster_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
    )
    assert result.success
    # BUG FIX: `assert mock_wait.called_once` was always truthy — Mock
    # auto-creates unknown attributes, so `.called_once` is just a new Mock
    # object, not a check. Use the real assertion API instead.
    mock_wait.assert_called_once()
def test_do_it_live_emr():
    """Run the example pipeline on a real, pre-provisioned EMR cluster.

    Requires the AWS_EMR_JOB_FLOW_ID environment variable to point at a live
    cluster; presumably 'wait_for_logs' blocks until driver logs are retrieved
    (confirm against the pyspark resource config schema).
    """
    result = execute_pipeline_with_mode(
        pipeline=example_pipe,
        mode='prod',
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        # None if the env var is unset; the resource is expected
                        # to reject that rather than silently target nothing.
                        'cluster_id': os.environ.get('AWS_EMR_JOB_FLOW_ID'),
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                        'wait_for_logs': True,
                    }
                }
            },
        },
    )
    assert result.success
def test_error_monster_success():
    """With every failure toggle off, the pipeline succeeds by default and by explicit mode."""

    def benign_environment():
        # Fresh dicts per run so neither execution can see the other's config objects.
        solid_config = {'config': {'throw_in_solid': False, 'return_wrong_type': False}}
        return {
            'solids': {name: dict(solid_config) for name in ('start', 'middle', 'end')},
            'resources': {
                'errorable_resource': {'config': {'throw_on_resource_init': False}}
            },
        }

    # Default-mode execution.
    assert execute_pipeline(error_monster, environment_dict=benign_environment()).success

    # Explicitly naming the mode must behave the same.
    assert execute_pipeline_with_mode(
        pipeline=error_monster,
        mode='errorable_mode',
        environment_dict=benign_environment(),
    ).success
def test_generate_training_set(mocker):
    """End-to-end run of the bay-bikes training-set pipeline in 'testing' mode.

    Checks the traffic and weather intermediate dataframes, the assembled
    training set and labels, and the GCS materialization event metadata,
    then removes the temporary storage directory.

    Args:
        mocker: pytest-mock fixture; used to stub out read_sql_table.
    """
    # Replace database reads with canned frames so the run is hermetic.
    mocker.patch('dagster_examples.bay_bikes.solids.read_sql_table',
                 side_effect=mock_read_sql)

    # Execute Pipeline
    test_pipeline_result = execute_pipeline_with_mode(
        pipeline=generate_test_training_set_pipeline,
        mode='testing',
        environment_dict=compose_training_data_env_dict(),
    )
    assert test_pipeline_result.success

    # Check solids
    EXPECTED_TRAFFIC_RECORDS = [
        {
            'interval_date': date(2019, 7, 31),
            'peak_traffic_load': 1,
            'time': Timestamp('2019-07-31 00:00:00'),
        },
        {
            'interval_date': date(2019, 8, 31),
            'peak_traffic_load': 1,
            'time': Timestamp('2019-08-31 00:00:00'),
        },
    ]
    traffic_dataset = test_pipeline_result.output_for_solid(
        'transform_into_traffic_dataset',
        output_name='traffic_dataframe').to_dict('records')
    assert all(record in EXPECTED_TRAFFIC_RECORDS for record in traffic_dataset)

    EXPECTED_WEATHER_RECORDS = [
        {
            'time': Timestamp('2019-08-31 00:00:00'),
            'summary': 'Clear throughout the day.',
            'icon': 'clear-day',
            'sunriseTime': 1546269960,
            'sunsetTime': 1546304520,
            'precipIntensity': 0.0007,
            'precipIntensityMax': 0.0019,
            'precipProbability': 0.05,
            'precipType': 'rain',
            'temperatureHigh': 56.71,
            'temperatureHighTime': 1546294020,
            'temperatureLow': 44.75,
            'temperatureLowTime': 1546358040,
            'dewPoint': 28.34,
            'humidity': 0.43,
            'pressure': 1017.7,
            'windSpeed': 12.46,
            'windGust': 26.85,
            'windGustTime': 1546289220,
            'windBearing': 0,
            'cloudCover': 0.11,
            'uvIndex': 2,
            'uvIndexTime': 1546287180,
            'visibility': 10,
            'ozone': 314.4,
        },
        {
            'time': Timestamp('2019-07-31 00:00:00'),
            'summary': 'Clear throughout the day.',
            'icon': 'clear-day',
            'sunriseTime': 1546356420,
            'sunsetTime': 1546390920,
            'precipIntensity': 0.0005,
            'precipIntensityMax': 0.0016,
            'precipProbability': 0.02,
            'precipType': 'sunny',
            'temperatureHigh': 55.91,
            'temperatureHighTime': 1546382040,
            'temperatureLow': 41.18,
            'temperatureLowTime': 1546437660,
            'dewPoint': 20.95,
            'humidity': 0.33,
            'pressure': 1023.3,
            'windSpeed': 6.77,
            'windGust': 22.08,
            'windGustTime': 1546343340,
            'windBearing': 22,
            'cloudCover': 0.1,
            'uvIndex': 2,
            'uvIndexTime': 1546373580,
            'visibility': 10,
            'ozone': 305.3,
        },
    ]
    weather_dataset = test_pipeline_result.output_for_solid(
        'produce_weather_dataset',
        output_name='weather_dataframe').to_dict('records')
    assert all(record in EXPECTED_WEATHER_RECORDS for record in weather_dataset)

    # Ensure we are generating the expected training set
    training_set, labels = test_pipeline_result.output_for_solid(
        'produce_training_set')
    assert len(labels) == 1 and labels[0] == 1
    # Feature rows mirror the weather records above: the July row first
    # (1546356420... values), then the August row.
    assert array_equal(
        training_set,
        [[
            [
                1546356420.0,
                1546390920.0,
                0.0005,
                0.0016,
                0.02,
                55.91,
                1546382040.0,
                41.18,
                1546437660.0,
                20.95,
                0.33,
                1023.3,
                6.77,
                22.08,
                1546343340.0,
                22.0,
                0.1,
                2.0,
                1546373580.0,
                10.0,
                305.3,
            ],
            [
                1546269960.0,
                1546304520.0,
                0.0007,
                0.0019,
                0.05,
                56.71,
                1546294020.0,
                44.75,
                1546358040.0,
                28.34,
                0.43,
                1017.7,
                12.46,
                26.85,
                1546289220.0,
                0.0,
                0.11,
                2.0,
                1546287180.0,
                10.0,
                314.4,
            ],
        ]],
    )

    # Exactly one materialization event should come from the GCS upload solid.
    materialization_events = [
        event for event in test_pipeline_result.step_event_list
        if event.solid_name == 'upload_training_set_to_gcs'
        and event.event_type_value == 'STEP_MATERIALIZATION'
    ]
    assert len(materialization_events) == 1
    materialization = materialization_events[
        0].event_specific_data.materialization
    assert materialization.label == 'GCS Blob'
    materialization_event_metadata = materialization.metadata_entries
    assert len(materialization_event_metadata) == 1
    assert materialization_event_metadata[
        0].label == 'google cloud storage URI'
    assert materialization_event_metadata[0].entry_data.text.startswith(
        'gs://dagster-scratch-ccdfe1e/training_data')

    # Clean up
    shutil.rmtree(os.path.join(tempfile.gettempdir(), 'testing-storage'),
                  ignore_errors=True)
mode_defs=[ ModeDefinition( name='unittest', resource_defs={'warehouse': local_sqlite_warehouse_resource}, ), ModeDefinition( name='dev', resource_defs={ 'warehouse': sqlachemy_postgres_warehouse_resource }, ), ] ) def modes_pipeline(): normalize_calories(read_csv()) if __name__ == '__main__': environment_dict = { 'solids': { 'read_csv': {'inputs': {'csv_path': {'value': 'cereal.csv'}}} }, 'resources': {'warehouse': {'config': {'conn_str': ':memory:'}}}, } result = execute_pipeline_with_mode( pipeline=modes_pipeline, mode='unittest', environment_dict=environment_dict, ) assert result.success
def test_wrong_single_mode():
    """Even a single-mode pipeline rejects a mode name it does not define."""
    with pytest.raises(DagsterInvariantViolationError):
        # The execute call itself raises; the output check is never reached.
        run_result = execute_pipeline_with_mode(
            pipeline=define_single_mode_pipeline(), mode='wrong_mode')
        assert run_result.result_for_solid('return_two').output_value() == 2