def test_ecs_run_launcher_inits():
    """Build a temporary instance configured with ``ECSRunLauncher``.

    There are no explicit assertions: the test passes as long as creating the
    instance with the run launcher override does not raise.
    """
    launcher_override = {
        "module": "dagster_aws.ecs.launcher",
        "class": "ECSRunLauncher",
    }
    DagsterInstance.local_temp(overrides={"run_launcher": launcher_override})
def test_yielded_results_config():
    """Run ``hello_world_pipeline`` in ``basic`` mode and verify its validation output.

    Checks that the ``reyielder`` solid reports a 100 percent success rate and
    that ``ge_validation_solid`` produced exactly one successful expectation
    result whose first metadata entry holds the expected ``overall`` statistics.
    """
    ge_root_dir = file_relative_path(__file__, "./great_expectations")
    run_config = {
        'resources': {'ge_data_context': {'config': {'ge_root_dir': ge_root_dir}}}
    }

    result = execute_pipeline(
        reconstructable(hello_world_pipeline),
        run_config=run_config,
        mode='basic',
        instance=DagsterInstance.local_temp(),
    )

    reyielded = result.result_for_solid("reyielder").output_value()
    assert reyielded[0]["success_percent"] == 100

    validation_result = result.result_for_solid("ge_validation_solid")
    expectations = validation_result.expectation_results_during_compute
    assert len(expectations) == 1

    mainexpect = expectations[0]
    assert mainexpect.success

    overall = mainexpect.metadata_entries[0].entry_data.data['overall']
    assert overall == {
        'evaluated_expectations': 11,
        'success_percent': 100.0,
        'successful_expectations': 11,
        'unsuccessful_expectations': 0,
    }
def temp_instance():
    """Yield a ``DagsterInstance`` created inside a temporary directory.

    This is a generator; it is presumably wrapped by a context-manager or
    fixture decorator that is outside this view. Once the consumer is finished
    (normally or via an exception), the instance's run launcher is joined
    before the ``with`` block around the temporary directory exits.
    """
    with seven.TemporaryDirectory() as scratch_dir:
        scratch_instance = DagsterInstance.local_temp(scratch_dir)
        try:
            yield scratch_instance
        finally:
            # Join the run launcher while the temporary directory still exists.
            scratch_instance.run_launcher.join()
def test_multiprocess_executor():
    """Execute ``predict_color`` with the multiprocess executor and expect success."""
    run_config = {
        # This section controls how the run will be executed.
        # The multiprocess executor runs each step in its own sub process.
        "execution": {"multiprocess": {}},
        # This section controls how values will be passed from one solid to the next.
        # The default is in memory, so here we set it to filesystem to allow the
        # separate subprocess to get the values
        "intermediate_storage": {"filesystem": {}},
    }

    result = execute_pipeline(
        run_config=run_config,
        # The default instance for this API is an in memory ephemeral one.
        # To allow the multiple processes to coordinate we use one here
        # backed by a temporary directory.
        instance=DagsterInstance.local_temp(),
        # A ReconstructablePipeline is necessary to load the pipeline in child processes.
        # reconstructable() is a utility function that captures where the
        # PipelineDefinition came from.
        pipeline=reconstructable(predict_color),
    )

    assert result.success
def test_pipeline_run_creation_race():
    """Register the same run id twice, simulating a creation race on the second call.

    The run storage's ``add_run`` is wrapped in a spy so that both its return
    values and the exceptions it raises can be inspected. The first
    registration must add the run. The second is made while ``has_run`` is
    patched to return ``False``; ``add_run`` is then expected to raise
    ``DagsterRunAlreadyExists`` and the already stored run must be returned,
    leaving exactly one run on the instance.
    """
    with seven.TemporaryDirectory() as tempdir:
        instance = DagsterInstance.local_temp(tempdir)
        run_id = 'run_id'

        # Spy on the result of add_run
        add_run_spy = Spy(instance._run_storage.add_run)  # pylint: disable=protected-access
        add_run_mock = mock.MagicMock(side_effect=add_run_spy)
        instance._run_storage.add_run = add_run_mock  # pylint: disable=protected-access

        # This invocation should successfully add the run to run storage
        pipeline_run = register_managed_run_for_test(instance, run_id=run_id)
        assert len(add_run_mock.call_args_list) == 1
        assert instance.has_run(run_id)

        # Check that add_run did not receive DagsterRunAlreadyExists exception and that
        # it successfully returned
        assert add_run_spy.exceptions == []
        assert len(add_run_spy.return_values) == 1

        # (*) Simulate a race where second invocation receives has_run() is False.
        # No placeholder value is needed beforehand: the name is always bound
        # inside the block before it is read below.
        with mock.patch.object(instance, 'has_run', mock.MagicMock(return_value=False)):
            fetched_pipeline_run = register_managed_run_for_test(instance, run_id=run_id)

        # Check that add_run received DagsterRunAlreadyExists exception and did not return value
        assert len(add_run_mock.call_args_list) == 2
        assert add_run_spy.exceptions == [DagsterRunAlreadyExists]
        assert len(add_run_spy.return_values) == 1

        assert pipeline_run == fetched_pipeline_run
        assert instance.has_run(run_id)
        assert len(instance.get_runs()) == 1
def test_multiple_local_cluster():
    """Run ``scheduler_info_pipeline`` against two different local dask cluster shapes.

    Each cluster configuration is passed both to the pipeline (as the ``dask``
    resource's ``local`` cluster config) and to ``_assert_scheduler_info_result``
    together with the execution result.
    """
    # (n_workers, threads_per_worker) pairs to exercise.
    cluster_shapes = ((1, 2), (2, 1))

    for n_workers, threads_per_worker in cluster_shapes:
        cluster_config = {
            "n_workers": n_workers,
            "threads_per_worker": threads_per_worker,
            "dashboard_address": None,
        }
        dask_resource = {"config": {"cluster": {"local": cluster_config}}}
        result = execute_pipeline(
            scheduler_info_pipeline,
            run_config={"resources": {"dask": dask_resource}},
            instance=DagsterInstance.local_temp(),
        )
        _assert_scheduler_info_result(result, cluster_config)
def test_execute_pipeline_iterator():
    """Close the pipeline iterator at the first step start and check the aftermath.

    Every record sent to the ``callback`` logger is collected. After the
    iterator is closed early, the collected records must contain at least one
    pipeline failure event and the ``CLEANING A`` / ``CLEANING B`` messages.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    mode_def = ModeDefinition(
        resource_defs={'a': resource_a, 'b': resource_b},
        logger_defs={'callback': construct_event_logger(event_callback)},
    )
    pipeline = PipelineDefinition(
        name='basic_resource_pipeline',
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )
    iterator = execute_pipeline_iterator(
        pipeline,
        environment_dict={'loggers': {'callback': {}}},
        instance=DagsterInstance.local_temp(),
    )

    # Consume events up to and including the first STEP_START, then stop early.
    while next(iterator).event_type_value != 'STEP_START':
        pass
    iterator.close()

    events = [record.dagster_event for record in records if record.is_dagster_event]
    messages = [record.user_message for record in records if not record.is_dagster_event]

    assert any(event.is_pipeline_failure for event in events)
    assert 'CLEANING A' in messages
    assert 'CLEANING B' in messages
def test_multiple_outputs_only_emit_one_multiproc():
    """Run the ``define_multi_out`` pipeline with the multiprocess executor.

    Verifies that the ``multiple_outputs`` solid only exposes ``output_one``,
    that asking for an undefined or un-emitted output (or an unknown solid)
    raises ``DagsterInvariantViolationError``, and that ``downstream_two`` is
    reported as skipped.
    """
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'define_multi_out')
    pipe = handle.build_pipeline_definition()

    result = execute_pipeline(
        pipe,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.output_values.keys()) == {'output_one'}

    # An output name the solid never declared.
    with pytest.raises(
        DagsterInvariantViolationError,
        match="Output 'not_defined' not defined in solid 'multiple_outputs'",
    ):
        solid_result.output_value('not_defined')

    # A declared output that was not emitted during this run.
    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.output_value('output_two')

    unknown_solid_message = (
        'Tried to get result for solid not_present in multiple_outputs_only_emit_one_pipeline. '
        'No such top level solid.'
    )
    with pytest.raises(DagsterInvariantViolationError, match=unknown_solid_message):
        result.result_for_solid('not_present')

    assert result.result_for_solid('downstream_two').skipped
def test_sync_run_launcher_run():
    """Launch ``noop_pipeline`` through ``SyncInMemoryRunLauncher`` and expect success.

    The run is created on a temporary instance, launched via the instance's
    run launcher, and then re-read from the instance by id to check its status.
    """
    launcher_override = {
        "module": "dagster.core.launcher.sync_in_memory_run_launcher",
        "class": "SyncInMemoryRunLauncher",
    }

    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.local_temp(
            temp_dir, overrides={"run_launcher": launcher_override}
        )

        external_pipeline = get_main_external_repo(instance).get_full_external_pipeline(
            "noop_pipeline"
        )

        created_run = create_run_for_test(
            instance=instance, pipeline_name=external_pipeline.name
        )
        launched_run = instance.run_launcher.launch_run(
            instance=instance, run=created_run, external_pipeline=external_pipeline
        )

        completed_run = instance.get_run_by_id(launched_run.run_id)
        assert completed_run.is_success
def test_multiproc_markers():
    """Check that every engine-event start marker has a matching, later end marker.

    The ``define_pipeline`` pipeline is run with the multiprocess executor and
    filesystem storage. All logged events for the run are scanned; engine
    events that carry ``marker_start`` / ``marker_end`` are recorded by
    ``"<step_key>.<marker>"`` with their timestamps. The two key sets must be
    equal, each end timestamp must be strictly greater than its start, and the
    ``ping.compute.multiprocess_subprocess_init`` marker must be present.
    """
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_pipeline'
    ).build_pipeline_definition()
    instance = DagsterInstance.local_temp()

    result = execute_pipeline(
        pipe,
        instance=instance,
        environment_dict={'execution': {'multiprocess': {}}, 'storage': {'filesystem': {}}},
    )
    assert result.success

    start_markers = {}
    end_markers = {}
    for event in instance.all_logs(result.run_id):
        dagster_event = event.dagster_event
        # Skip records that have no attached dagster event (if any) rather than
        # failing with AttributeError; only engine events carry markers.
        if dagster_event is None or not dagster_event.is_engine_event:
            continue

        engine_data = dagster_event.engine_event_data
        if engine_data.marker_start:
            key = '{step}.{marker}'.format(step=event.step_key, marker=engine_data.marker_start)
            start_markers[key] = event.timestamp
        if engine_data.marker_end:
            key = '{step}.{marker}'.format(step=event.step_key, marker=engine_data.marker_end)
            end_markers[key] = event.timestamp

    assert set(start_markers.keys()) == set(end_markers.keys())
    for key in end_markers:
        assert end_markers[key] - start_markers[key] > 0

    assert 'ping.compute.multiprocess_subprocess_init' in end_markers
def test_yielded_results_config_pandas(snapshot):
    """Run ``hello_world_pandas_pipeline`` and snapshot its validation markdown.

    Checks that ``reyielder`` reports a 100 percent success rate, that
    ``ge_validation_solid`` produced exactly one successful expectation result,
    and that the part of its markdown metadata before ``"### Info"`` matches
    the stored snapshot.
    """
    ge_root_dir = file_relative_path(__file__, "./great_expectations")
    run_config = {
        "resources": {"ge_data_context": {"config": {"ge_root_dir": ge_root_dir}}}
    }

    result = execute_pipeline(
        reconstructable(hello_world_pandas_pipeline),
        run_config=run_config,
        mode="basic",
        instance=DagsterInstance.local_temp(),
    )

    reyielded = result.result_for_solid("reyielder").output_value()
    assert reyielded[0]["success_percent"] == 100

    validation_result = result.result_for_solid("ge_validation_solid")
    expectations = validation_result.expectation_results_during_compute
    assert len(expectations) == 1

    mainexpect = expectations[0]
    assert mainexpect.success

    # purge system specific metadata for testing
    markdown = mainexpect.metadata_entries[0].entry_data.md_str
    metadata = markdown.split("### Info")[0]
    snapshot.assert_match(metadata)
def test_execute_celery_docker():
    """Execute ``docker_celery_pipeline`` with the ``celery-docker`` executor.

    On Buildkite, registry credentials are taken from an ECR authorization
    token and added to the docker config. Elsewhere, the test image is built
    only when it is not already present locally. The pipeline is then run on a
    temporary instance with eager celery tasks and must succeed.
    """
    docker_image = test_project_docker_image()
    docker_config = {
        "image": docker_image,
        "env_vars": ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
    }

    if IS_BUILDKITE:
        ecr_client = boto3.client("ecr", region_name="us-west-1")
        token = ecr_client.get_authorization_token()
        # The decoded token is split on ":" into a username and a password.
        decoded_token = base64.b64decode(
            token["authorizationData"][0]["authorizationToken"]
        ).decode()
        username, password = decoded_token.split(":")
        registry = token["authorizationData"][0]["proxyEndpoint"]

        docker_config["registry"] = {
            "url": registry,
            "username": username,
            "password": password,
        }
    else:
        try:
            client = docker.from_env()
            client.images.get(docker_image)
            print(  # pylint: disable=print-call
                "Found existing image tagged {image}, skipping image build. To rebuild, first run: "
                "docker rmi {image}".format(image=docker_image)
            )
        except docker.errors.ImageNotFound:
            build_and_tag_test_image(docker_image)

    with seven.TemporaryDirectory() as temp_dir:
        base_config = merge_yamls(
            [
                os.path.join(test_project_environments_path(), "env.yaml"),
                os.path.join(test_project_environments_path(), "env_s3.yaml"),
            ]
        )
        executor_config = {
            "execution": {
                "celery-docker": {
                    "config": {
                        "docker": docker_config,
                        "config_source": {"task_always_eager": True},
                    }
                }
            },
        }
        run_config = merge_dicts(base_config, executor_config)

        result = execute_pipeline(
            get_test_project_recon_pipeline("docker_celery_pipeline"),
            run_config=run_config,
            instance=DagsterInstance.local_temp(temp_dir),
        )
        assert result.success
def test_execute_celery_docker():
    """Execute ``docker_celery_pipeline`` with the ``celery-docker`` executor.

    On Buildkite, registry credentials are taken from an ECR authorization
    token and added to the docker config. Elsewhere, the test image is built
    only when it is not already present locally. The pipeline is then run on a
    temporary instance with eager celery tasks and must succeed.
    """
    docker_image = test_project_docker_image()
    docker_config = {
        'image': docker_image,
        'env_vars': ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'],
    }

    if IS_BUILDKITE:
        ecr_client = boto3.client('ecr', region_name='us-west-1')
        token = ecr_client.get_authorization_token()
        # The decoded token is split on ':' into a username and a password.
        decoded_token = base64.b64decode(
            token['authorizationData'][0]['authorizationToken']
        ).decode()
        username, password = decoded_token.split(':')
        registry = token['authorizationData'][0]['proxyEndpoint']

        docker_config['registry'] = {
            'url': registry,
            'username': username,
            'password': password,
        }
    else:
        try:
            client = docker.from_env()
            client.images.get(docker_image)
            print(  # pylint: disable=print-call
                'Found existing image tagged {image}, skipping image build. To rebuild, first run: '
                'docker rmi {image}'.format(image=docker_image)
            )
        except docker.errors.ImageNotFound:
            build_and_tag_test_image(docker_image)

    with seven.TemporaryDirectory() as temp_dir:
        base_config = merge_yamls(
            [
                os.path.join(test_project_environments_path(), 'env.yaml'),
                os.path.join(test_project_environments_path(), 'env_s3.yaml'),
            ]
        )
        executor_config = {
            'execution': {
                'celery-docker': {
                    'config': {
                        'docker': docker_config,
                        'config_source': {'task_always_eager': True},
                    }
                }
            },
        }
        run_config = merge_dicts(base_config, executor_config)

        result = execute_pipeline(
            get_test_project_recon_pipeline('docker_celery_pipeline'),
            run_config=run_config,
            instance=DagsterInstance.local_temp(temp_dir),
        )
        assert result.success
def test_execute_plan_iterator():
    """Close the plan iterator at the first step start and check cleanup messages.

    Every record sent to the ``callback`` logger is collected. After the
    iterator is closed early, the non-event records must include the
    ``CLEANING A`` and ``CLEANING B`` messages.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    instance = DagsterInstance.local_temp()

    mode_def = ModeDefinition(
        resource_defs={'a': resource_a, 'b': resource_b},
        logger_defs={'callback': construct_event_logger(event_callback)},
    )
    pipeline = PipelineDefinition(
        name='basic_resource_pipeline',
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )
    environment_dict = {'loggers': {'callback': {}}}

    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=pipeline.name,
            run_id=make_new_run_id(),
            environment_dict={'loggers': {'callback': {}}},
            mode='default',
            status=PipelineRunStatus.NOT_STARTED,
        )
    )

    execution_plan = create_execution_plan(pipeline, environment_dict)
    iterator = execute_plan_iterator(
        execution_plan, pipeline_run, instance, environment_dict=environment_dict
    )

    # Consume events up to and including the first STEP_START, then stop early.
    while next(iterator).event_type_value != 'STEP_START':
        pass
    iterator.close()

    messages = [record.user_message for record in records if not record.is_dagster_event]
    assert 'CLEANING A' in messages
    assert 'CLEANING B' in messages
def test_execute_plan_iterator():
    """Close the plan iterator at the first step start and check cleanup messages.

    Every record sent to the ``callback`` logger is collected. After the
    iterator is closed early, the non-event records must include the
    ``CLEANING A`` and ``CLEANING B`` messages.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    instance = DagsterInstance.local_temp()

    mode_def = ModeDefinition(
        resource_defs={'a': resource_a, 'b': resource_b},
        logger_defs={'callback': construct_event_logger(event_callback)},
    )
    pipeline = PipelineDefinition(
        name='basic_resource_pipeline',
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )
    run_config = {'loggers': {'callback': {}}}

    execution_plan = create_execution_plan(pipeline, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline,
        run_config={'loggers': {'callback': {}}},
        execution_plan=execution_plan,
    )

    iterator = execute_plan_iterator(
        execution_plan, pipeline_run, instance, run_config=run_config
    )

    # Consume events up to and including the first STEP_START, then stop early.
    while next(iterator).event_type_value != 'STEP_START':
        pass
    iterator.close()

    messages = [record.user_message for record in records if not record.is_dagster_event]
    assert 'CLEANING A' in messages
    assert 'CLEANING B' in messages
def test_execute_run_iterator():
    """Close the run iterator at the first step start and check the aftermath.

    Every record sent to the ``callback`` logger is collected. After the
    iterator is closed early, the collected records must contain at least one
    pipeline failure event and the ``CLEANING A`` / ``CLEANING B`` messages.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    instance = DagsterInstance.local_temp()

    mode_def = ModeDefinition(
        resource_defs={"a": resource_a, "b": resource_b},
        logger_defs={"callback": construct_event_logger(event_callback)},
    )
    pipeline_def = PipelineDefinition(
        name="basic_resource_pipeline",
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        run_config={"loggers": {"callback": {}}},
        mode="default",
    )

    iterator = execute_run_iterator(
        InMemoryExecutablePipeline(pipeline_def), pipeline_run, instance=instance
    )

    # Consume events up to and including the first STEP_START, then stop early.
    while next(iterator).event_type_value != "STEP_START":
        pass
    iterator.close()

    events = [record.dagster_event for record in records if record.is_dagster_event]
    messages = [record.user_message for record in records if not record.is_dagster_event]

    assert any(event.is_pipeline_failure for event in events)
    assert "CLEANING A" in messages
    assert "CLEANING B" in messages
def test_dask_pipeline():
    """Run ``dask_pipeline`` with the docs-snippet YAML config and check its output.

    The run must succeed and the ``hello_world`` solid must output
    ``"Hello, World!"``.
    """
    config_path = file_relative_path(
        __file__, "../../docs_snippets/deploying/dask_hello_world.yaml"
    )
    run_config = load_yaml_from_globs(config_path)

    result = execute_pipeline(
        reconstructable(dask_pipeline),
        run_config=run_config,
        instance=DagsterInstance.local_temp(),
    )

    assert result.success
    hello_output = result.result_for_solid("hello_world").output_value()
    assert hello_output == "Hello, World!"
def test_execute_pipeline_iterator():
    """Close the pipeline iterator at the first step start and check the aftermath.

    Every record sent to the ``callback`` logger is collected. After the
    iterator is closed early, exactly one pipeline failure event is expected,
    its error message must mention ``GeneratorExit``, and the ``CLEANING A`` /
    ``CLEANING B`` messages must have been logged.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    mode_def = ModeDefinition(
        resource_defs={"a": resource_a, "b": resource_b},
        logger_defs={"callback": construct_event_logger(event_callback)},
    )
    pipeline = PipelineDefinition(
        name="basic_resource_pipeline",
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )
    iterator = execute_pipeline_iterator(
        pipeline,
        run_config={"loggers": {"callback": {}}},
        instance=DagsterInstance.local_temp(),
    )

    # Consume events up to and including the first STEP_START, then stop early.
    while next(iterator).event_type_value != "STEP_START":
        pass
    iterator.close()

    events = [record.dagster_event for record in records if record.is_dagster_event]
    messages = [record.user_message for record in records if not record.is_dagster_event]

    pipeline_failure_events = [event for event in events if event.is_pipeline_failure]
    assert len(pipeline_failure_events) == 1

    failure_message = pipeline_failure_events[0].pipeline_failure_data.error.message
    assert "GeneratorExit" in failure_message

    assert "CLEANING A" in messages
    assert "CLEANING B" in messages
def test_pipeline(pg_hostname, postgres):  # pylint: disable=unused-argument
    """Check the solids of ``dbt_example_pipeline`` and run it in ``dev`` mode.

    First asserts the exact set of solid names in the pipeline definition,
    then executes the pipeline on a temporary instance with
    ``raise_on_error=False`` and asserts that the run succeeded.
    ``pg_hostname`` is interpolated into the database URL; ``postgres`` is not
    used in the body.
    """
    reconstructable_pipeline = ReconstructablePipeline.for_module(
        "dbt_example", "dbt_example_pipeline"
    )
    solid_names = {
        solid.name for solid in reconstructable_pipeline.get_definition().solids
    }
    assert solid_names == {
        "download_file",
        "load_cereals_from_csv",
        "run_cereals_models",
        "test_cereals_models",
        "analyze_cereals",
        "post_plot_to_slack",
    }

    with TemporaryDirectory() as tempdir:
        res = execute_pipeline(
            ReconstructablePipeline.for_module("dbt_example", "dbt_example_pipeline"),
            instance=DagsterInstance.local_temp(tempdir=tempdir),
            mode="dev",
            run_config={
                "solids": {
                    "download_file": {
                        "config": {
                            "url": CEREALS_DATASET_URL,
                            "target_path": "cereals.csv",
                        }
                    },
                    "post_plot_to_slack": {"config": {"channels": ["foo_channel"]}},
                },
                "resources": {
                    "db": {
                        "config": {
                            "db_url": (
                                f"postgresql://*****:*****@{pg_hostname}"
                                ":5432/dbt_example"
                            )
                        }
                    },
                    "slack": {"config": {"token": "nonce"}},
                },
            },
            raise_on_error=False,
        )
        assert res.success
def test_pipeline_tags():
    """Run a pipeline built from an Airflow DAG with an execution-date tag.

    The resulting compute logs are checked via ``check_compute_logs`` against
    ``EXECUTION_DATE_MINUS_WEEK_FMT``.
    """
    dag = get_dag()

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    # When mode is default and tags are set, run with tags
    tags = {AIRFLOW_EXECUTION_DATE_STR: EXECUTION_DATE_MINUS_WEEK_FMT}
    result = execute_pipeline(
        pipeline=make_dagster_pipeline_from_airflow_dag(dag, tags),
        instance=instance,
    )

    check_compute_logs(manager, result, EXECUTION_DATE_MINUS_WEEK_FMT)
def test_pipelines_success(file_path, run_config_path):
    """Run ``iris_pipeline`` from ``file_path`` inside the data-science snippets directory.

    ``run_config_path`` is optional: when it is falsy no run config is loaded
    and ``None`` is passed to ``execute_pipeline``. The run must succeed.
    """
    snippets_dir = file_relative_path(__file__, '../../../docs_snippets/legacy/data_science/')

    with pushd(snippets_dir):
        instance = DagsterInstance.local_temp()

        if run_config_path:
            run_config = load_yaml_from_path(run_config_path)
        else:
            run_config = None

        recon_pipeline = ReconstructablePipeline.for_file(file_path, 'iris_pipeline')

        pipeline_result = execute_pipeline(
            recon_pipeline, run_config=run_config, instance=instance
        )
        assert pipeline_result.success
def test_sync_run_launcher_from_configurable_class():
    """Check that the run launcher override yields a ``SyncInMemoryRunLauncher``."""
    launcher_override = {
        "module": "dagster.core.launcher.sync_in_memory_run_launcher",
        "class": "SyncInMemoryRunLauncher",
    }

    with seven.TemporaryDirectory() as temp_dir:
        instance_no_hijack = DagsterInstance.local_temp(
            temp_dir, overrides={"run_launcher": launcher_override}
        )

        launcher = instance_no_hijack.run_launcher
        assert isinstance(launcher, SyncInMemoryRunLauncher)
def test_priorities_mp():
    """Run ``priority_test`` with one concurrent process and check the step order.

    With ``max_concurrent`` set to 1, the step success events are expected in
    the order ``high, high_2, none, none_2, low, low_2``.
    """
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'priority_test')
    pipe = handle.build_pipeline_definition()

    single_process_config = {
        'execution': {'multiprocess': {'config': {'max_concurrent': 1}}},
        'storage': {'filesystem': {}},
    }
    # The config is deliberately passed positionally, as the second argument.
    result = execute_pipeline(
        pipe,
        single_process_config,
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    succeeded_solids = [
        str(event.solid_handle) for event in result.step_event_list if event.is_step_success
    ]
    assert succeeded_solids == ['high', 'high_2', 'none', 'none_2', 'low', 'low_2']
def test_pipelines_success(file_path, run_config_path):
    """Run only the ``k_means_iris`` solid of ``iris_pipeline`` from ``file_path``.

    Execution happens inside the data-science snippets directory.
    ``run_config_path`` is optional: when it is falsy no run config is loaded
    and ``None`` is passed to ``execute_pipeline``. The run must succeed.
    """
    snippets_dir = file_relative_path(__file__, "../../../docs_snippets/legacy/data_science/")

    with pushd(snippets_dir):
        instance = DagsterInstance.local_temp()

        if run_config_path:
            run_config = load_yaml_from_path(run_config_path)
        else:
            run_config = None

        recon_pipeline = ReconstructablePipeline.for_file(file_path, "iris_pipeline")

        pipeline_result = execute_pipeline(
            recon_pipeline,
            run_config=run_config,
            instance=instance,
            solid_selection=["k_means_iris"],  # skip download_file in tests
        )
        assert pipeline_result.success
def test_yielded_results_config():
    """Run ``hello_world_pipeline`` in ``basic`` mode and verify its validation output.

    Checks that the ``reyielder`` solid reports a 100 percent success rate and
    that ``ge_validation_solid`` produced exactly one successful expectation
    result.
    """
    ge_root_dir = file_relative_path(__file__, "./great_expectations")
    run_config = {
        "resources": {"ge_data_context": {"config": {"ge_root_dir": ge_root_dir}}}
    }

    result = execute_pipeline(
        reconstructable(hello_world_pipeline),
        run_config=run_config,
        mode="basic",
        instance=DagsterInstance.local_temp(),
    )

    reyielded = result.result_for_solid("reyielder").output_value()
    assert reyielded[0]["success_percent"] == 100

    validation_result = result.result_for_solid("ge_validation_solid")
    expectations = validation_result.expectation_results_during_compute
    assert len(expectations) == 1

    mainexpect = expectations[0]
    assert mainexpect.success
def test_execute_run_bad_state():
    """Expect ``execute_run`` to reject a run that is already in ``SUCCESS`` state.

    A run is created for the pipeline and then copied with status ``SUCCESS``;
    executing it must raise ``check.CheckError`` with a message naming the run
    id and the expected ``NOT_STARTED`` status.
    """
    records = []

    def event_callback(record):
        assert isinstance(record, EventRecord)
        records.append(record)

    instance = DagsterInstance.local_temp()

    mode_def = ModeDefinition(
        resource_defs={"a": resource_a, "b": resource_b},
        logger_defs={"callback": construct_event_logger(event_callback)},
    )
    pipeline_def = PipelineDefinition(
        name="basic_resource_pipeline",
        solid_defs=[resource_solid],
        mode_defs=[mode_def],
    )

    fresh_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        run_config={"loggers": {"callback": {}}},
        mode="default",
    )
    pipeline_run = fresh_run.with_status(PipelineRunStatus.SUCCESS)

    # Regex pattern: the parentheses around the run id are escaped.
    expected_error = (
        r"Pipeline run basic_resource_pipeline \({}\) in state"
        r" PipelineRunStatus.SUCCESS, expected PipelineRunStatus.NOT_STARTED"
    ).format(pipeline_run.run_id)

    with pytest.raises(check.CheckError, match=expected_error):
        execute_run(InMemoryPipeline(pipeline_def), pipeline_run, instance=instance)
def execute_multiprocessing():
    """Execute ``parallel_pipeline`` with the multiprocess executor on a temporary instance."""
    from dagster import DagsterInstance, reconstructable

    run_config = {
        # This section controls how the run will be executed.
        # The multiprocess executor runs each solid in its own sub process.
        "execution": {"multiprocess": {}},
    }

    execute_pipeline(
        # A ReconstructablePipeline is necessary to load the pipeline in child processes.
        # reconstructable() is a utility function that captures where the
        # PipelineDefinition came from.
        reconstructable(parallel_pipeline),
        run_config=run_config,
        # The default instance for this API is an in memory ephemeral one.
        # To allow the multiple processes to coordinate we use one here
        # backed by a temporary directory.
        instance=DagsterInstance.local_temp(),
    )
def test_multiple_outputs_only_emit_one_multiproc():
    """Run the ``define_multi_out`` pipeline with the multiprocess executor.

    Verifies that the ``multiple_outputs`` solid only exposes ``output_one``,
    that asking for an undefined or un-emitted output (or an unknown solid)
    raises ``DagsterInvariantViolationError``, and that ``downstream_two`` is
    reported as skipped.
    """
    pipe = reconstructable(define_multi_out)
    multiprocess_config = {
        "storage": {"filesystem": {}},
        "execution": {"multiprocess": {}},
    }

    result = execute_pipeline(
        pipe,
        run_config=multiprocess_config,
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    solid_result = result.result_for_solid("multiple_outputs")
    assert set(solid_result.output_values.keys()) == {"output_one"}

    # An output name the solid never declared.
    with pytest.raises(
        DagsterInvariantViolationError,
        match="Output 'not_defined' not defined in solid 'multiple_outputs'",
    ):
        solid_result.output_value("not_defined")

    # A declared output that was not emitted during this run.
    with pytest.raises(DagsterInvariantViolationError, match="Did not find result output_two"):
        solid_result.output_value("output_two")

    # Escaped so the message is matched literally rather than as a regex.
    unknown_solid_pattern = re.escape(
        "Tried to get result for solid 'not_present' in "
        "'multiple_outputs_only_emit_one_pipeline'. No such top level solid."
    )
    with pytest.raises(DagsterInvariantViolationError, match=unknown_solid_pattern):
        result.result_for_solid("not_present")

    assert result.result_for_solid("downstream_two").skipped
def test_multiple_outputs_only_emit_one_multiproc():
    """Run the ``define_multi_out`` pipeline with the multiprocess executor.

    Verifies that the ``multiple_outputs`` solid only exposes ``output_one``,
    that asking for an undefined or un-emitted output (or an unknown solid)
    raises ``DagsterInvariantViolationError``, and that ``downstream_two`` is
    reported as skipped.
    """
    pipe = reconstructable(define_multi_out)
    multiprocess_config = {
        'storage': {'filesystem': {}},
        'execution': {'multiprocess': {}},
    }

    result = execute_pipeline(
        pipe,
        environment_dict=multiprocess_config,
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.output_values.keys()) == {'output_one'}

    # An output name the solid never declared.
    with pytest.raises(
        DagsterInvariantViolationError,
        match="Output 'not_defined' not defined in solid 'multiple_outputs'",
    ):
        solid_result.output_value('not_defined')

    # A declared output that was not emitted during this run.
    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.output_value('output_two')

    # Escaped so the message is matched literally rather than as a regex.
    unknown_solid_pattern = re.escape(
        'Tried to get result for solid \'not_present\' in '
        '\'multiple_outputs_only_emit_one_pipeline\'. No such top level solid.'
    )
    with pytest.raises(DagsterInvariantViolationError, match=unknown_solid_pattern):
        result.result_for_solid('not_present')

    assert result.result_for_solid('downstream_two').skipped
def test_multiproc_markers():
    """Check that every engine-event start marker has a matching, later end marker.

    The ``define_pipeline`` pipeline is run with the multiprocess executor and
    filesystem storage. All logged events for the run are scanned; engine
    events that carry ``marker_start`` / ``marker_end`` are recorded by
    ``"<step_key>.<marker>"`` with their timestamps. The two key sets must be
    equal, each end timestamp must be strictly greater than its start, and the
    ``ping.compute.multiprocess_subprocess_init`` marker must be present.
    """
    instance = DagsterInstance.local_temp()

    result = execute_pipeline(
        reconstructable(define_pipeline),
        instance=instance,
        run_config={
            "execution": {"multiprocess": {}},
            "storage": {"filesystem": {}},
        },
    )
    assert result.success

    start_markers = {}
    end_markers = {}
    for event in instance.all_logs(result.run_id):
        dagster_event = event.dagster_event
        # Skip records that have no attached dagster event (if any) rather than
        # failing with AttributeError; only engine events carry markers.
        if dagster_event is None or not dagster_event.is_engine_event:
            continue

        engine_data = dagster_event.engine_event_data
        if engine_data.marker_start:
            key = "{step}.{marker}".format(step=event.step_key, marker=engine_data.marker_start)
            start_markers[key] = event.timestamp
        if engine_data.marker_end:
            key = "{step}.{marker}".format(step=event.step_key, marker=engine_data.marker_end)
            end_markers[key] = event.timestamp

    assert set(start_markers.keys()) == set(end_markers.keys())
    for key in end_markers:
        assert end_markers[key] - start_markers[key] > 0

    assert "ping.compute.multiprocess_subprocess_init" in end_markers