    return result_df  # trailing line of an upstream solid whose definition is not part of this snippet


@solid
def train_model(context, df):
    context.log.info("{}".format(df))
    model = train(df)
    return model


@pipeline(
    mode_defs=[
        ModeDefinition("test", resource_defs={"asset_store": fs_asset_store}),
        ModeDefinition("local", resource_defs={"asset_store": local_asset_store}),
    ],
)
def asset_store_pipeline():
    train_model(parse_df(call_api()))


@repository
def asset_store_pipeline_repo():
    return [asset_store_pipeline]


if __name__ == "__main__":
    instance = DagsterInstance.ephemeral()
    result = execute_pipeline(asset_store_pipeline, mode="local", instance=instance)
def test_s3_io_manager_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()
    run_config = {"resources": {"io_manager": {"config": {"s3_bucket": mock_s3_bucket.name}}}}
    run_id = make_new_run_id()
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    # Execute only the return_one step and confirm it emitted a step output event.
    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    # Read the pickled output back through the io manager and check its value.
    io_manager = PickledObjectS3IOManager(
        mock_s3_bucket.name, construct_s3_client(max_attempts=5), s3_prefix="dagster"
    )
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        config={},
        metadata={},
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            metadata={},
            mapping_key=None,
            config=None,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
    )
    assert io_manager.load_input(context) == 1

    # Execute the downstream add_one step, which loads return_one's output from S3.
    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        config={},
        metadata={},
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            metadata={},
            mapping_key=None,
            config=None,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
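# Note: the io-manager tests in this file rely on helpers that are not shown in these
# snippets, notably define_inty_pipeline() and get_step_output(). The sketch below is a
# guess at their shape, assuming the "return_one"/"add_one" step keys and the
# "io_manager" resource key used in the run configs; the real helpers may wire resources
# differently (the GCS/ADLS2 variants use their own pickle io managers).
from dagster import ModeDefinition, pipeline, solid
from dagster_aws.s3 import s3_pickle_io_manager, s3_resource


@solid
def return_one(_):
    return 1


@solid
def add_one(_, num):
    return num + 1


def define_inty_pipeline():
    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={"io_manager": s3_pickle_io_manager, "s3": s3_resource}
            )
        ]
    )
    def inty_pipeline():
        add_one(return_one())

    return inty_pipeline


def get_step_output(step_events, step_key):
    # Return the STEP_OUTPUT event emitted for the given step key, or None.
    for event in step_events:
        if event.event_type_value == "STEP_OUTPUT" and event.step_key == step_key:
            return event
    return None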
def define_test_snapshot_context():
    return DagsterGraphQLContext(
        instance=DagsterInstance.ephemeral(),
        locations=[InProcessRepositoryLocation(create_main_recon_repo())],
    )
def instance():
    return DagsterInstance.ephemeral()
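# If the helper above is registered as a pytest fixture (an assumption; no decorator is
# shown in the snippet), a test can request it by name and get a throwaway in-memory
# instance, so run and event storage never touch the local Dagster home directory.
# Minimal usage sketch; the pipeline here is only illustrative.
import pytest

from dagster import DagsterInstance, execute_pipeline, pipeline, solid


@pytest.fixture(name="instance")
def instance_fixture():
    return DagsterInstance.ephemeral()


@solid
def emit_one(_):
    return 1


@pipeline
def one_pipeline():
    emit_one()


def test_runs_in_memory(instance):
    result = execute_pipeline(one_pipeline, instance=instance)
    assert result.success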
def test_adls2_pickle_io_manager_execution(storage_account, file_system, credential):
    pipeline_def = define_inty_pipeline()
    run_config = {
        "resources": {
            "io_manager": {"config": {"adls2_file_system": file_system}},
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {"key": credential},
                }
            },
        }
    }
    run_id = make_new_run_id()
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id, logging_tags={}, loggers=[]),
    )

    io_manager = PickledObjectADLS2IOManager(
        file_system=file_system,
        adls2_client=create_adls2_client(storage_account, credential),
        blob_client=create_blob_client(storage_account, credential),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            pipeline_run=pipeline_run,
            run_config=run_config,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id, logging_tags={}, loggers=[]),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
def test_fan_in_should_skip_step():
    @lambda_solid
    def one():
        return 1

    @solid(output_defs=[OutputDefinition(is_required=False)])
    def skip(_):
        return
        yield  # pylint: disable=unreachable

    @solid
    def fan_in(_context, items):
        return items

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_all_upstream_skip():
        return fan_in([skip(), skip()])

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_one_upstream_skip():
        return fan_in([one(), skip()])

    @pipeline
    def optional_outputs_composite():
        composite_all_upstream_skip()
        composite_one_upstream_skip()

    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name="optional_outputs_composite", run_id=make_new_run_id()
    )

    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_all_upstream_skip.skip",
                "composite_all_upstream_skip.skip_2",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # Skip the fan-in step when none of its upstream outputs were yielded.
    assert should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_all_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )

    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_one_upstream_skip.one",
                "composite_one_upstream_skip.skip",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # Do not skip when at least one of the upstream outputs exists.
    assert not should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_one_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )
def test_adls2_pickle_io_manager_execution(storage_account, file_system, credential):
    job = define_inty_job()
    run_config = {
        "resources": {
            "io_manager": {"config": {"adls2_file_system": file_system}},
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {"key": credential},
                }
            },
        }
    }
    run_id = make_new_run_id()
    resolved_run_config = ResolvedRunConfig.build(job, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(job), resolved_run_config)
    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=job.name, run_id=run_id, run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, job, resolved_run_config),
            pipeline=InMemoryPipeline(job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    context = build_input_context(
        upstream_output=build_output_context(
            step_key="return_one",
            name="result",
            run_id=run_id,
        )
    )

    io_manager = PickledObjectADLS2IOManager(
        file_system=file_system,
        adls2_client=create_adls2_client(storage_account, credential),
        blob_client=create_blob_client(storage_account, credential),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], job, resolved_run_config),
            pipeline=InMemoryPipeline(job),
            pipeline_run=pipeline_run,
            run_config=run_config,
            instance=instance,
        )
    )

    context = build_input_context(
        upstream_output=build_output_context(
            step_key="add_one",
            name="result",
            run_id=run_id,
            mapping_key="foo",
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
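# The job-based tests (this one and the GCS variant below) call define_inty_job(), which
# is not included in these snippets. A minimal sketch under the op/job APIs, assuming the
# "return_one"/"add_one" step keys and the "io_manager"/"adls2" resource keys seen in the
# run config above; the GCS variant would swap in gcs_pickle_io_manager and gcs_resource.
from dagster import job, op
from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource


@op
def return_one():
    return 1


@op
def add_one(num):
    return num + 1


def define_inty_job():
    @job(resource_defs={"io_manager": adls2_pickle_io_manager, "adls2": adls2_resource})
    def inty_job():
        add_one(return_one())

    return inty_job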
def test_gcs_asset_store_execution(gcs_bucket):
    pipeline_def = define_inty_pipeline()
    run_config = {"resources": {"asset_store": {"config": {"gcs_bucket": gcs_bucket}}}}
    run_id = make_new_run_id()
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")

    asset_store = PickledObjectGCSAssetStore(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("return_one"),
        run_id,
    )
    assert asset_store.get_asset(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("add_one"),
        run_id,
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    assert asset_store.get_asset(context) == 2
def test_gcs_object_manager_execution(gcs_bucket):
    pipeline_def = define_inty_pipeline()
    run_config = {"resources": {"object_manager": {"config": {"gcs_bucket": gcs_bucket}}}}
    run_id = make_new_run_id()
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    object_manager = PickledObjectGCSObjectManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
    )
    assert object_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert object_manager.load_input(context) == 2
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    inty_job = define_inty_job()
    run_config = {"resources": {"io_manager": {"config": {"gcs_bucket": gcs_bucket}}}}
    run_id = make_new_run_id()
    resolved_run_config = ResolvedRunConfig.build(inty_job, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(inty_job), resolved_run_config)
    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=inty_job.name, run_id=run_id, run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, inty_job, resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], inty_job, resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
def define_test_snapshot_context():
    return DagsterSnapshotGraphQLContext(
        instance=DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
        repository_snapshot=RepositorySnapshot.from_repository_definition(define_repository()),
    )