def test_pipelines_success(file_path, run_config_path):
    with pushd(
        file_relative_path(__file__, '../../../docs_snippets/legacy/data_science/')
    ):
        instance = DagsterInstance.local_temp()
        run_config = load_yaml_from_path(run_config_path) if run_config_path else None
        recon_pipeline = ReconstructablePipeline.for_file(file_path, 'iris_pipeline')

        pipeline_result = execute_pipeline(
            recon_pipeline, run_config=run_config, instance=instance
        )
        assert pipeline_result.success

@contextmanager  # assumed: the try/yield/finally shape implies context-manager usage
def exec_for_test(fn_name, env=None, raise_on_error=True, **kwargs):
    result = None
    recon_pipeline = ReconstructablePipeline.for_module(
        "dagstermill.examples.repository", fn_name
    )
    try:
        result = execute_pipeline(
            recon_pipeline,
            env,
            instance=DagsterInstance.local_temp(),
            raise_on_error=raise_on_error,
            **kwargs,
        )
        yield result
    finally:
        if result:
            cleanup_result_notebook(result)

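# Usage sketch (hypothetical, not from the source): drive the helper above as a
# context manager. "hello_world" is an assumed example name from
# dagstermill.examples.repository.
def _example_exec_for_test_usage():
    with exec_for_test("hello_world") as result:
        assert result.success
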
def test_engine_error():
    with pytest.raises(DagsterSubprocessError):
        with seven.TemporaryDirectory() as tempdir:
            storage = os.path.join(tempdir, "flakey_storage")
            execute_pipeline(
                ReconstructablePipeline.for_file(REPO_FILE, "engine_error"),
                run_config={
                    "storage": {"filesystem": {"config": {"base_dir": storage}}},
                    "execution": {
                        "celery": {"config": {"config_source": {"task_always_eager": True}}}
                    },
                    "solids": {"destroy": {"config": storage}},
                },
                instance=DagsterInstance.local_temp(tempdir=tempdir),
            )

def test_yield_unserializable_result():
    manager = Manager()
    assert manager.yield_result(threading.Lock())

    with in_pipeline_manager(
        pipeline_name="hello_world_output_pipeline",
        solid_handle=NodeHandle("hello_world_output", None),
        executable_dict=ReconstructablePipeline.for_module(
            "dagstermill.examples.repository",
            "hello_world_output_pipeline",
        ).to_dict(),
        step_key="hello_world_output",
    ) as manager:
        with pytest.raises(TypeError):
            manager.yield_result(threading.Lock())

def test_dask_terminate():
    run_config = {
        "solids": {
            "sleepy_dask_solid": {
                "inputs": {"df": {"csv": {"path": file_relative_path(__file__, "ex*.csv")}}}
            }
        }
    }

    interrupt_thread = None
    result_types = []
    received_interrupt = False

    with instance_for_test() as instance:
        try:
            for result in execute_pipeline_iterator(
                pipeline=ReconstructablePipeline.for_file(__file__, sleepy_dask_pipeline.name),
                run_config=run_config,
                instance=instance,
            ):
                # Interrupt once the first step starts
                if result.event_type == DagsterEventType.STEP_START and not interrupt_thread:
                    interrupt_thread = Thread(target=send_interrupt, args=())
                    interrupt_thread.start()

                if result.event_type == DagsterEventType.STEP_FAILURE:
                    assert (
                        "DagsterExecutionInterruptedError"
                        in result.event_specific_data.error.message
                    )

                result_types.append(result.event_type)

            # The iterator should have raised before running to completion
            assert False
        except DagsterExecutionInterruptedError:
            received_interrupt = True

        assert received_interrupt
        interrupt_thread.join()

        assert DagsterEventType.STEP_FAILURE in result_types
        assert DagsterEventType.PIPELINE_FAILURE in result_types

def step_context_to_step_run_ref(step_context, prior_attempts_count, package_dir=None):
    '''
    Args:
        step_context (SystemStepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the
            same pipeline run.
        package_dir (Optional[str]): If set, the pipeline's file code pointer will be converted
            to a module code pointer relative to the package root. This enables executing steps
            in remote setups where the package containing the pipeline resides at a different
            location on the filesystem in the remote environment than in the environment
            executing the plan process.

    Returns (StepRunRef):
        A reference to the step.
    '''
    check.inst_param(step_context, 'step_context', SystemStepExecutionContext)
    check.int_param(prior_attempts_count, 'prior_attempts_count')

    retries = step_context.retries

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                    ),
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retries=retries,
        recon_pipeline=recon_pipeline,
        prior_attempts_count=prior_attempts_count,
    )

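# Hypothetical usage sketch (not from the source): a step launcher would typically
# build the ref and ship it to a remote worker, e.g. by pickling it to a file the
# worker can read. `step_context` and the target path are assumed to exist in the
# caller's scope.
import pickle

def _ship_step_run_ref(step_context, path):
    step_run_ref = step_context_to_step_run_ref(step_context, prior_attempts_count=0)
    with open(path, 'wb') as f:
        pickle.dump(step_run_ref, f)
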
def test_pipelines_success(file_path, run_config_path):
    with pushd(file_relative_path(__file__, "../../../docs_snippets/legacy/data_science/")):
        with instance_for_test() as instance:
            run_config = load_yaml_from_path(run_config_path) if run_config_path else {}
            recon_pipeline = ReconstructablePipeline.for_file(file_path, "iris_pipeline")

            with tempfile.TemporaryDirectory() as temp_dir:
                run_config["resources"] = {"io_manager": {"config": {"base_dir": temp_dir}}}
                pipeline_result = execute_pipeline(
                    recon_pipeline,
                    run_config=run_config,
                    instance=instance,
                    solid_selection=["k_means_iris"],  # skip download_file in tests
                )
                assert pipeline_result.success

@contextmanager  # assumed: the tests above use this helper in a `with` block
def in_pipeline_manager(
    pipeline_name="hello_world_pipeline",
    solid_handle=NodeHandle("hello_world", None),
    step_key="hello_world",
    executable_dict=None,
    mode=None,
    **kwargs,
):
    manager = Manager()

    run_id = make_new_run_id()
    with instance_for_test() as instance:
        marshal_dir = tempfile.mkdtemp()

        if not executable_dict:
            executable_dict = ReconstructablePipeline.for_module(
                "dagstermill.examples.repository", "hello_world_pipeline"
            ).to_dict()

        pipeline_run_dict = pack_value(
            PipelineRun(
                pipeline_name=pipeline_name,
                run_id=run_id,
                mode=mode or "default",
                run_config=None,
                step_keys_to_execute=None,
                status=PipelineRunStatus.NOT_STARTED,
            )
        )

        try:
            with safe_tempfile_path() as output_log_file_path:
                context_dict = {
                    "pipeline_run_dict": pipeline_run_dict,
                    "solid_handle_kwargs": solid_handle._asdict(),
                    "executable_dict": executable_dict,
                    "marshal_dir": marshal_dir,
                    "run_config": {},
                    "output_log_path": output_log_file_path,
                    "instance_ref_dict": pack_value(instance.get_ref()),
                    "step_key": step_key,
                }

                manager.reconstitute_pipeline_context(**dict(context_dict, **kwargs))
                yield manager
        finally:
            shutil.rmtree(marshal_dir)

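# Usage sketch (mirrors test_yield_unserializable_result above): extra kwargs override
# entries in context_dict, so a caller can, e.g., swap in its own run_config. The
# loggers config shown here is illustrative, not from the source.
def _example_in_pipeline_manager_usage():
    with in_pipeline_manager(run_config={"loggers": {"console": {}}}) as manager:
        assert manager.context is not None
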
def _execute_plan(_self, instance_ref_dict, executable_dict, run_id, step_keys, retries_dict):
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.dict_param(executable_dict, 'executable_dict')
    check.str_param(run_id, 'run_id')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.dict_param(retries_dict, 'retries_dict')

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retries = Retries.from_config(retries_dict)

    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_keys_str = ", ".join(step_keys)

    execution_plan = create_execution_plan(
        pipeline,
        pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    ).build_subset_plan(step_keys)

    engine_event = instance.report_engine_event(
        'Executing steps {} in celery worker'.format(step_keys_str),
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text(step_keys_str, 'step_keys')],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryEngine,
        step_key=execution_plan.step_key_for_single_step_plans(),
    )

    events = [engine_event]
    for step_event in execute_plan_iterator(
        execution_plan,
        pipeline_run=pipeline_run,
        run_config=pipeline_run.run_config,
        instance=instance,
        retries=retries,
    ):
        events.append(step_event)

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events

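# Hypothetical caller-side sketch (not from the source): the task returns events as
# serialized JSON strings, which the invoking process can rehydrate with the serdes
# helper that pairs with serialize_dagster_namedtuple above.
def _rehydrate_events(serialized_events):
    from dagster.serdes import deserialize_json_to_dagster_namedtuple

    return [deserialize_json_to_dagster_namedtuple(e) for e in serialized_events]
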
@contextmanager  # assumed: yields the result for use in a `with` block
def execute_pipeline_on_celery(
    pipeline_name, instance=None, run_config=None, tempdir=None, tags=None, subset=None
):
    with tempdir_wrapper(tempdir) as tempdir:
        pipeline_def = ReconstructablePipeline.for_file(
            REPO_FILE, pipeline_name
        ).subset_for_execution(subset)
        instance = instance or DagsterInstance.local_temp(tempdir=tempdir)
        run_config = run_config or {
            "storage": {"filesystem": {"config": {"base_dir": tempdir}}},
            "execution": {"celery": {}},
        }
        result = execute_pipeline(
            pipeline_def,
            run_config=run_config,
            instance=instance,
            tags=tags,
        )
        yield result

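# Usage sketch (hypothetical; "test_pipeline" is an assumed pipeline name in REPO_FILE):
def _example_execute_on_celery():
    with execute_pipeline_on_celery("test_pipeline") as result:
        assert result.success
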
def test_pipelines_success(file_path, run_config_path):
    with pushd(
        file_relative_path(__file__, "../../../docs_snippets/legacy/data_science/")
    ):
        instance = DagsterInstance.local_temp()
        run_config = load_yaml_from_path(run_config_path) if run_config_path else None
        recon_pipeline = ReconstructablePipeline.for_file(file_path, "iris_pipeline")

        pipeline_result = execute_pipeline(
            recon_pipeline,
            run_config=run_config,
            instance=instance,
            solid_selection=["k_means_iris"],  # skip download_file in tests
        )
        assert pipeline_result.success

@contextmanager  # assumed: yields the result for use in a `with` block
def execute_pipeline_on_celery(pipeline_name):
    with seven.TemporaryDirectory() as tempdir:
        result = execute_pipeline(
            ReconstructablePipeline(FileCodePointer(__file__, pipeline_name)),
            environment_dict={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {'celery': {}},
            },
            instance=DagsterInstance.local_temp(tempdir=tempdir),
        )
        yield result

@contextmanager  # assumed: used as a context manager by the dagstermill tests
def in_pipeline_manager(
    pipeline_name='hello_world_pipeline',
    solid_handle=SolidHandle('hello_world', None),
    executable_dict=None,
    mode=None,
    **kwargs
):
    manager = Manager()

    run_id = make_new_run_id()
    instance = DagsterInstance.local_temp()
    marshal_dir = tempfile.mkdtemp()

    if not executable_dict:
        executable_dict = ReconstructablePipeline.for_module(
            'dagstermill.examples.repository', 'define_hello_world_pipeline'
        ).to_dict()

    pipeline_run_dict = pack_value(
        PipelineRun(
            pipeline_name=pipeline_name,
            run_id=run_id,
            mode=mode or 'default',
            run_config=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )

    try:
        with safe_tempfile_path() as output_log_file_path:
            context_dict = {
                'pipeline_run_dict': pipeline_run_dict,
                'solid_handle_kwargs': solid_handle._asdict(),
                'executable_dict': executable_dict,
                'marshal_dir': marshal_dir,
                'run_config': {},
                'output_log_path': output_log_file_path,
                'instance_ref_dict': pack_value(instance.get_ref()),
            }

            manager.reconstitute_pipeline_context(**dict(context_dict, **kwargs))
            yield manager
    finally:
        shutil.rmtree(marshal_dir)

def test_dask():
    run_config = {
        "solids": {
            "dask_solid": {
                "inputs": {
                    "df": {"read": {"csv": {"path": file_relative_path(__file__, "ex*.csv")}}}
                }
            }
        }
    }

    with instance_for_test() as instance:
        result = execute_pipeline(
            ReconstructablePipeline.for_file(__file__, dask_pipeline.name),
            run_config={
                "intermediate_storage": {"filesystem": {}},
                "execution": {"dask": {"config": {"cluster": {"local": {"timeout": 30}}}}},
                **run_config,
            },
            instance=instance,
        )

        assert result.success

def test_in_pipeline_manager_with_resources():
    with tempfile.NamedTemporaryFile() as fd:
        path = fd.name

    try:
        with in_pipeline_manager(
            pipeline_name="resource_pipeline",
            executable_dict=ReconstructablePipeline.for_module(
                "dagstermill.examples.repository",
                "resource_pipeline",
            ).to_dict(),
            solid_handle=NodeHandle("hello_world_resource", None),
            run_config={"resources": {"list": {"config": path}}},
            mode="prod",
        ) as manager:
            assert "list" in manager.context.resources._asdict()

            with open(path, "rb") as fd:
                messages = pickle.load(fd)

            messages = [message.split(": ") for message in messages]

            assert len(messages) == 1
            assert messages[0][1] == "Opened"

            manager.teardown_resources()

            with open(path, "rb") as fd:
                messages = pickle.load(fd)

            messages = [message.split(": ") for message in messages]

            assert len(messages) == 2
            assert messages[1][1] == "Closed"
    finally:
        if os.path.exists(path):
            os.unlink(path)

def test_in_pipeline_manager_with_resources():
    with tempfile.NamedTemporaryFile() as fd:
        path = fd.name

    try:
        with in_pipeline_manager(
            pipeline_name='resource_pipeline',
            executable_dict=ReconstructablePipeline.for_module(
                'dagstermill.examples.repository',
                'define_resource_pipeline',
            ).to_dict(),
            solid_handle=SolidHandle('hello_world_resource', None),
            run_config={'resources': {'list': {'config': path}}},
            mode='prod',
        ) as manager:
            assert len(manager.context.resources._asdict()) == 1

            with open(path, 'rb') as fd:
                messages = pickle.load(fd)

            messages = [message.split(': ') for message in messages]

            assert len(messages) == 1
            assert messages[0][1] == 'Opened'

            manager.teardown_resources()

            with open(path, 'rb') as fd:
                messages = pickle.load(fd)

            messages = [message.split(': ') for message in messages]

            assert len(messages) == 2
            assert messages[1][1] == 'Closed'
    finally:
        if os.path.exists(path):
            os.unlink(path)

def test_pandas_dask():
    environment_dict = {
        'solids': {
            'pandas_solid': {
                'inputs': {'df': {'csv': {'path': file_relative_path(__file__, 'ex.csv')}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline(FileCodePointer(__file__, pandas_pipeline.name)),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'timeout': 30}}},
            **environment_dict,
        },
        instance=DagsterInstance.local_temp(),
    )

    assert result.success

def test_pandas_dask():
    run_config = {
        "solids": {
            "pandas_solid": {
                "inputs": {"df": {"csv": {"path": file_relative_path(__file__, "ex.csv")}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline.for_file(__file__, pandas_pipeline.name),
        run_config={
            "storage": {"filesystem": {}},
            "execution": {"dask": {"config": {"cluster": {"local": {"timeout": 30}}}}},
            **run_config,
        },
        instance=DagsterInstance.local_temp(),
    )

    assert result.success

def execute_pipeline_on_celery(tempdir, pipeline_name, tags=None):
    pipe = ReconstructablePipeline(FileCodePointer(__file__, pipeline_name))
    instance = DagsterInstance.local_temp(tempdir=tempdir)
    return execute_pipeline(
        pipe,
        environment_dict={
            'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
            'execution': {'celery': {}},
        },
        instance=instance,
        tags=tags,
    )

def test_pandas_dask():
    run_config = {
        'solids': {
            'pandas_solid': {
                'inputs': {'df': {'csv': {'path': file_relative_path(__file__, 'ex.csv')}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline.for_file(__file__, pandas_pipeline.name),
        run_config={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'cluster': {'local': {'timeout': 30}}}}},
            **run_config,
        },
        instance=DagsterInstance.local_temp(),
    )

    assert result.success

def execute_pipeline_on_celery(tempdir, pipeline_name, tags=None):
    pipe = ReconstructablePipeline.for_file(__file__, pipeline_name)
    instance = DagsterInstance.local_temp(tempdir=tempdir)
    return execute_pipeline(
        pipe,
        run_config={
            'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
            'execution': {'celery': {}},
        },
        instance=instance,
        tags=tags,
    )

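# Usage sketch (hypothetical): unlike the yielding variants above, this helper returns
# the result directly, so no `with` block is needed. The tempdir and pipeline name
# are illustrative.
def _example_execute_on_celery_with_tags(tmpdir):
    result = execute_pipeline_on_celery(str(tmpdir), 'test_pipeline', tags={'foo': 'bar'})
    assert result.success
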
def test_engine_error():
    with seven.mock.patch(
        "dagster.core.execution.context.system.SystemExecutionContextData.raise_on_error",
        return_value=True,
    ):
        with pytest.raises(DagsterSubprocessError):
            with tempfile.TemporaryDirectory() as tempdir:
                with instance_for_test_tempdir(tempdir) as instance:
                    storage = os.path.join(tempdir, "flakey_storage")
                    execute_pipeline(
                        ReconstructablePipeline.for_file(REPO_FILE, "engine_error"),
                        run_config={
                            "intermediate_storage": {
                                "filesystem": {"config": {"base_dir": storage}}
                            },
                            "execution": {
                                "celery": {
                                    "config": {"config_source": {"task_always_eager": True}}
                                }
                            },
                            "solids": {"destroy": {"config": storage}},
                        },
                        instance=instance,
                    )

def test_cli_execute():
    # Paths in env files are currently interpreted relative to the cwd of the
    # launching script, so simulate launching from the repo root.
    cwd = os.getcwd()
    try:
        os.chdir(file_relative_path(__file__, '../..'))
        do_execute_command(
            pipeline=ReconstructablePipeline.for_module(
                'dagster_pandas.examples.pandas_hello_world.pipeline', 'pandas_hello_world'
            ),
            env_file_list=[
                file_relative_path(
                    __file__, '../../dagster_pandas/examples/pandas_hello_world/*.yaml'
                )
            ],
        )
    finally:
        # restore cwd
        os.chdir(cwd)

@contextmanager  # assumed: yields the result for use in a `with` block
def execute_pipeline_on_celery(pipeline_name):
    with seven.TemporaryDirectory() as tempdir:
        pipeline_def = ReconstructablePipeline.for_file(__file__, pipeline_name)
        instance = DagsterInstance.local_temp(tempdir=tempdir)
        result = execute_pipeline(
            pipeline_def,
            run_config={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {'celery': {}},
            },
            instance=instance,
        )
        yield result

def test_execute_execute_plan_mutation_raw():
    pipeline_name = 'sleepy_pipeline'
    pipeline = ReconstructablePipeline.for_module('dagster_examples.toys.sleepy', pipeline_name)

    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run_for_pipeline(pipeline_def=pipeline.get_definition())
    variables = {
        'executionParams': {
            'runConfigData': {},
            'mode': 'default',
            'selector': {'name': pipeline_name},
            'executionMetadata': {'runId': pipeline_run.run_id},
        }
    }
    result = execute_execute_plan_mutation_raw(
        pipeline.get_reconstructable_repository(), variables, instance_ref=instance.get_ref()
    )
    seen_events = set()
    for event in result:
        seen_events.add((event.dagster_event.event_type_value, event.step_key))

    assert seen_events == EXPECTED_EVENTS

def test_papermill_pandas_hello_world_pipeline():
    pipeline = ReconstructablePipeline.for_module(
        "dagster_pandas.examples", "papermill_pandas_hello_world_pipeline"
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        with instance_for_test() as instance:
            pipeline_result = execute_pipeline(
                pipeline,
                {
                    "solids": {
                        "papermill_pandas_hello_world": {
                            "inputs": {
                                "df": {
                                    "csv": {"path": file_relative_path(__file__, "num_prod.csv")}
                                }
                            },
                        }
                    },
                    "resources": {"io_manager": {"config": {"base_dir": temp_dir}}},
                },
                instance=instance,
            )
            assert pipeline_result.success

            solid_result = pipeline_result.result_for_solid("papermill_pandas_hello_world")
            expected = pd.read_csv(file_relative_path(__file__, "num_prod.csv")) + 1
            assert solid_result.output_value().equals(expected)

def test_execute_pipeline():
    environment = {
        'solids': {
            'sum_solid': {
                'inputs': {'num': {'csv': {'path': file_relative_path(__file__, 'num.csv')}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline.for_module(
            'dagster_pandas.examples.pandas_hello_world.pipeline', 'pandas_hello_world'
        ),
        run_config=environment,
    )

    assert result.success

    assert result.result_for_solid('sum_solid').output_value().to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
    }

    assert result.result_for_solid('sum_sq_solid').output_value().to_dict('list') == {
        'num1': [1, 3],
        'num2': [2, 4],
        'sum': [3, 7],
        'sum_sq': [9, 49],
    }

def test_execute_pipeline():
    environment = {
        "solids": {
            "sum_solid": {
                "inputs": {"num": {"csv": {"path": file_relative_path(__file__, "num.csv")}}}
            }
        }
    }

    result = execute_pipeline(
        ReconstructablePipeline.for_module(
            "dagster_pandas.examples.pandas_hello_world.pipeline", "pandas_hello_world"
        ),
        run_config=environment,
    )

    assert result.success

    assert result.result_for_solid("sum_solid").output_value().to_dict("list") == {
        "num1": [1, 3],
        "num2": [2, 4],
        "sum": [3, 7],
    }

    assert result.result_for_solid("sum_sq_solid").output_value().to_dict("list") == {
        "num1": [1, 3],
        "num2": [2, 4],
        "sum": [3, 7],
        "sum_sq": [9, 49],
    }

def test_event_callback_logging():
    events = defaultdict(list)

    def _event_callback(record):
        assert isinstance(record, EventLogEntry)
        if record.is_dagster_event:
            events[record.dagster_event.event_type].append(record)

    pipeline = ReconstructablePipeline.for_module(
        "dagstermill.examples.repository",
        "hello_logging_pipeline",
    )
    pipeline_def = pipeline.get_definition()
    with instance_for_test() as instance:
        pipeline_run = instance.create_run_for_pipeline(pipeline_def)
        instance.watch_event_logs(pipeline_run.run_id, -1, _event_callback)

        res = execute_run(pipeline, pipeline_run, instance)
        assert res.success

        passed_before_timeout = False
        retries = 5
        while retries > 0:
            time.sleep(0.333)
            if DagsterEventType.PIPELINE_FAILURE in events.keys():
                break
            if DagsterEventType.PIPELINE_SUCCESS in events.keys():
                passed_before_timeout = True
                break
            retries -= 1

        assert passed_before_timeout

def test_cli_execute():
    # Paths in env files are currently interpreted relative to the cwd of the
    # launching script, so simulate launching from the repo root.
    cwd = os.getcwd()
    try:
        os.chdir(file_relative_path(__file__, "../.."))
        do_execute_command(
            pipeline=ReconstructablePipeline.for_module(
                "dagster_pandas.examples.pandas_hello_world.pipeline", "pandas_hello_world"
            ),
            instance=DagsterInstance.get(),
            config=[
                file_relative_path(
                    __file__, "../../dagster_pandas/examples/pandas_hello_world/*.yaml"
                )
            ],
        )
    finally:
        # restore cwd
        os.chdir(cwd)