def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(DagsterExecutionStepNotFoundError,
                       match='Execution plan does not contain step'):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )
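
# The examples in this listing lean on a few fixtures that are not shown here.
# Below is a minimal sketch of define_addy_pipeline and env_with_fs, inferred
# from how they are called above; the names come from the tests, but the bodies
# and the exact storage key are assumptions, not the real fixtures.
from dagster import InputDefinition, Int, lambda_solid, pipeline
from dagster.utils import merge_dicts


def define_addy_pipeline():
    # add_one feeds add_two, so with num=3 the intermediates asserted above
    # are add_one == 4 and add_two == 6.
    @lambda_solid(input_defs=[InputDefinition("num", Int)])
    def add_one(num):
        return num + 1

    @lambda_solid(input_defs=[InputDefinition("num", Int)])
    def add_two(num):
        return num + 2

    @pipeline
    def addy_pipeline():
        add_two(add_one())

    return addy_pipeline


def env_with_fs(run_config):
    # Assumed: merge filesystem intermediate storage into the given run config
    # so build_fs_intermediate_storage can find the outputs afterwards (older
    # Dagster versions used {"storage": {"filesystem": {}}} instead).
    return merge_dicts(run_config, {"intermediate_storage": {"filesystem": {}}})
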
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two.compute"]),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")
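
# get_step_output_event (and the closely related get_step_output used further
# down) are test helpers assumed to scan a list of DagsterEvents and return the
# STEP_OUTPUT event for a given step key, or None when that step did not run.
# A sketch of that behavior, not the exact helper from the dagster test suite:
from dagster import DagsterEventType


def get_step_output_event(events, step_key, output_name="result"):
    for event in events:
        if (
            event.event_type == DagsterEventType.STEP_OUTPUT
            and event.step_key == step_key
            and event.step_output_data.output_name == output_name
        ):
            return event
    return None
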
def test_using_intermediates_to_override():
    pipeline = define_inty_pipeline()

    run_config = {
        "storage": {
            "filesystem": {}
        },
        "intermediate_storage": {
            "in_memory": {}
        }
    }

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"]),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))
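
# define_inty_pipeline is assumed to be a small Int pipeline whose step keys
# match the ones asserted in these examples (return_one producing 1, add_one
# producing 2). A sketch under that assumption:
from dagster import InputDefinition, Int, lambda_solid, pipeline


def define_inty_pipeline():
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition("num", Int)])
    def add_one(num):
        return num + 1

    @pipeline
    def inty_pipeline():
        add_one(return_one())

    return inty_pipeline
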
def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    run_config = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    selector = infer_pipeline_selector(graphql_context, 'csv_hello_world')

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': run_config,
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {
                    'runId': pipeline_run.run_id
                },
                'mode': 'default',
            },
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
    ]

    assert step_events[1]['stepKey'] == 'sum_solid.compute'
    assert step_events[2]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]''')

    assert step_events[3]['stepKey'] == 'sum_solid.compute'
    assert step_events[4]['stepKey'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('sum_solid.compute'))
    assert (str(
        intermediate_storage.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle('sum_solid.compute')).obj) == expected_value_repr)
def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    run_config = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": run_config,
                "stepKeys": ["sum_solid.compute"],
                "executionMetadata": {
                    "runId": pipeline_run.run_id
                },
                "mode": "default",
            },
        },
    )

    query_result = result.data["executePlan"]

    assert query_result["__typename"] == "ExecutePlanSuccess"
    assert query_result["pipeline"]["name"] == "csv_hello_world"
    assert query_result["hasFailures"] is False

    step_events = query_result["stepEvents"]

    assert [se["__typename"] for se in step_events] == [
        "ExecutionStepStartEvent",
        "ExecutionStepInputEvent",
        "ExecutionStepOutputEvent",
        "ObjectStoreOperationEvent",
        "ExecutionStepSuccessEvent",
    ]

    assert step_events[1]["stepKey"] == "sum_solid.compute"
    assert step_events[2]["outputName"] == "result"

    expected_value_repr = (
        """[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), """
        """OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]""")

    assert step_events[3]["stepKey"] == "sum_solid.compute"
    assert step_events[4]["stepKey"] == "sum_solid.compute"

    snapshot.assert_match(clean_log_messages(result.data))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert (str(
        intermediate_storage.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle("sum_solid.compute")).obj) == expected_value_repr)
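
# A quick, standalone check of the expected_value_repr strings used in the two
# tests above: sum_solid is assumed to emit a list of OrderedDicts with a 'sum'
# column equal to num1 + num2, which is exactly what the repr encodes. (The
# OrderedDict repr shown matches Python < 3.12, which is what these tests ran on.)
from collections import OrderedDict

rows = [
    OrderedDict([("num1", "1"), ("num2", "2")]),
    OrderedDict([("num1", "3"), ("num2", "4")]),
]
for row in rows:
    row["sum"] = int(row["num1"]) + int(row["num2"])

assert str(rows) == (
    "[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), "
    "OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]")
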
def test_using_intermediate_file_system_for_subplan_multiprocessing():
    with instance_for_test() as instance:

        run_config = {"intermediate_storage": {"filesystem": {}}}

        pipeline = reconstructable(define_inty_pipeline)

        environment_config = EnvironmentConfig.build(
            pipeline.get_definition(),
            run_config=run_config,
        )
        execution_plan = ExecutionPlan.build(
            pipeline,
            environment_config,
        )
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            execution_plan=execution_plan)

        assert execution_plan.get_step_by_key("return_one")

        return_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["return_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, pipeline_run.run_id)

        assert get_step_output(return_one_step_events, "return_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("return_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("return_one")).obj == 1)

        add_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["add_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        assert get_step_output(add_one_step_events, "add_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("add_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("add_one")).obj == 2)
def define_intermediate_storage(type_storage_plugin_registry=None):
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=type_storage_plugin_registry,
    )
    return run_id, instance, intermediate_storage
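
# A hedged usage example for the helper above: write one value through the
# intermediate storage and read it back. The set_intermediate signature
# (context, dagster_type, step_output_handle, value) is assumed from the
# get_intermediate calls elsewhere in this listing.
def test_intermediate_storage_round_trip():
    _run_id, _instance, intermediate_storage = define_intermediate_storage()

    intermediate_storage.set_intermediate(
        None, Int, StepOutputHandle("some_step"), 42)

    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("some_step"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("some_step")).obj == 42
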
def test_using_intermediates_file_system_for_subplan():
    pipeline = define_inty_pipeline()

    run_config = {"intermediate_storage": {"filesystem": {}}}

    instance = DagsterInstance.ephemeral()
    environment_config = EnvironmentConfig.build(
        pipeline,
        run_config=run_config,
    )

    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("return_one")).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    assert intermediate_storage.has_intermediate(None,
                                                 StepOutputHandle("add_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 2
def test_address_operation_using_intermediates_file_system():
    with seven.TemporaryDirectory() as tmpdir_path:
        output_address = os.path.join(tmpdir_path, "solid1.output")
        output_value = 5

        instance = DagsterInstance.ephemeral()
        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, run_id="some_run_id")

        object_operation_result = intermediate_storage.set_intermediate_to_address(
            context=None,
            dagster_type=Int,
            step_output_handle=StepOutputHandle("solid1.compute"),
            value=output_value,
            address=output_address,
        )

        assert object_operation_result.key == output_address
        assert object_operation_result.obj == output_value

        assert (intermediate_storage.get_intermediate_from_address(
            context=None,
            dagster_type=Int,
            step_output_handle=StepOutputHandle("solid1.compute"),
            address=output_address,
        ).obj == output_value)

        with pytest.raises(
                DagsterAddressIOError,
                match="No such file or directory",
        ):
            intermediate_storage.set_intermediate_to_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                value=1,
                address="invalid_address",
            )

        with pytest.raises(
                DagsterAddressIOError,
                match="No such file or directory",
        ):
            intermediate_storage.get_intermediate_from_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                address=os.path.join(tmpdir_path, "invalid.output"),
            )
def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, 'data/test.csv'))

    @pipeline(mode_defs=[spark_local_fs_mode])
    def spark_df_test_pipeline():
        ingest_csv_file_handle_to_spark(nonce())

    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        spark_df_test_pipeline,
        mode='spark',
        run_config={
            'intermediate_storage': {
                'filesystem': {}
            },
            'resources': {
                'pyspark': {
                    'config': {
                        'spark_conf': spark_config
                    }
                }
            },
        },
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id=result.run_id)

    assert result.success
    result_dir = os.path.join(
        intermediate_storage.root,
        'intermediates',
        'ingest_csv_file_handle_to_spark.compute',
        'result',
    )

    assert '_SUCCESS' in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'
def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, "data/test.csv"))

    @pipeline(mode_defs=[spark_local_fs_mode])
    def spark_df_test_pipeline():
        ingest_csv_file_handle_to_spark(nonce())

    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        spark_df_test_pipeline,
        mode="spark",
        run_config={
            "intermediate_storage": {
                "filesystem": {}
            },
            "resources": {
                "pyspark": {
                    "config": {
                        "spark_conf": spark_config
                    }
                }
            },
        },
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id=result.run_id)

    assert result.success
    result_dir = os.path.join(
        intermediate_storage.root,
        "intermediates",
        "ingest_csv_file_handle_to_spark",
        "result",
    )

    assert "_SUCCESS" in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == "1"
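
# The two Spark examples above assume a spark_local_fs_mode that exposes the
# dagster-pyspark resource under the "pyspark" key. A minimal sketch of such a
# mode; the real fixture in the dagster test suite may configure more than this:
from dagster import ModeDefinition
from dagster_pyspark import pyspark_resource

spark_local_fs_mode = ModeDefinition(
    name="spark",
    resource_defs={"pyspark": pyspark_resource},
)
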
def test_success_whole_execution_plan_with_in_memory_config(
        graphql_context, snapshot):
    instance = graphql_context.instance
    selector = infer_pipeline_selector(graphql_context, 'csv_hello_world')
    run_config = merge_dicts(csv_hello_world_solids_config(),
                             {'storage': {
                                 'in_memory': {}
                             }})
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': run_config,
                'stepKeys': None,
                'executionMetadata': {
                    'runId': pipeline_run.run_id
                },
                'mode': 'default',
            },
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['stepKey']: step_event
        for step_event in query_result['stepEvents'] if step_event['stepKey']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle('sum_solid.compute'))
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle('sum_sq_solid.compute'))
def test_success_whole_execution_plan_with_in_memory_config(
        graphql_context, snapshot):
    instance = graphql_context.instance
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
    run_config = merge_dicts(csv_hello_world_solids_config(),
                             {"storage": {
                                 "in_memory": {}
                             }})
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": run_config,
                "stepKeys": None,
                "executionMetadata": {
                    "runId": pipeline_run.run_id
                },
                "mode": "default",
            },
        },
    )

    query_result = result.data["executePlan"]

    assert query_result["__typename"] == "ExecutePlanSuccess"
    assert query_result["pipeline"]["name"] == "csv_hello_world"
    assert query_result["hasFailures"] is False
    step_events = {
        step_event["stepKey"]: step_event
        for step_event in query_result["stepEvents"] if step_event["stepKey"]
    }
    assert "sum_solid.compute" in step_events
    assert "sum_sq_solid.compute" in step_events

    snapshot.assert_match(clean_log_messages(result.data))
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_sq_solid.compute"))
def test_using_file_system_for_subplan():
    pipeline = define_inty_pipeline()

    run_config = {'storage': {'filesystem': {}}}

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('return_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('return_one.compute')).obj == 1)

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('add_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 2)
def test_using_intermediate_file_system_for_subplan_multiprocessing():

    run_config = {'intermediate_storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline = reconstructable(define_inty_pipeline)

    execution_plan = create_execution_plan(pipeline, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(), execution_plan=execution_plan)

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=dict(run_config, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('return_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('return_one.compute')).obj == 1)

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            run_config=dict(run_config, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('add_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 2)
    def test_successful_pipeline_reexecution(self, graphql_context):
        selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
        run_id = make_new_run_id()
        result_one = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            LAUNCH_PIPELINE_EXECUTION_MUTATION,
            variables={
                "executionParams": {
                    "selector": selector,
                    "runConfigData":
                    csv_hello_world_solids_config_fs_storage(),
                    "executionMetadata": {
                        "runId": run_id
                    },
                    "mode": "default",
                }
            },
        )

        assert (result_one.data["launchPipelineExecution"]["__typename"] ==
                "LaunchPipelineRunSuccess")

        expected_value_repr = (
            """[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), """
            """('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), """
            """('sum_sq', 49)])]""")

        instance = graphql_context.instance

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, run_id)
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("sum_solid.compute"))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("sum_sq_solid.compute"))
        assert (str(
            intermediate_storage.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle("sum_sq_solid.compute")).obj) ==
                expected_value_repr)

        # retry
        new_run_id = make_new_run_id()

        result_two = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            LAUNCH_PIPELINE_REEXECUTION_MUTATION,
            variables={
                "executionParams": {
                    "selector": selector,
                    "runConfigData":
                    csv_hello_world_solids_config_fs_storage(),
                    "stepKeys": ["sum_sq_solid.compute"],
                    "executionMetadata": {
                        "runId": new_run_id,
                        "rootRunId": run_id,
                        "parentRunId": run_id,
                        "tags": [{
                            "key": RESUME_RETRY_TAG,
                            "value": "true"
                        }],
                    },
                    "mode": "default",
                }
            },
        )

        query_result = result_two.data["launchPipelineReexecution"]
        assert query_result["__typename"] == "LaunchPipelineRunSuccess"

        result = get_all_logs_for_finished_run_via_subscription(
            graphql_context, new_run_id)
        logs = result["pipelineRunLogs"]["messages"]

        assert isinstance(logs, list)
        assert has_event_of_type(logs, "PipelineStartEvent")
        assert has_event_of_type(logs, "PipelineSuccessEvent")
        assert not has_event_of_type(logs, "PipelineFailureEvent")

        assert not get_step_output_event(logs, "sum_solid.compute")
        assert get_step_output_event(logs, "sum_sq_solid.compute")

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, new_run_id)
        assert not intermediate_storage.has_intermediate(
            None,
            StepOutputHandle("sum_solid.inputs.num.read",
                             "input_thunk_output"))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("sum_solid.compute"))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("sum_sq_solid.compute"))
        assert (str(
            intermediate_storage.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle("sum_sq_solid.compute")).obj) ==
                expected_value_repr)
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two

    environment_config = EnvironmentConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
        step_keys_to_execute=["add_two"],
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two"], pipeline_def,
                                         environment_config),
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two"],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match="Can not build subset plan from unknown step: nope",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope"],
            instance=instance,
        )
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two.compute"],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
            DagsterInvalidSubsetError,
            match="No qualified steps to execute found for step_selection"):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope.compute"],
            instance=instance,
        )
    def test_successful_pipeline_reexecution(self, graphql_context):
        selector = infer_pipeline_selector(graphql_context, 'csv_hello_world')
        run_id = make_new_run_id()
        result_one = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            LAUNCH_PIPELINE_EXECUTION_MUTATION,
            variables={
                'executionParams': {
                    'selector': selector,
                    'runConfigData':
                    csv_hello_world_solids_config_fs_storage(),
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'mode': 'default',
                }
            },
        )

        assert (result_one.data['launchPipelineExecution']['__typename'] ==
                'LaunchPipelineRunSuccess')

        expected_value_repr = (
            '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
            '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
            '''('sum_sq', 49)])]''')

        instance = graphql_context.instance

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, run_id)
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle('sum_solid.compute'))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle('sum_sq_solid.compute'))
        assert (str(
            intermediate_storage.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle('sum_sq_solid.compute')).obj) ==
                expected_value_repr)

        # retry
        new_run_id = make_new_run_id()

        result_two = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            LAUNCH_PIPELINE_REEXECUTION_MUTATION,
            variables={
                'executionParams': {
                    'selector': selector,
                    'runConfigData':
                    csv_hello_world_solids_config_fs_storage(),
                    'stepKeys': ['sum_sq_solid.compute'],
                    'executionMetadata': {
                        'runId': new_run_id,
                        'rootRunId': run_id,
                        'parentRunId': run_id,
                        'tags': [{
                            'key': RESUME_RETRY_TAG,
                            'value': 'true'
                        }],
                    },
                    'mode': 'default',
                }
            },
        )

        query_result = result_two.data['launchPipelineReexecution']
        assert query_result['__typename'] == 'LaunchPipelineRunSuccess'

        result = get_all_logs_for_finished_run_via_subscription(
            graphql_context, new_run_id)
        logs = result['pipelineRunLogs']['messages']

        assert isinstance(logs, list)
        assert has_event_of_type(logs, 'PipelineStartEvent')
        assert has_event_of_type(logs, 'PipelineSuccessEvent')
        assert not has_event_of_type(logs, 'PipelineFailureEvent')

        assert not get_step_output_event(logs, 'sum_solid.compute')
        assert get_step_output_event(logs, 'sum_sq_solid.compute')

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, new_run_id)
        assert not intermediate_storage.has_intermediate(
            None,
            StepOutputHandle('sum_solid.inputs.num.read',
                             'input_thunk_output'))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle('sum_solid.compute'))
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle('sum_sq_solid.compute'))
        assert (str(
            intermediate_storage.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle('sum_sq_solid.compute')).obj) ==
                expected_value_repr)