Example #1
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {
        'storage': {
            's3': {
                'config': {
                    's3_bucket': s3_bucket
                }
            }
        }
    }

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=RunConfig(run_id=run_id))

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict,
                                 RunConfig(run_id=run_id),
                                 instance) as context:
        store = S3IntermediateStore(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build().s3.session)
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute',
                                      Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict,
                                 RunConfig(run_id=run_id),
                                 instance) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
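
All of the examples on this page exercise the same small test pipeline via define_inty_pipeline(), which is not shown here. The following is a hedged sketch of what that helper might look like, inferred only from the step keys ('return_one.compute', 'add_one.compute') and the expected intermediate values (1 and 2), using Dagster's legacy solid/pipeline API; the should_throw flag accepted by some call sites (presumably adding a failing solid) is ignored in this sketch.

from dagster import InputDefinition, Int, OutputDefinition, lambda_solid, pipeline


@lambda_solid(output_def=OutputDefinition(Int))
def return_one():
    # Produces the intermediate asserted to equal 1 in the tests above.
    return 1


@lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
def add_one(num):
    # Produces the intermediate asserted to equal 2.
    return num + 1


@pipeline
def inty_pipeline():
    add_one(return_one())


def define_inty_pipeline(should_throw=False):  # signature assumed from the call sites above
    return inty_pipeline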
Example #2
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket}}}}

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:

        intermediates_manager = S3IntermediateStorage(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"},).s3,
        )
        step_output_handle = StepOutputHandle("return_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["add_one.compute"]), run_config, pipeline_run, instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 2
Example #3
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {
        'storage': {
            'gcs': {
                'config': {
                    'gcs_bucket': gcs_bucket
                }
            }
        }
    }

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=RunConfig(run_id=run_id))

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict, pipeline_run,
                                 instance) as context:
        store = GCSIntermediateStore(
            gcs_bucket,
            run_id,
            client=context.scoped_resources_builder.build().gcs.client)
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute',
                                      Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict, pipeline_run,
                                 instance) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #4
def test_using_s3_for_subplan(s3_bucket):
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'s3': {'s3_bucket': s3_bucket}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']

    run_id = str(uuid.uuid4())

    store = S3IntermediateStore(s3_bucket, run_id)

    try:
        return_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=step_keys,
            )
        )

        assert get_step_output(return_one_step_events, 'return_one.compute')
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            assert store.has_intermediate(context, 'return_one.compute')
            assert store.get_intermediate(context, 'return_one.compute', Int) == 1

        add_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=['add_one.compute'],
            )
        )

        assert get_step_output(add_one_step_events, 'add_one.compute')
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            assert store.has_intermediate(context, 'add_one.compute')
            assert store.get_intermediate(context, 'add_one.compute', Int) == 2
    finally:
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            store.rm_intermediate(context, 'return_one.compute')
            store.rm_intermediate(context, 'add_one.compute')
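
The s3_bucket argument in these tests is a pytest fixture supplied by the surrounding test suite. A minimal, hypothetical stand-in might simply return the name of a bucket your AWS credentials can write to (the bucket name below is illustrative):

import pytest


@pytest.fixture
def s3_bucket():
    # Hypothetical fixture; the real suite supplies its own (possibly mocked) bucket.
    return 'my-dagster-scratch-bucket'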
Example #5
    def define_out_of_pipeline_context(self, config=None):
        '''Defines a context to be used in a notebook (i.e., not in pipeline execution).

        '''
        config = check.opt_dict_param(config, 'config')
        pipeline_def = PipelineDefinition([], name='Ephemeral Notebook Pipeline')

        if config.keys():
            warnings.warn(
                'Config keys will not be respected for in-notebook '
                'execution: [{keys}]'.format(
                    keys=', '.join(['\'{key}\''.format(key=key) for key in config.keys()])
                )
            )

            config = {}

        run_config = RunConfig()

        with scoped_pipeline_context(
            pipeline_def, config, run_config, scoped_resources_builder_cm=self.setup_resources
        ) as pipeline_context:
            self.context = DagstermillInNotebookExecutionContext(
                pipeline_context, out_of_pipeline=True
            )

        if self.context.resources:  # pylint: disable=protected-access
            warnings.warn(
                'Call dagstermill.teardown() to finalize resources attached to the context.'
            )
        return self.context
Example #6
    def define_out_of_pipeline_context(self, config=None):
        '''Defines a context to be used in a notebook (i.e., not in pipeline execution).

        '''
        config = check.opt_dict_param(config, 'config')
        pipeline_def = PipelineDefinition([],
                                          name='Ephemeral Notebook Pipeline')

        # BUG: If the context cleans up after itself (e.g. closes a db connection or similar)
        # This will instigate that process *before* return. We are going to have to
        # manage this manually (without an if block) in order to make this work.
        # See https://github.com/dagster-io/dagster/issues/796

        if config.keys():
            warnings.warn(
                'Config keys will not be respected for in-notebook '
                'execution: [{keys}]'.format(keys=', '.join(
                    ['\'{key}\''.format(key=key) for key in config.keys()])))

            config = {}

        with scoped_pipeline_context(pipeline_def, config,
                                     RunConfig(run_id='')) as pipeline_context:
            self.context = DagstermillInNotebookExecutionContext(
                pipeline_context, out_of_pipeline=True)
        return self.context
Example #7
def test_gcs_pipeline_with_custom_prefix(gcs_bucket):
    run_id = str(uuid.uuid4())
    gcs_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket, 'gcs_prefix': gcs_prefix}}}
    }

    pipeline_run = PipelineRun.create_empty_run(
        pipe.name, run_id=run_id, environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id),
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict, run_config=pipeline_run)
    with scoped_pipeline_context(
        pipe, environment_dict, pipeline_run, instance, execution_plan
    ) as context:
        store = GCSIntermediateStore(
            run_id=run_id,
            gcs_bucket=gcs_bucket,
            gcs_prefix=gcs_prefix,
            client=context.scoped_resources_builder.build(
                mapper_fn=SolidInvocation.default_resource_mapper_fn,
                required_resource_keys={'gcs'},
            ).gcs.client,
        )
        assert store.root == '/'.join(['custom_prefix', 'storage', run_id])
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #8
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = InMemoryPipeline(PipelineDefinition([]))
    pipeline_def = pipeline.get_definition()
    instance = check.opt_inst_param(instance,
                                    "instance",
                                    DagsterInstance,
                                    default=DagsterInstance.ephemeral())

    execution_plan = create_execution_plan(pipeline)

    pipeline_run = instance.create_run(
        pipeline_name="<empty>",
        run_id=run_id,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()),
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )
    with scoped_pipeline_context(execution_plan, {}, pipeline_run,
                                 instance) as context:
        yield context
Example #9
    def get_context(self,
                    solid_config=None,
                    mode_def=None,
                    environment_dict=None):
        '''Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            environment_dict(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :class:`dagstermill.DagstermillExecutionContext`
        '''
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict,
                                                'environment_dict',
                                                key_type=str)

        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description='Ephemeral solid constructed by dagstermill.get_context()',
        )

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name='ephemeral_dagstermill_pipeline')

        run_config = RunConfig(mode=mode_def.name)

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
                self.pipeline_def,
                environment_dict,
                run_config,
                instance=DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context,
                                                       solid_config)

        return self.context
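
In a notebook, this method is reached through the public dagstermill module. A minimal usage sketch follows (the solid_config value is illustrative); per the docstring above, the value is surfaced as context.solid_config:

import dagstermill

# Build an ephemeral context for interactive exploration.
context = dagstermill.get_context(solid_config={'date': '2019-01-01'})
assert context.solid_config == {'date': '2019-01-01'}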
Example #10
def test_s3_pipeline_with_custom_prefix(s3_bucket):
    run_id = make_new_run_id()
    s3_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {'s3': {'config': {'s3_bucket': s3_bucket, 's3_prefix': s3_prefix}}}
    }

    pipeline_run = PipelineRun.create_empty_run(
        pipe.name, run_id=run_id, environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id),
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict, RunConfig(run_id=run_id))
    with scoped_pipeline_context(
        pipe, environment_dict, pipeline_run, instance, execution_plan
    ) as context:
        store = S3IntermediateStore(
            run_id=run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={'s3'}).s3,
        )
        assert store.root == '/'.join(['custom_prefix', 'storage', run_id])
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #11
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = InMemoryExecutablePipeline(PipelineDefinition([]))
    instance = check.opt_inst_param(instance,
                                    'instance',
                                    DagsterInstance,
                                    default=DagsterInstance.ephemeral())
    pipeline_run = instance.create_run(
        pipeline_name='<empty>',
        run_id=run_id,
        environment_dict=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=None,
        execution_plan_snapshot=None,
        parent_pipeline_snapshot=None,
    )
    with scoped_pipeline_context(
            create_execution_plan(pipeline),
            {},
            pipeline_run,
            instance,
    ) as context:
        yield context
Example #12
def test_s3_pipeline_with_custom_prefix(s3_bucket):
    s3_prefix = "custom_prefix"

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket, "s3_prefix": s3_prefix}}}}

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, run_config=run_config,)
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance,) as context:
        intermediates_manager = S3IntermediateStorage(
            run_id=result.run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"}).s3,
        )
        assert intermediates_manager.root == "/".join(["custom_prefix", "storage", result.run_id])
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle("return_one.compute")
            ).obj
            == 1
        )
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle("add_one.compute")
            ).obj
            == 2
        )
Example #13
def yield_empty_pipeline_context(run_id=None, instance=None):
    with scoped_pipeline_context(
        PipelineDefinition([]),
        {},
        RunConfig(run_id=run_id),
        instance or DagsterInstance.ephemeral(),
    ) as context:
        yield context
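
These yield_empty_pipeline_context helpers are generator functions; in their defining modules they are typically wrapped with contextlib.contextmanager so that they can be used in a with block. A hedged usage sketch, assuming such a wrapper and that the helper above is importable:

from contextlib import contextmanager

# Turn the generator from the example above into a with-statement-compatible factory.
empty_pipeline_context = contextmanager(yield_empty_pipeline_context)

with empty_pipeline_context(run_id='interactive-test-run') as context:
    # context is the scoped pipeline context yielded above.
    print(type(context).__name__)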
Example #14
def yield_empty_pipeline_context(run_id=None, instance=None):
    with scoped_pipeline_context(
        PipelineDefinition([]),
        {},
        PipelineRun.create_empty_run('empty', run_id=run_id),
        instance or DagsterInstance.ephemeral(),
    ) as context:
        yield context
Example #15
def execute_on_dask(handle,
                    env_config=None,
                    run_config=None,
                    dask_config=None):  # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig,
                                       DaskConfig())
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig,
                                      RunConfig(executor_config=dask_config))

    check.inst(
        run_config.executor_config,
        DaskConfig,
        'run_config.executor_config should be instance of DaskConfig to execute on Dask',
    )

    pipeline_def = handle.build_pipeline_definition()

    execution_plan = create_execution_plan(pipeline_def,
                                           env_config,
                                           run_config=run_config)

    with scoped_pipeline_context(pipeline_def, env_config,
                                 run_config) as pipeline_context:
        events = list(
            DaskEngine.execute(pipeline_context, execution_plan, None))

        return PipelineExecutionResult(
            pipeline_def,
            run_config.run_id,
            events,
            lambda: scoped_pipeline_context(
                pipeline_def,
                env_config,
                run_config,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    run_storage=pipeline_context.run_storage,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
Example #16
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = PipelineDefinition([])
    with scoped_pipeline_context(
        pipeline,
        {},
        PipelineRun.create_empty_run('empty', run_id=run_id if run_id is not None else 'TESTING',),
        instance or DagsterInstance.ephemeral(),
        create_execution_plan(pipeline),
    ) as context:
        yield context
Example #17
def test_adls2_pipeline_with_custom_prefix(storage_account, file_system):
    adls2_prefix = "custom_prefix"

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        "resources": {
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": get_azure_credential()
                }
            }
        },
        "intermediate_storage": {
            "adls2": {
                "config": {
                    "adls2_file_system": file_system,
                    "adls2_prefix": adls2_prefix
                }
            }
        },
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe,
        run_config=run_config,
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(
            execution_plan,
            InMemoryPipeline(pipe),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        resource = context.scoped_resources_builder.build(
            required_resource_keys={"adls2"}).adls2
        intermediate_storage = ADLS2IntermediateStorage(
            run_id=result.run_id,
            file_system=file_system,
            prefix=adls2_prefix,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        assert intermediate_storage.root == "/".join(
            ["custom_prefix", "storage", result.run_id])
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("return_one")).obj == 1)
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("add_one")).obj == 2)
Example #18
def test_adls2_pipeline_with_custom_prefix(storage_account, file_system):
    adls2_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        'resources': {
            'adls2': {
                'config': {
                    'storage_account': storage_account,
                    'credential': get_azure_credential()
                }
            }
        },
        'storage': {
            'adls2': {
                'config': {
                    'adls2_file_system': file_system,
                    'adls2_prefix': adls2_prefix
                }
            }
        },
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe,
        run_config=run_config,
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
    ) as context:
        resource = context.scoped_resources_builder.build(
            required_resource_keys={'adls2'}).adls2
        store = ADLS2IntermediateStore(
            run_id=result.run_id,
            file_system=file_system,
            prefix=adls2_prefix,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert store.root == '/'.join(
            ['custom_prefix', 'storage', result.run_id])
        assert (intermediates_manager.get_intermediate(
            context, Int, StepOutputHandle('return_one.compute')).obj == 1)
        assert (intermediates_manager.get_intermediate(
            context, Int, StepOutputHandle('add_one.compute')).obj == 2)
Example #19
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle=None,
        run_config=None,
        solid_subset=None,
        solid_handle=None,
    ):
        '''Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.inst_param(run_config, 'run_config', RunConfig)
        check.inst_param(handle, 'handle', ExecutionTargetHandle)
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.inst_param(solid_handle, 'solid_handle', SolidHandle)

        pipeline_def = check.inst_param(
            handle.build_pipeline_definition(),
            'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
            PipelineDefinition,
        ).build_sub_pipeline(solid_subset)

        solid_def = pipeline_def.get_solid(solid_handle)

        run_config = run_config.with_log_sink(construct_sqlite_logger(output_log_path))

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            run_config,
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillExecutionContext(pipeline_context)

        return self.context
Example #20
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = PipelineDefinition([])
    instance = check.opt_inst_param(
        instance, 'instance', DagsterInstance, default=DagsterInstance.ephemeral()
    )
    pipeline_run = instance.get_or_create_run(
        run_id=run_id, pipeline_name='<empty>', pipeline_snapshot=None
    )
    with scoped_pipeline_context(
        pipeline, {}, pipeline_run, instance, create_execution_plan(pipeline),
    ) as context:
        yield context
Example #21
def test_s3_pipeline_with_custom_prefix(s3_bucket):
    s3_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {
            's3': {
                'config': {
                    's3_bucket': s3_bucket,
                    's3_prefix': s3_prefix
                }
            }
        }
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name,
                               environment_dict=environment_dict)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe,
        environment_dict=environment_dict,
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict)
    with scoped_pipeline_context(
            execution_plan,
            environment_dict,
            pipeline_run,
            instance,
    ) as context:
        store = S3IntermediateStore(
            run_id=result.run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(
                required_resource_keys={'s3'}).s3,
        )
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert store.root == '/'.join(
            ['custom_prefix', 'storage', result.run_id])
        assert (intermediates_manager.get_intermediate(
            context, Int, StepOutputHandle('return_one.compute')).obj == 1)
        assert (intermediates_manager.get_intermediate(
            context, Int, StepOutputHandle('add_one.compute')).obj == 2)
Example #22
    def get_context(self,
                    solid_def=None,
                    mode_def=None,
                    environment_dict=None):
        check.opt_inst_param(solid_def, 'solid_def', SolidDefinition)
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict,
                                                'environment_dict',
                                                key_type=str)

        if solid_def is None:
            solid_def = SolidDefinition(
                name='this_solid',
                input_defs=[],
                compute_fn=lambda *args, **kwargs: None,
                output_defs=[],
                description='Ephemeral solid constructed by dagstermill.get_context()',
            )

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name='ephemeral_dagstermill_pipeline')

        run_config = RunConfig(mode=mode_def.name)

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        with scoped_pipeline_context(
                self.pipeline_def,
                environment_dict,
                run_config,
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillInPipelineExecutionContext(
                pipeline_context)

        return self.context
Example #23
def test_gcs_pipeline_with_custom_prefix(gcs_bucket):
    gcs_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        'storage': {
            'gcs': {
                'config': {
                    'gcs_bucket': gcs_bucket,
                    'gcs_prefix': gcs_prefix
                }
            }
        }
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe,
        run_config=run_config,
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
    ) as context:
        intermediate_storage = GCSIntermediateStorage(
            run_id=result.run_id,
            gcs_bucket=gcs_bucket,
            gcs_prefix=gcs_prefix,
            client=context.scoped_resources_builder.build(
                required_resource_keys={'gcs'}, ).gcs,
        )
        assert intermediate_storage.root == '/'.join(
            ['custom_prefix', 'storage', result.run_id])
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle('return_one.compute')).obj == 1)
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle('add_one.compute')).obj == 2)
Example #24
def test_gcs_pipeline_with_custom_prefix(gcs_bucket):
    gcs_prefix = "custom_prefix"

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        "intermediate_storage": {
            "gcs": {
                "config": {
                    "gcs_bucket": gcs_bucket,
                    "gcs_prefix": gcs_prefix
                }
            }
        }
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe,
        run_config=run_config,
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
    ) as context:
        intermediate_storage = GCSIntermediateStorage(
            run_id=result.run_id,
            gcs_bucket=gcs_bucket,
            gcs_prefix=gcs_prefix,
            client=context.scoped_resources_builder.build(
                required_resource_keys={"gcs"}, ).gcs,
        )
        assert intermediate_storage.root == "/".join(
            ["custom_prefix", "storage", result.run_id])
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("return_one")).obj == 1)
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("add_one")).obj == 2)
Example #25
    def execute(self):
        from dagster.core.execution.api import scoped_pipeline_context

        check.inst(self.run_config.executor_config, MultiprocessExecutorConfig)
        pipeline = self.run_config.executor_config.handle.build_pipeline_definition()

        with scoped_pipeline_context(
                pipeline, self.environment_dict,
                self.run_config.with_tags(
                    pid=str(os.getpid()))) as pipeline_context:

            execution_plan = ExecutionPlan.build(
                pipeline_context.pipeline_def,
                pipeline_context.environment_config)

            for step_event in InProcessEngine.execute(
                    pipeline_context,
                    execution_plan,
                    step_keys_to_execute=[self.step_key]):
                yield step_event
Example #26
def yield_empty_pipeline_context(run_id=None):
    with scoped_pipeline_context(PipelineDefinition([]), {},
                                 RunConfig(run_id=run_id)) as context:
        yield context
Example #27
    def populate_context(
        self,
        run_id=None,
        mode=None,
        solid_def_name=None,
        pipeline_name=None,
        marshal_dir=None,
        environment_config=None,
        input_name_type_dict=None,
        output_name_type_dict=None,
        output_log_path=None,
        **_kwargs
    ):
        check.str_param(run_id, 'run_id')
        check.str_param(mode, 'mode')
        check.str_param(solid_def_name, 'solid_def_name')
        check.str_param(pipeline_name, 'pipeline_name')
        check.str_param(marshal_dir, 'marshal_dir')
        check.dict_param(environment_config, 'environment_config')
        check.dict_param(input_name_type_dict, 'input_name_type_dict')
        check.dict_param(output_name_type_dict, 'output_name_type_dict')
        check.str_param(output_log_path, 'output_log_path')

        self.populated_by_papermill = True
        self.solid_def_name = solid_def_name
        self.marshal_dir = marshal_dir

        logger_def = construct_logger(output_log_path)
        loggers = {'dagstermill': logger_def}

        if self.repository_def is None:
            self.solid_def = None
            self.pipeline_def = PipelineDefinition(
                [],
                mode_definitions=[ModeDefinition(loggers=loggers)],
                name='Dummy Pipeline (No Repo Registration)',
            )
            self.input_name_type_dict = dict_to_enum(input_name_type_dict)
            self.output_name_type_dict = dict_to_enum(output_name_type_dict)
            for _, runtime_type_enum in self.input_name_type_dict.items():
                if runtime_type_enum == SerializableRuntimeType.NONE:
                    raise DagstermillError(
                        'If Dagstermill solids have inputs that require serialization strategies '
                        'that are not pickling, then you must register a repository within '
                        'notebook by calling dagstermill.register_repository(repository_def)'
                    )
            for _, runtime_type_enum in self.output_name_type_dict.items():
                if runtime_type_enum == SerializableRuntimeType.NONE:
                    raise DagstermillError(
                        'If Dagstermill solids have outputs that require serialization strategies '
                        'that are not pickling, then you must register a repository within '
                        'notebook by calling dagstermill.register_repository(repository_def).'
                    )
            environment_config = {'loggers': {'dagstermill': {}}}
            run_config = RunConfig(run_id=run_id, mode=mode)

        else:
            self.pipeline_def = self.repository_def.get_pipeline(pipeline_name)
            check.invariant(
                self.pipeline_def.has_solid_def(solid_def_name),
                'solid {} not found'.format(solid_def_name),
            )
            self.solid_def = self.pipeline_def.solid_def_named(solid_def_name)

            logger = logger_def.logger_fn(
                InitLoggerContext({}, self.pipeline_def, logger_def, run_id)
            )

            run_config = RunConfig(run_id, loggers=[logger], mode=mode)

        with scoped_pipeline_context(
            self.pipeline_def,
            environment_config,
            run_config,
            scoped_resources_builder_cm=self.setup_resources,
        ) as pipeline_context:
            self.context = DagstermillInNotebookExecutionContext(pipeline_context)

        return self.context
Example #28
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            raise DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ) from err

        pipeline_run = unpack_value(pipeline_run_dict)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        environment_config = EnvironmentConfig.build(pipeline_def,
                                                     run_config,
                                                     mode=pipeline_run.mode)

        execution_plan = ExecutionPlan.build(
            self.pipeline,
            environment_config,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Set this flag even though we're not in test for clearer error reporting
                raise_on_error=True,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=run_config.get("solids", {}).get(solid_def.name, {}).get("config"),
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
Example #29
def execute_on_dask(
    handle, env_config=None, run_config=None, mode=None, dask_config=None
):  # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)

    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig, DaskConfig())
    run_config = check.opt_inst_param(
        run_config, 'run_config', RunConfig, RunConfig(storage_mode=RunStorageMode.FILESYSTEM)
    )
    pipeline = handle.build_pipeline_definition()
    mode = check.opt_str_param(mode, 'mode', pipeline.get_default_mode_name())

    # Checks to ensure storage is compatible with Dask configuration
    storage = env_config.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    if dask_config.is_remote_execution:
        check.invariant(
            storage.get('s3'),
            'Must use S3 storage with non-local Dask address {dask_address}'.format(
                dask_address=dask_config.address
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Dask, use filesystem or S3',
        )

    execution_plan = create_execution_plan(pipeline, env_config, mode=mode)

    step_levels = execution_plan.topological_step_levels()

    query = build_graphql_query()

    with scoped_pipeline_context(pipeline, env_config, run_config) as pipeline_context:
        with dask.distributed.Client(**dask_config.build_dict(pipeline.name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    check.invariant(
                        not step_context.run_config.loggers,
                        'Cannot inject loggers via RunConfig with the Dask executor',
                    )

                    check.invariant(
                        not step_context.event_callback,
                        'Cannot use event_callback with Dask executor',
                    )

                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = [
                        execution_futures_dict[ni.prev_output_handle.step_key]
                        for ni in step.step_inputs
                    ]

                    variables = {
                        'executionParams': {
                            'selector': {'name': pipeline.name},
                            'environmentConfigData': env_config,
                            'mode': mode,
                            'executionMetadata': {'runId': run_config.run_id},
                            'stepKeys': [step.key],
                        }
                    }

                    future = client.submit(
                        query_on_dask_worker, handle, query, variables, dependencies
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results to the master
            execution_step_events = client.gather(execution_futures)

            # execution_step_events is now a list of lists; the inner lists contain the dagster
            # events emitted by each step
            event_list = list(itertools.chain.from_iterable(execution_step_events))

            return PipelineExecutionResult(
                pipeline,
                run_config.run_id,
                event_list,
                lambda: scoped_pipeline_context(
                    pipeline,
                    env_config,
                    run_config,
                    intermediates_manager=pipeline_context.intermediates_manager,
                ),
            )
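
A hedged call sketch for this Dask entry point, respecting the constraints enforced above (storage must be configured and must not be in_memory for local execution). The import locations and ExecutionTargetHandle.for_pipeline_fn are assumptions about this Dagster version, and the pipeline itself is a placeholder:

from dagster import lambda_solid, pipeline
from dagster import ExecutionTargetHandle  # assumed import location
from dagster_dask import DaskConfig, execute_on_dask  # assumed import location


@lambda_solid
def noop():
    return 1


@pipeline
def my_dask_pipeline():
    noop()


def define_my_dask_pipeline():
    return my_dask_pipeline


handle = ExecutionTargetHandle.for_pipeline_fn(define_my_dask_pipeline)
result = execute_on_dask(
    handle,
    env_config={'storage': {'filesystem': {}}},  # storage is required and must not be in_memory
    dask_config=DaskConfig(),  # defaults to local execution, per the opt_inst_param above
)
assert result.success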
Example #30
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
                self.context, DagstermillRuntimeExecutionContext):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description="Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name="ephemeral_dagstermill_pipeline")

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        environment_config = EnvironmentConfig.build(pipeline_def,
                                                     run_config,
                                                     mode=mode_def.name)

        pipeline = InMemoryPipeline(pipeline_def)
        execution_plan = ExecutionPlan.build(pipeline, environment_config)

        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context