Code example #1
def create_test_pipeline_execution_context(logger_defs=None):
    loggers = check.opt_dict_param(
        logger_defs, 'logger_defs', key_type=str, value_type=LoggerDefinition
    )
    mode_def = ModeDefinition(logger_defs=loggers)
    pipeline_def = PipelineDefinition(
        name='test_legacy_context', solid_defs=[], mode_defs=[mode_def]
    )
    environment_dict = {'loggers': {key: {} for key in loggers}}
    pipeline_run = PipelineRun(
        pipeline_name='test_legacy_context', environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline=pipeline_def, environment_dict=environment_dict)
    creation_data = create_context_creation_data(
        pipeline_def, environment_dict, pipeline_run, instance, execution_plan
    )
    log_manager = create_log_manager(creation_data)
    scoped_resources_builder = ScopedResourcesBuilder()
    executor_config = create_executor_config(creation_data)
    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=scoped_resources_builder,
        system_storage_data=SystemStorageData(
            intermediates_manager=InMemoryIntermediatesManager(),
            file_manager=LocalFileManager.for_instance(instance, pipeline_run.run_id),
        ),
        log_manager=log_manager,
        executor_config=executor_config,
        raise_on_error=True,
    )
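
A minimal usage sketch for this helper follows, assuming the legacy dagster API used above; the bare @logger decorator form, the 'console' key, and the final log call are illustrative assumptions rather than part of the original example.

import logging

from dagster import logger


@logger
def console_logger(init_context):
    # Return a plain stdlib logger; dagster wraps whatever this function returns.
    return logging.getLogger('console_logger')


# Build an ephemeral execution context that routes log output through the custom
# logger; the helper fills in an empty run config entry for each logger key.
context = create_test_pipeline_execution_context(logger_defs={'console': console_logger})
context.log.info('test context ready')
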
Code example #2
def create_test_pipeline_execution_context(
    logger_defs=None, scoped_resources_builder=None, tags=None, run_config_loggers=None
):
    run_id = str(uuid.uuid4())
    loggers = check.opt_dict_param(
        logger_defs, 'logger_defs', key_type=str, value_type=LoggerDefinition
    )
    mode_def = ModeDefinition(logger_defs=loggers)
    pipeline_def = PipelineDefinition(
        name='test_legacy_context', solid_defs=[], mode_defs=[mode_def]
    )
    run_config_loggers = check.opt_list_param(
        run_config_loggers, 'run_config_loggers', of_type=logging.Logger
    )
    run_config = RunConfig(run_id, tags=tags, loggers=run_config_loggers)
    environment_dict = {'loggers': {key: {} for key in loggers}}
    creation_data = create_context_creation_data(pipeline_def, environment_dict, run_config)
    log_manager = create_log_manager(creation_data)

    scoped_resources_builder = check.opt_inst_param(
        scoped_resources_builder,
        'scoped_resources_builder',
        ScopedResourcesBuilder,
        default=ScopedResourcesBuilder(),
    )
    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=scoped_resources_builder,
        system_storage_data=SystemStorageData(
            run_storage=InMemoryRunStorage(),
            intermediates_manager=InMemoryIntermediatesManager(),
            file_manager=LocalFileManager.for_run_id(run_id),
        ),
        log_manager=log_manager,
    )
Code example #3
File: __init__.py  Project: nuruladroady/dagster
def create_test_pipeline_execution_context(logger_defs=None):
    from dagster.core.storage.intermediate_storage import build_in_mem_intermediates_storage

    loggers = check.opt_dict_param(
        logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition
    )
    mode_def = ModeDefinition(logger_defs=loggers)
    pipeline_def = PipelineDefinition(
        name="test_legacy_context", solid_defs=[], mode_defs=[mode_def]
    )
    run_config = {"loggers": {key: {} for key in loggers}}
    pipeline_run = PipelineRun(pipeline_name="test_legacy_context", run_config=run_config)
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline=pipeline_def, run_config=run_config)
    creation_data = create_context_creation_data(execution_plan, run_config, pipeline_run, instance)
    log_manager = create_log_manager(creation_data)
    scoped_resources_builder = ScopedResourcesBuilder()
    executor = create_executor(creation_data)

    return SystemPipelineExecutionContext(
        construct_execution_context_data(
            context_creation_data=creation_data,
            scoped_resources_builder=scoped_resources_builder,
            intermediate_storage=build_in_mem_intermediates_storage(pipeline_run.run_id),
            system_storage_data=SystemStorageData(
                intermediate_storage=build_in_mem_intermediates_storage(pipeline_run.run_id),
                file_manager=LocalFileManager.for_instance(instance, pipeline_run.run_id),
            ),
            log_manager=log_manager,
            retries=executor.retries,
            raise_on_error=True,
        ),
        executor=executor,
        log_manager=log_manager,
    )
Code example #4
File: system_storage.py  Project: varokas/dagster-1
def adls2_system_storage(init_context):
    '''Persistent system storage using Azure Data Lake Storage Gen2 for storage.

    Suitable for intermediates storage for distributed executors, so long as
    each execution node has network connectivity and credentials for ADLS and
    the backing container.

    Attach this system storage definition, as well as the :py:data:`~dagster_azure.adls2_resource`
    it requires, to a :py:class:`~dagster.ModeDefinition` in order to make it available to your
    pipeline:

    .. code-block:: python

        pipeline_def = PipelineDefinition(
            mode_defs=[
                ModeDefinition(
                    resource_defs={'adls2': adls2_resource, ...},
                    system_storage_defs=default_system_storage_defs + [adls2_system_storage, ...],
                    ...
                ), ...
            ], ...
        )

    You may configure this storage as follows:

    .. code-block:: YAML

        storage:
          adls2:
            config:
              adls2_sa: my-best-storage-account
              adls2_file_system: my-cool-file-system
              adls2_prefix: good/prefix-for-files-
    '''
    resource = init_context.resources.adls2
    adls2_base = '{prefix}/storage/{run_id}/files'.format(
        prefix=init_context.system_storage_config['adls2_prefix'],
        run_id=init_context.pipeline_run.run_id,
    )
    return SystemStorageData(
        file_manager=ADLS2FileManager(
            adls2_client=resource.adls2_client,
            file_system=init_context.system_storage_config['adls2_file_system'],
            prefix=adls2_base,
        ),
        intermediates_manager=IntermediateStoreIntermediatesManager(
            ADLS2IntermediateStore(
                file_system=init_context.system_storage_config['adls2_file_system'],
                run_id=init_context.pipeline_run.run_id,
                adls2_client=resource.adls2_client,
                blob_client=resource.blob_client,
                prefix=init_context.system_storage_config['adls2_prefix'],
                type_storage_plugin_registry=init_context.type_storage_plugin_registry,
            )),
    )
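
To make the attachment snippet in the docstring concrete, here is a hedged wiring sketch; 'my_pipeline', the empty solid list, and the mode name are placeholders, the dagster_azure import path is an assumption, and only the ADLS2 storage is registered rather than appending to default_system_storage_defs as the docstring suggests.

from dagster import ModeDefinition, PipelineDefinition
# Import path is an assumption about the dagster_azure package layout of this era.
from dagster_azure.adls2 import adls2_resource, adls2_system_storage

pipeline_def = PipelineDefinition(
    name='my_pipeline',   # placeholder name
    solid_defs=[],        # placeholder: add real solids here
    mode_defs=[
        ModeDefinition(
            name='prod',
            resource_defs={'adls2': adls2_resource},
            # The docstring recommends default_system_storage_defs + [adls2_system_storage];
            # only the ADLS2 storage is listed here to keep the sketch minimal.
            system_storage_defs=[adls2_system_storage],
        )
    ],
)
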
Code example #5
def s3_system_storage(init_context):
    '''Persistent system storage using S3 for storage.
    
    Suitable for intermediates storage for distributed executors, so long as
    each execution node has network connectivity and credentials for S3 and
    the backing bucket.

    Attach this system storage definition, as well as the :py:data:`~dagster_aws.s3_resource` it
    requires, to a :py:class:`~dagster.ModeDefinition` in order to make it available to your
    pipeline:

    .. code-block:: python

        pipeline_def = PipelineDefinition(
            mode_defs=[
                ModeDefinition(
                    resource_defs={'s3': s3_resource, ...},
                    system_storage_defs=default_system_storage_defs + [s3_system_storage, ...],
                    ...
                ), ...
            ], ...
        )

    You may configure this storage as follows:

    .. code-block:: YAML
    
        storage:
          s3:
            config:
              s3_bucket: my-cool-bucket
              s3_prefix: good/prefix-for-files-
    '''
    s3_session = init_context.resources.s3.session
    s3_key = '{prefix}/storage/{run_id}/files'.format(
        prefix=init_context.system_storage_config['s3_prefix'],
        run_id=init_context.pipeline_run.run_id,
    )
    return SystemStorageData(
        file_manager=S3FileManager(
            s3_session=s3_session,
            s3_bucket=init_context.system_storage_config['s3_bucket'],
            s3_base_key=s3_key,
        ),
        intermediates_manager=IntermediateStoreIntermediatesManager(
            S3IntermediateStore(
                s3_session=s3_session,
                s3_bucket=init_context.system_storage_config['s3_bucket'],
                s3_prefix=init_context.system_storage_config['s3_prefix'],
                run_id=init_context.pipeline_run.run_id,
                type_storage_plugin_registry=init_context.type_storage_plugin_registry,
            )),
    )
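
The YAML block in the docstring maps directly onto the config dict accepted by execute_pipeline; the sketch below shows that mapping. The pipeline wiring mirrors the ADLS2 sketch above, the dagster_aws import path is an assumption, and the config keyword is environment_dict in the dagster versions these examples target (renamed run_config in later releases).

from dagster import ModeDefinition, PipelineDefinition, execute_pipeline
# Import path is an assumption about the dagster_aws package layout of this era.
from dagster_aws.s3 import s3_resource, s3_system_storage

pipeline_def = PipelineDefinition(
    name='my_pipeline',   # placeholder name
    solid_defs=[],        # placeholder: add real solids here
    mode_defs=[
        ModeDefinition(
            name='prod',
            resource_defs={'s3': s3_resource},
            system_storage_defs=[s3_system_storage],
        )
    ],
)

# Dict equivalent of the YAML block in the docstring above; credentials are
# typically picked up from the ambient boto3 environment by the s3 resource.
environment_dict = {
    'storage': {
        's3': {
            'config': {
                's3_bucket': 'my-cool-bucket',
                's3_prefix': 'good/prefix-for-files-',
            }
        }
    },
}

result = execute_pipeline(pipeline_def, environment_dict=environment_dict, mode='prod')
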
Code example #6
def s3_system_storage(init_context):
    s3_session = init_context.resources.s3.session
    s3_key = 'dagster/storage/{run_id}/files'.format(run_id=init_context.pipeline_run.run_id)
    return SystemStorageData(
        file_manager=S3FileManager(
            s3_session=s3_session,
            s3_bucket=init_context.system_storage_config['s3_bucket'],
            s3_base_key=s3_key,
        ),
        intermediates_manager=IntermediateStoreIntermediatesManager(
            S3IntermediateStore(
                s3_session=s3_session,
                s3_bucket=init_context.system_storage_config['s3_bucket'],
                run_id=init_context.pipeline_run.run_id,
                type_storage_plugin_registry=init_context.type_storage_plugin_registry,
            )
        ),
    )
Code example #7
File: execute.py  Project: rolanddb/dagster
def execute_on_dask(handle,
                    env_config=None,
                    run_config=None,
                    dask_config=None):  # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig,
                                       DaskConfig())
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig,
                                      RunConfig(executor_config=dask_config))

    check.inst(
        run_config.executor_config,
        DaskConfig,
        'run_config.executor_config should be instance of DaskConfig to execute on Dask',
    )

    pipeline_def = handle.build_pipeline_definition()

    execution_plan = create_execution_plan(pipeline_def,
                                           env_config,
                                           run_config=run_config)

    with scoped_pipeline_context(pipeline_def, env_config,
                                 run_config) as pipeline_context:
        events = list(
            DaskEngine.execute(pipeline_context, execution_plan, None))

        return PipelineExecutionResult(
            pipeline_def,
            run_config.run_id,
            events,
            lambda: scoped_pipeline_context(
                pipeline_def,
                env_config,
                run_config,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    run_storage=pipeline_context.run_storage,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
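
A hedged invocation sketch for this entry point: define_dask_demo_pipeline is a placeholder, the dagster_dask import path and the ExecutionTargetHandle.for_pipeline_fn constructor are assumptions about the dagster versions these examples come from, and filesystem storage is configured because the variant in code example #11 below rejects in-memory storage for Dask execution.

from dagster import ExecutionTargetHandle, PipelineDefinition
from dagster_dask import DaskConfig, execute_on_dask


def define_dask_demo_pipeline():
    # Placeholder pipeline with no solids, just to exercise the entry point.
    return PipelineDefinition(name='dask_demo', solid_defs=[])


result = execute_on_dask(
    ExecutionTargetHandle.for_pipeline_fn(define_dask_demo_pipeline),
    # Persistent (non-in-memory) storage so intermediates are visible to Dask workers.
    env_config={'storage': {'filesystem': {}}},
    # Defaults to local execution; pass DaskConfig(address='...') for a remote scheduler.
    dask_config=DaskConfig(),
)
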
Code example #8
File: system_storage.py  Project: xhochy/dagster
def gcs_system_storage(init_context):
    client = init_context.resources.gcs.client
    gcs_key = 'dagster/storage/{run_id}/files'.format(
        run_id=init_context.pipeline_run.run_id)
    return SystemStorageData(
        file_manager=GCSFileManager(
            client=client,
            gcs_bucket=init_context.system_storage_config['gcs_bucket'],
            gcs_base_key=gcs_key,
        ),
        intermediates_manager=IntermediateStoreIntermediatesManager(
            GCSIntermediateStore(
                client=client,
                gcs_bucket=init_context.system_storage_config['gcs_bucket'],
                run_id=init_context.pipeline_run.run_id,
                type_storage_plugin_registry=init_context.type_storage_plugin_registry,
            )),
    )
Code example #9
def gcs_system_storage(init_context):
    client = init_context.resources.gcs
    gcs_key = "{prefix}/storage/{run_id}/files".format(
        prefix=init_context.system_storage_config["gcs_prefix"],
        run_id=init_context.pipeline_run.run_id,
    )
    return SystemStorageData(
        file_manager=GCSFileManager(
            client=client,
            gcs_bucket=init_context.system_storage_config["gcs_bucket"],
            gcs_base_key=gcs_key,
        ),
        intermediate_storage=GCSIntermediateStorage(
            client=client,
            gcs_bucket=init_context.system_storage_config["gcs_bucket"],
            gcs_prefix=init_context.system_storage_config["gcs_prefix"],
            run_id=init_context.pipeline_run.run_id,
            type_storage_plugin_registry=init_context.type_storage_plugin_registry,
        ),
    )
Code example #10
File: test.py  Project: lceames/dagster
def create_test_pipeline_execution_context(logger_defs=None,
                                           scoped_resources_builder=None,
                                           tags=None):
    run_id = str(uuid.uuid4())
    loggers = check.opt_dict_param(logger_defs,
                                   'logger_defs',
                                   key_type=str,
                                   value_type=LoggerDefinition)
    mode_def = ModeDefinition(logger_defs=loggers)
    pipeline_def = PipelineDefinition(name='test_legacy_context',
                                      solid_defs=[],
                                      mode_defs=[mode_def])
    run_config = RunConfig(run_id, tags=tags)
    environment_dict = {'loggers': {key: {} for key in loggers}}
    instance = DagsterInstance.ephemeral()
    creation_data = create_context_creation_data(pipeline_def,
                                                 environment_dict, run_config,
                                                 instance)
    log_manager = create_log_manager(creation_data)
    scoped_resources_builder = check.opt_inst_param(
        scoped_resources_builder,
        'scoped_resources_builder',
        ScopedResourcesBuilder,
        default=ScopedResourcesBuilder(),
    )
    executor_config = create_executor_config(creation_data)
    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=scoped_resources_builder,
        system_storage_data=SystemStorageData(
            intermediates_manager=InMemoryIntermediatesManager(),
            file_manager=LocalFileManager.for_instance(instance, run_id),
        ),
        log_manager=log_manager,
        executor_config=executor_config,
        raise_on_error=True,
    )
Code example #11
def execute_on_dask(handle,
                    env_config=None,
                    run_config=None,
                    mode=None,
                    dask_config=None):  # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)

    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig,
                                       DaskConfig())
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig,
                                      RunConfig())
    pipeline = handle.build_pipeline_definition()
    mode = check.opt_str_param(mode, 'mode', pipeline.get_default_mode_name())

    # Checks to ensure storage is compatible with Dask configuration
    storage = env_config.get('storage')
    check.invariant(storage.keys(),
                    'Must specify storage to use Dask execution')

    if dask_config.is_remote_execution:
        check.invariant(
            storage.get('s3'),
            'Must use S3 storage with non-local Dask address {dask_address}'.
            format(dask_address=dask_config.address),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Dask, use filesystem or S3',
        )

    execution_plan = create_execution_plan(pipeline,
                                           env_config,
                                           run_config=run_config)

    step_levels = execution_plan.topological_step_levels()

    query = build_graphql_query()

    with scoped_pipeline_context(pipeline, env_config,
                                 run_config) as pipeline_context:
        with dask.distributed.Client(
                **dask_config.build_dict(pipeline.name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    check.invariant(
                        not step_context.run_config.loggers,
                        'Cannot inject loggers via RunConfig with the Dask executor',
                    )

                    check.invariant(
                        not step_context.event_callback,
                        'Cannot use event_callback with Dask executor',
                    )

                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = [
                        execution_futures_dict[ni.prev_output_handle.step_key]
                        for ni in step.step_inputs
                    ]

                    variables = {
                        'executionParams': {
                            'selector': {
                                'name': pipeline.name
                            },
                            'environmentConfigData': env_config,
                            'mode': mode,
                            'executionMetadata': {
                                'runId': run_config.run_id
                            },
                            'stepKeys': [step.key],
                        }
                    }

                    future = client.submit(query_on_dask_worker, handle, query,
                                           variables, dependencies)

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results back to the master
            execution_step_events = client.gather(execution_futures)

            # execution_step_events is now a list of lists, the inner lists contain the dagster
            # events emitted by each step
            event_list = list(
                itertools.chain.from_iterable(execution_step_events))

            return PipelineExecutionResult(
                pipeline,
                run_config.run_id,
                event_list,
                lambda: scoped_pipeline_context(
                    pipeline,
                    env_config,
                    run_config,
                    system_storage_data=SystemStorageData(
                        intermediates_manager=pipeline_context.intermediates_manager,
                        run_storage=pipeline_context.run_storage,
                        file_manager=pipeline_context.file_manager,
                    ),
                ),
            )