def create_test_pipeline_execution_context(logger_defs=None):
    '''Build a pipeline execution context for tests around an empty pipeline.

    Args:
        logger_defs: Optional dict mapping logger names (str) to
            LoggerDefinition instances. Every named logger is attached to the
            pipeline's single mode and configured with an empty dict.

    Returns:
        Whatever ``construct_pipeline_execution_context`` returns when given an
        ephemeral DagsterInstance, an in-memory intermediates manager, a local
        file manager for the run, and ``raise_on_error=True``.
    '''
    logger_defs_by_name = check.opt_dict_param(
        logger_defs, 'logger_defs', key_type=str, value_type=LoggerDefinition
    )

    # A pipeline with no solids: only the context plumbing is under test.
    pipeline_def = PipelineDefinition(
        name='test_legacy_context',
        solid_defs=[],
        mode_defs=[ModeDefinition(logger_defs=logger_defs_by_name)],
    )

    # Each logger gets its own fresh, empty config dict.
    environment_dict = {'loggers': dict((name, {}) for name in logger_defs_by_name)}

    pipeline_run = PipelineRun(
        pipeline_name='test_legacy_context', environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline=pipeline_def, environment_dict=environment_dict
    )
    creation_data = create_context_creation_data(
        pipeline_def, environment_dict, pipeline_run, instance, execution_plan
    )

    log_manager = create_log_manager(creation_data)
    resources_builder = ScopedResourcesBuilder()
    executor_config = create_executor_config(creation_data)

    storage_data = SystemStorageData(
        intermediates_manager=InMemoryIntermediatesManager(),
        file_manager=LocalFileManager.for_instance(instance, pipeline_run.run_id),
    )

    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=resources_builder,
        system_storage_data=storage_data,
        log_manager=log_manager,
        executor_config=executor_config,
        raise_on_error=True,
    )
def create_test_pipeline_execution_context(
    logger_defs=None, scoped_resources_builder=None, tags=None, run_config_loggers=None
):
    '''Build a pipeline execution context for tests around an empty pipeline.

    Args:
        logger_defs: Optional dict mapping logger names (str) to
            LoggerDefinition instances; each is attached to the pipeline's
            single mode and configured with an empty dict.
        scoped_resources_builder: Optional ScopedResourcesBuilder. A fresh
            ``ScopedResourcesBuilder()`` is passed as the default.
        tags: Passed straight through to ``RunConfig`` as ``tags``.
        run_config_loggers: Optional list of ``logging.Logger`` objects passed
            to ``RunConfig`` as ``loggers``.

    Returns:
        Whatever ``construct_pipeline_execution_context`` returns when given
        in-memory run storage, an in-memory intermediates manager and a local
        file manager keyed on a freshly generated run id.
    '''
    # Every call gets its own random run id.
    run_id = str(uuid.uuid4())

    logger_defs_by_name = check.opt_dict_param(
        logger_defs, 'logger_defs', key_type=str, value_type=LoggerDefinition
    )

    # A pipeline with no solids: only the context plumbing is under test.
    pipeline_def = PipelineDefinition(
        name='test_legacy_context',
        solid_defs=[],
        mode_defs=[ModeDefinition(logger_defs=logger_defs_by_name)],
    )

    injected_loggers = check.opt_list_param(
        run_config_loggers, 'run_config_loggers', of_type=logging.Logger
    )
    run_config = RunConfig(run_id, tags=tags, loggers=injected_loggers)

    # Each logger gets its own fresh, empty config dict.
    environment_dict = {'loggers': dict((name, {}) for name in logger_defs_by_name)}

    creation_data = create_context_creation_data(pipeline_def, environment_dict, run_config)
    log_manager = create_log_manager(creation_data)

    resources_builder = check.opt_inst_param(
        scoped_resources_builder,
        'scoped_resources_builder',
        ScopedResourcesBuilder,
        default=ScopedResourcesBuilder(),
    )

    storage_data = SystemStorageData(
        run_storage=InMemoryRunStorage(),
        intermediates_manager=InMemoryIntermediatesManager(),
        file_manager=LocalFileManager.for_run_id(run_id),
    )

    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=resources_builder,
        system_storage_data=storage_data,
        log_manager=log_manager,
    )
def create_test_pipeline_execution_context(logger_defs=None):
    """Build a SystemPipelineExecutionContext for tests around an empty pipeline.

    Args:
        logger_defs: Optional dict mapping logger names (str) to
            LoggerDefinition instances; each is attached to the pipeline's
            single mode and configured with an empty dict.

    Returns:
        A ``SystemPipelineExecutionContext`` built on an ephemeral
        DagsterInstance, in-memory intermediate storage and a local file
        manager for the run, with ``raise_on_error=True``.
    """
    # Imported inside the function, as in the original block.
    from dagster.core.storage.intermediate_storage import build_in_mem_intermediates_storage

    logger_defs_by_name = check.opt_dict_param(
        logger_defs, "logger_defs", key_type=str, value_type=LoggerDefinition
    )

    # A pipeline with no solids: only the context plumbing is under test.
    pipeline_def = PipelineDefinition(
        name="test_legacy_context",
        solid_defs=[],
        mode_defs=[ModeDefinition(logger_defs=logger_defs_by_name)],
    )

    # Each logger gets its own fresh, empty config dict.
    run_config = {"loggers": dict((name, {}) for name in logger_defs_by_name)}

    pipeline_run = PipelineRun(pipeline_name="test_legacy_context", run_config=run_config)
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline=pipeline_def, run_config=run_config)
    creation_data = create_context_creation_data(
        execution_plan, run_config, pipeline_run, instance
    )

    log_manager = create_log_manager(creation_data)
    resources_builder = ScopedResourcesBuilder()
    executor = create_executor(creation_data)

    run_id = pipeline_run.run_id

    # NOTE(review): two distinct in-memory storages are built on purpose here
    # to mirror the existing behavior -- confirm whether they should be shared.
    context_storage = build_in_mem_intermediates_storage(run_id)
    storage_data = SystemStorageData(
        intermediate_storage=build_in_mem_intermediates_storage(run_id),
        file_manager=LocalFileManager.for_instance(instance, run_id),
    )

    context_data = construct_execution_context_data(
        context_creation_data=creation_data,
        scoped_resources_builder=resources_builder,
        intermediate_storage=context_storage,
        system_storage_data=storage_data,
        log_manager=log_manager,
        retries=executor.retries,
        raise_on_error=True,
    )
    return SystemPipelineExecutionContext(
        context_data, executor=executor, log_manager=log_manager
    )
def adls2_system_storage(init_context):
    '''Persistent system storage using Azure Data Lake Storage Gen2 for storage.

    Suitable for intermediates storage for distributed executors, so long as
    each execution node has network connectivity and credentials for ADLS and
    the backing container.

    Attach this system storage definition, as well as the
    :py:data:`~dagster_azure.adls2_resource` it requires, to a
    :py:class:`~dagster.ModeDefinition` in order to make it available to your
    pipeline:

    .. code-block:: python

        pipeline_def = PipelineDefinition(
            mode_defs=[
                ModeDefinition(
                    resource_defs={'adls2': adls2_resource, ...},
                    system_storage_defs=default_system_storage_defs + [adls2_system_storage, ...],
                    ...
                ), ...
            ], ...
        )

    You may configure this storage as follows:

    .. code-block:: YAML

        storage:
          adls2:
            config:
              adls2_sa: my-best-storage-account
              adls2_file_system: my-cool-file-system
              adls2_prefix: good/prefix-for-files-
    '''
    adls2 = init_context.resources.adls2
    storage_config = init_context.system_storage_config

    # Files handled by the file manager live under a per-run key.
    files_prefix = '{prefix}/storage/{run_id}/files'.format(
        prefix=storage_config['adls2_prefix'],
        run_id=init_context.pipeline_run.run_id,
    )

    file_manager = ADLS2FileManager(
        adls2_client=adls2.adls2_client,
        file_system=storage_config['adls2_file_system'],
        prefix=files_prefix,
    )

    # The intermediate store receives the raw prefix and the run id separately.
    intermediate_store = ADLS2IntermediateStore(
        file_system=storage_config['adls2_file_system'],
        run_id=init_context.pipeline_run.run_id,
        adls2_client=adls2.adls2_client,
        blob_client=adls2.blob_client,
        prefix=storage_config['adls2_prefix'],
        type_storage_plugin_registry=init_context.type_storage_plugin_registry,
    )

    return SystemStorageData(
        file_manager=file_manager,
        intermediates_manager=IntermediateStoreIntermediatesManager(intermediate_store),
    )
def s3_system_storage(init_context):
    '''Persistent system storage using S3 for storage.

    Suitable for intermediates storage for distributed executors, so long as
    each execution node has network connectivity and credentials for S3 and
    the backing bucket.

    Attach this system storage definition, as well as the
    :py:data:`~dagster_aws.s3_resource` it requires, to a
    :py:class:`~dagster.ModeDefinition` in order to make it available to your
    pipeline:

    .. code-block:: python

        pipeline_def = PipelineDefinition(
            mode_defs=[
                ModeDefinition(
                    resource_defs={'s3': s3_resource, ...},
                    system_storage_defs=default_system_storage_defs + [s3_system_storage, ...],
                    ...
                ), ...
            ], ...
        )

    You may configure this storage as follows:

    .. code-block:: YAML

        storage:
          s3:
            config:
              s3_bucket: my-cool-bucket
              s3_prefix: good/prefix-for-files-
    '''
    session = init_context.resources.s3.session
    storage_config = init_context.system_storage_config

    # Files handled by the file manager live under a per-run key.
    files_key = '{prefix}/storage/{run_id}/files'.format(
        prefix=storage_config['s3_prefix'],
        run_id=init_context.pipeline_run.run_id,
    )

    file_manager = S3FileManager(
        s3_session=session,
        s3_bucket=storage_config['s3_bucket'],
        s3_base_key=files_key,
    )

    # The intermediate store receives the raw prefix and the run id separately.
    intermediate_store = S3IntermediateStore(
        s3_session=session,
        s3_bucket=storage_config['s3_bucket'],
        s3_prefix=storage_config['s3_prefix'],
        run_id=init_context.pipeline_run.run_id,
        type_storage_plugin_registry=init_context.type_storage_plugin_registry,
    )

    return SystemStorageData(
        file_manager=file_manager,
        intermediates_manager=IntermediateStoreIntermediatesManager(intermediate_store),
    )
def s3_system_storage(init_context):
    '''Build S3-backed system storage for the current pipeline run.

    Reads the S3 session from ``init_context.resources.s3`` and the bucket name
    from ``init_context.system_storage_config['s3_bucket']``.

    Args:
        init_context: Provides ``resources``, ``system_storage_config``,
            ``pipeline_run`` and ``type_storage_plugin_registry``.

    Returns:
        SystemStorageData: Holds an ``S3FileManager`` rooted at
        ``dagster/storage/<run_id>/files`` and an intermediates manager
        wrapping an ``S3IntermediateStore`` for the same bucket and run.
    '''
    session = init_context.resources.s3.session

    # Files handled by the file manager live under a fixed, per-run key.
    files_key = 'dagster/storage/{run_id}/files'.format(
        run_id=init_context.pipeline_run.run_id
    )

    storage_config = init_context.system_storage_config

    file_manager = S3FileManager(
        s3_session=session,
        s3_bucket=storage_config['s3_bucket'],
        s3_base_key=files_key,
    )

    intermediate_store = S3IntermediateStore(
        s3_session=session,
        s3_bucket=storage_config['s3_bucket'],
        run_id=init_context.pipeline_run.run_id,
        type_storage_plugin_registry=init_context.type_storage_plugin_registry,
    )

    return SystemStorageData(
        file_manager=file_manager,
        intermediates_manager=IntermediateStoreIntermediatesManager(intermediate_store),
    )
def execute_on_dask(  # pylint: disable=too-many-locals
    handle, env_config=None, run_config=None, dask_config=None
):
    '''Execute the pipeline behind ``handle`` through ``DaskEngine``.

    Args:
        handle (ExecutionTargetHandle): Used to build the pipeline definition.
        env_config (Optional[dict]): Environment config keyed by str; it is
            validated and then forwarded unchanged.
        run_config (Optional[RunConfig]): Defaults to a ``RunConfig`` whose
            ``executor_config`` is ``dask_config``. A caller-supplied
            ``run_config`` must itself carry a ``DaskConfig`` executor config.
        dask_config (Optional[DaskConfig]): Defaults to ``DaskConfig()``.

    Returns:
        PipelineExecutionResult: Built from the events yielded by
        ``DaskEngine.execute`` and a callable that re-creates a pipeline
        context reusing the storage objects of the context used for execution.
    '''
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig, DaskConfig())

    fallback_run_config = RunConfig(executor_config=dask_config)
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig, fallback_run_config)

    check.inst(
        run_config.executor_config,
        DaskConfig,
        'run_config.executor_config should be instance of DaskConfig to execute on Dask',
    )

    pipeline = handle.build_pipeline_definition()
    plan = create_execution_plan(pipeline, env_config, run_config=run_config)

    with scoped_pipeline_context(pipeline, env_config, run_config) as pipeline_context:
        # Drain the engine's event stream while the context is still open.
        event_list = list(DaskEngine.execute(pipeline_context, plan, None))

        def _rebuild_context():
            # Reuse the storage objects from the context that ran the pipeline.
            return scoped_pipeline_context(
                pipeline,
                env_config,
                run_config,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    run_storage=pipeline_context.run_storage,
                    file_manager=pipeline_context.file_manager,
                ),
            )

        return PipelineExecutionResult(
            pipeline, run_config.run_id, event_list, _rebuild_context
        )
def gcs_system_storage(init_context):
    '''Build GCS-backed system storage for the current pipeline run.

    Reads the client from ``init_context.resources.gcs.client`` and the bucket
    name from ``init_context.system_storage_config['gcs_bucket']``.

    Args:
        init_context: Provides ``resources``, ``system_storage_config``,
            ``pipeline_run`` and ``type_storage_plugin_registry``.

    Returns:
        SystemStorageData: Holds a ``GCSFileManager`` rooted at
        ``dagster/storage/<run_id>/files`` and an intermediates manager
        wrapping a ``GCSIntermediateStore`` for the same bucket and run.
    '''
    gcs_client = init_context.resources.gcs.client

    # Files handled by the file manager live under a fixed, per-run key.
    files_key = 'dagster/storage/{run_id}/files'.format(
        run_id=init_context.pipeline_run.run_id
    )

    storage_config = init_context.system_storage_config

    file_manager = GCSFileManager(
        client=gcs_client,
        gcs_bucket=storage_config['gcs_bucket'],
        gcs_base_key=files_key,
    )

    intermediate_store = GCSIntermediateStore(
        client=gcs_client,
        gcs_bucket=storage_config['gcs_bucket'],
        run_id=init_context.pipeline_run.run_id,
        type_storage_plugin_registry=init_context.type_storage_plugin_registry,
    )

    return SystemStorageData(
        file_manager=file_manager,
        intermediates_manager=IntermediateStoreIntermediatesManager(intermediate_store),
    )
def gcs_system_storage(init_context):
    """Build GCS-backed system storage for the current pipeline run.

    Uses ``init_context.resources.gcs`` as the client, and reads
    ``gcs_bucket`` and ``gcs_prefix`` from
    ``init_context.system_storage_config``.

    Args:
        init_context: Provides ``resources``, ``system_storage_config``,
            ``pipeline_run`` and ``type_storage_plugin_registry``.

    Returns:
        SystemStorageData: Holds a ``GCSFileManager`` rooted at
        ``<gcs_prefix>/storage/<run_id>/files`` and a
        ``GCSIntermediateStorage`` for the same bucket, prefix and run.
    """
    gcs_client = init_context.resources.gcs
    storage_config = init_context.system_storage_config

    # Files handled by the file manager live under a per-run key.
    files_key = "{prefix}/storage/{run_id}/files".format(
        prefix=storage_config["gcs_prefix"],
        run_id=init_context.pipeline_run.run_id,
    )

    file_manager = GCSFileManager(
        client=gcs_client,
        gcs_bucket=storage_config["gcs_bucket"],
        gcs_base_key=files_key,
    )

    # The intermediate storage receives the raw prefix and the run id separately.
    intermediate_storage = GCSIntermediateStorage(
        client=gcs_client,
        gcs_bucket=storage_config["gcs_bucket"],
        gcs_prefix=storage_config["gcs_prefix"],
        run_id=init_context.pipeline_run.run_id,
        type_storage_plugin_registry=init_context.type_storage_plugin_registry,
    )

    return SystemStorageData(
        file_manager=file_manager,
        intermediate_storage=intermediate_storage,
    )
def create_test_pipeline_execution_context(
    logger_defs=None, scoped_resources_builder=None, tags=None
):
    '''Build a pipeline execution context for tests around an empty pipeline.

    Args:
        logger_defs: Optional dict mapping logger names (str) to
            LoggerDefinition instances; each is attached to the pipeline's
            single mode and configured with an empty dict.
        scoped_resources_builder: Optional ScopedResourcesBuilder. A fresh
            ``ScopedResourcesBuilder()`` is passed as the default.
        tags: Passed straight through to ``RunConfig`` as ``tags``.

    Returns:
        Whatever ``construct_pipeline_execution_context`` returns when given an
        ephemeral DagsterInstance, an in-memory intermediates manager, a local
        file manager for a freshly generated run id, and
        ``raise_on_error=True``.
    '''
    # Every call gets its own random run id.
    run_id = str(uuid.uuid4())

    logger_defs_by_name = check.opt_dict_param(
        logger_defs, 'logger_defs', key_type=str, value_type=LoggerDefinition
    )

    # A pipeline with no solids: only the context plumbing is under test.
    pipeline_def = PipelineDefinition(
        name='test_legacy_context',
        solid_defs=[],
        mode_defs=[ModeDefinition(logger_defs=logger_defs_by_name)],
    )

    run_config = RunConfig(run_id, tags=tags)

    # Each logger gets its own fresh, empty config dict.
    environment_dict = {'loggers': dict((name, {}) for name in logger_defs_by_name)}

    instance = DagsterInstance.ephemeral()
    creation_data = create_context_creation_data(
        pipeline_def, environment_dict, run_config, instance
    )
    log_manager = create_log_manager(creation_data)

    resources_builder = check.opt_inst_param(
        scoped_resources_builder,
        'scoped_resources_builder',
        ScopedResourcesBuilder,
        default=ScopedResourcesBuilder(),
    )
    executor_config = create_executor_config(creation_data)

    storage_data = SystemStorageData(
        intermediates_manager=InMemoryIntermediatesManager(),
        file_manager=LocalFileManager.for_instance(instance, run_id),
    )

    return construct_pipeline_execution_context(
        context_creation_data=creation_data,
        scoped_resources_builder=resources_builder,
        system_storage_data=storage_data,
        log_manager=log_manager,
        executor_config=executor_config,
        raise_on_error=True,
    )
def execute_on_dask(  # pylint: disable=too-many-locals
    handle, env_config=None, run_config=None, mode=None, dask_config=None
):
    '''Execute a pipeline on Dask by submitting one future per execution step.

    Args:
        handle (ExecutionTargetHandle): Used to build the pipeline definition;
            also forwarded to every Dask task.
        env_config (Optional[dict]): Environment config keyed by str. It must
            contain a non-empty ``'storage'`` entry.
        run_config (Optional[RunConfig]): Defaults to ``RunConfig()``.
        mode (Optional[str]): Defaults to the pipeline's default mode name.
        dask_config (Optional[DaskConfig]): Defaults to ``DaskConfig()``.

    Returns:
        PipelineExecutionResult: Built from the events gathered from all step
        futures and a callable that re-creates a pipeline context reusing the
        storage objects of the context used for execution.

    Raises:
        A ``check.invariant`` failure when no storage is configured, when the
        storage is incompatible with the Dask configuration, when loggers are
        injected through the run config, or when an event callback is set.
    '''
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig, DaskConfig())
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig, RunConfig())

    pipeline = handle.build_pipeline_definition()
    mode = check.opt_str_param(mode, 'mode', pipeline.get_default_mode_name())

    # Checks to ensure storage is compatible with Dask configuration.
    # A missing (or None) 'storage' entry is normalized to an empty dict so the
    # invariant below reports it, rather than an AttributeError on None.
    storage = env_config.get('storage') or {}
    check.invariant(storage, 'Must specify storage to use Dask execution')

    if dask_config.is_remote_execution:
        check.invariant(
            storage.get('s3'),
            'Must use S3 storage with non-local Dask address {dask_address}'.format(
                dask_address=dask_config.address
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Dask, use filesystem or S3',
        )

    execution_plan = create_execution_plan(pipeline, env_config, run_config=run_config)
    step_levels = execution_plan.topological_step_levels()

    query = build_graphql_query()

    with scoped_pipeline_context(pipeline, env_config, run_config) as pipeline_context:
        with dask.distributed.Client(**dask_config.build_dict(pipeline.name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    check.invariant(
                        not step_context.run_config.loggers,
                        'Cannot inject loggers via RunConfig with the Dask executor',
                    )
                    check.invariant(
                        not step_context.event_callback,
                        'Cannot use event_callback with Dask executor',
                    )

                    # We ensure correctness in sequencing by letting Dask schedule futures
                    # and awaiting dependencies within each step.
                    dependencies = [
                        execution_futures_dict[step_input.prev_output_handle.step_key]
                        for step_input in step.step_inputs
                    ]

                    variables = {
                        'executionParams': {
                            'selector': {'name': pipeline.name},
                            'environmentConfigData': env_config,
                            'mode': mode,
                            'executionMetadata': {'runId': run_config.run_id},
                            'stepKeys': [step.key],
                        }
                    }

                    future = client.submit(
                        query_on_dask_worker, handle, query, variables, dependencies
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results to
            # the master.
            execution_step_events = client.gather(execution_futures)

            # execution_step_events is now a list of lists, the inner lists contain the
            # dagster events emitted by each step.
            event_list = list(itertools.chain.from_iterable(execution_step_events))

            return PipelineExecutionResult(
                pipeline,
                run_config.run_id,
                event_list,
                # Reuse the storage objects from the context that ran the pipeline.
                lambda: scoped_pipeline_context(
                    pipeline,
                    env_config,
                    run_config,
                    system_storage_data=SystemStorageData(
                        intermediates_manager=pipeline_context.intermediates_manager,
                        run_storage=pipeline_context.run_storage,
                        file_manager=pipeline_context.file_manager,
                    ),
                ),
            )