def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
    output_notebook=None,
    config_schema=None,
):
    '''Build a solid whose compute function is backed by a Jupyter notebook.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        config: Older spelling of ``config_schema``. Both values are handed to
            ``canonicalize_backcompat_args`` (tagged ``'0.9.0'``), which decides the schema
            that is actually used.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output
            of type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.Materialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline system storage, so, e.g., if
            :py:class:`~dagster_aws.s3.s3_system_storage` is configured, the output will be a
            :py:class:`~dagster_aws.s3.S3FileHandle`.
        config_schema: The config schema for the solid; see ``config`` above.

    Returns:
        :py:class:`~dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs, 'output_defs', of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )

    compute_fn = _dm_solid_compute(name, notebook_path, output_notebook)

    # The executed notebook is surfaced as one extra FileHandle output, but only on request.
    notebook_output_defs = []
    if output_notebook:
        notebook_output_defs.append(
            OutputDefinition(dagster_type=FileHandle, name=output_notebook)
        )
    all_output_defs = output_defs + notebook_output_defs

    resolved_config_schema = canonicalize_backcompat_args(
        check_user_facing_opt_config_param(config_schema, 'config_schema'),
        'config_schema',
        check_user_facing_opt_config_param(config, 'config'),
        'config',
        '0.9.0',
    )

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=compute_fn,
        output_defs=all_output_defs,
        config_schema=resolved_config_schema,
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(path=notebook_path),
        tags={'notebook_path': notebook_path, 'kind': 'ipynb'},
    )
def __new__(
    cls,
    pipeline_name=None,
    run_id=None,
    run_config=None,
    mode=None,
    solid_selection=None,
    solids_to_execute=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    pipeline_snapshot_id=None,
    execution_plan_snapshot_id=None,
    ## GRAVEYARD BELOW
    # see https://github.com/dagster-io/dagster/issues/2372 for explanation
    previous_run_id=None,
    selector=None,
    solid_subset=None,
    environment_dict=None,
):
    '''Validate the arguments and build a PipelineRun.

    The parameters below the "GRAVEYARD" marker are legacy spellings that are folded into
    their current counterparts before the instance is created:

    * ``environment_dict`` -> ``run_config`` (passing both is an error)
    * ``previous_run_id`` -> ``root_run_id`` and ``parent_run_id``
    * ``selector`` -> ``pipeline_name`` and ``solids_to_execute``
    * ``solid_subset`` -> ``solids_to_execute``
    '''
    # solids_to_execute is a frozenset holding the names of the solids to execute.
    check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
    # solid_selection is the list of solid queries supplied by the user; it can be None
    # when the user set solids_to_execute directly.
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)
    check.opt_str_param(root_run_id, 'root_run_id')
    check.opt_str_param(parent_run_id, 'parent_run_id')

    # A run either belongs to a run group (both ids set) or it does not (neither set).
    check.invariant(
        (root_run_id is None) == (parent_run_id is None),
        (
            'Must set both root_run_id and parent_run_id when creating a PipelineRun that '
            'belongs to a run group'
        ),
    )

    # Compatibility
    # ----------------------------------------------------------------------------------------
    check.invariant(
        run_config is None or environment_dict is None,
        'Cannot set both run_config and environment_dict. Use run_config parameter.',
    )
    run_config = run_config if run_config else environment_dict

    # Historical runs may carry previous_run_id; unless both lineage ids are already
    # present, that id stands in for the root as well as the parent.
    if previous_run_id and not (parent_run_id and root_run_id):
        parent_run_id = previous_run_id
        root_run_id = previous_run_id

    check.opt_inst_param(selector, 'selector', ExecutionSelector)
    if selector:
        name_conflict_message = (
            'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
            'selector was passed with pipeline {selector_pipeline}'
        ).format(pipeline_name=pipeline_name, selector_pipeline=selector.name)
        check.invariant(
            pipeline_name is None or selector.name == pipeline_name,
            name_conflict_message,
        )
        if pipeline_name is None:
            pipeline_name = selector.name

        subset_conflict_message = (
            'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
            'selector was passed with subset {selector_subset}'
        ).format(solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset)
        check.invariant(
            solids_to_execute is None or set(selector.solid_subset) == solids_to_execute,
            subset_conflict_message,
        )
        # Old runs may only have a selector and no solids_to_execute.
        if solids_to_execute is None:
            if selector.solid_subset:
                solids_to_execute = frozenset(selector.solid_subset)
            else:
                solids_to_execute = None

    # Old runs may have specified a list-type solid_subset.
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    if solid_subset:
        solids_to_execute = frozenset(solid_subset)
    # ----------------------------------------------------------------------------------------

    # The dict literal is evaluated top to bottom, so the checks run in field order.
    fields = {
        'pipeline_name': check.opt_str_param(pipeline_name, 'pipeline_name'),
        'run_id': check.opt_str_param(run_id, 'run_id', default=make_new_run_id()),
        'run_config': check.opt_dict_param(run_config, 'run_config', key_type=str),
        'mode': check.opt_str_param(mode, 'mode'),
        'solid_selection': solid_selection,
        'solids_to_execute': solids_to_execute,
        'step_keys_to_execute': step_keys_to_execute,
        'status': check.opt_inst_param(
            status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED
        ),
        'tags': check.opt_dict_param(tags, 'tags', key_type=str),
        'root_run_id': root_run_id,
        'parent_run_id': parent_run_id,
        'pipeline_snapshot_id': check.opt_str_param(
            pipeline_snapshot_id, 'pipeline_snapshot_id'
        ),
        'execution_plan_snapshot_id': check.opt_str_param(
            execution_plan_snapshot_id, 'execution_plan_snapshot_id'
        ),
    }
    return super(PipelineRun, cls).__new__(cls, **fields)
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    '''Run the steps of ``execution_plan`` in the current process, yielding DagsterEvents.

    Steps are visited level by level in the order given by
    ``execution_plan.topological_step_levels()``. A step is skipped (and a step-skipped
    event is yielded) when one of its dependencies already failed or was skipped, or when
    the intermediates manager reports inputs that are not covered. One engine event is
    yielded before the first step and one after the last.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Context for the pipeline run.
        execution_plan (ExecutionPlan): The plan whose steps are executed.
        step_keys_to_execute (Optional[List[str]]): When non-empty, only steps whose key is
            in this list are considered.

    Yields:
        DagsterEvent: Engine, step-skipped and per-step events.
    '''
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    if step_keys_to_execute is None:
        step_key_set = None
    else:
        step_key_set = set(step_keys_to_execute)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        # Keys of every step that failed or was skipped so far.
        unrunnable_keys = set()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for level in execution_plan.topological_step_levels():
            for step in level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                # Upstream keys of this step that already failed or were skipped.
                blocked_by = [
                    upstream_key
                    for step_input in step.step_inputs
                    for upstream_key in unrunnable_keys.intersection(step_input.dependency_keys)
                ]
                if blocked_by:
                    step_context.log.info(
                        'Dependencies for step {step} failed: {failed_inputs}. Not executing.'.format(
                            step=step.key, failed_inputs=blocked_by
                        )
                    )
                    unrunnable_keys.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                missing_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                    step_context, step
                )
                if missing_inputs:
                    # In partial pipeline execution, we may end up here without having
                    # validated the missing dependent outputs were optional
                    _assert_missing_inputs_optional(missing_inputs, execution_plan, step.key)

                    step_context.log.info(
                        (
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=missing_inputs, step=step.key)
                    )
                    unrunnable_keys.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                step_events = check.generator(dagster_event_sequence_for_step(step_context))
                for step_event in step_events:
                    check.inst(step_event, DagsterEvent)
                    if step_event.is_step_failure:
                        unrunnable_keys.add(step.key)
                    yield step_event

    # Emitted after the timing scope has closed, since it reports the measured duration.
    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
def __init__(
    self,
    solid_defs: Optional[List[NodeDefinition]] = None,
    name: Optional[str] = None,
    description: Optional[str] = None,
    dependencies: Optional[Dict[Union[str, NodeInvocation], Dict[str, IDependencyDefinition]]] = None,
    mode_defs: Optional[List[ModeDefinition]] = None,
    preset_defs: Optional[List[PresetDefinition]] = None,
    tags: Optional[Dict[str, Any]] = None,
    hook_defs: Optional[AbstractSet[HookDefinition]] = None,
    solid_retry_policy: Optional[RetryPolicy] = None,
    graph_def=None,
    _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    version_strategy: Optional[VersionStrategy] = None,
):
    """Build a pipeline either from an existing graph or from a list of solid definitions.

    Args:
        solid_defs: Node definitions used to build a GraphDefinition when ``graph_def`` is
            not given. Required in that case.
        name: Pipeline name. Required when ``graph_def`` is not given; otherwise falls back
            to ``graph_def.name``.
        description: Description stored on the pipeline itself (the graph created in the
            legacy path gets ``description=None``).
        dependencies: Dependency structure forwarded to GraphDefinition in the legacy path.
        mode_defs: Modes of the pipeline. A single default ``ModeDefinition()`` is used when
            empty or omitted.
        preset_defs: Presets of the pipeline. Each must have a unique name and reference a
            mode defined in ``mode_defs``.
        tags: Tags, passed through ``validate_tags``.
        hook_defs: Hook definitions stored on the pipeline.
        solid_retry_policy: Optional RetryPolicy stored on the pipeline.
        graph_def: An existing GraphDefinition to use directly.
        _parent_pipeline_def: Optional PipelineDefinition stored as the parent of this one.
        version_strategy: Optional VersionStrategy; supplying one triggers an experimental
            class warning.

    Raises:
        DagsterInvalidDefinitionError: On duplicate mode names, duplicate preset names, or a
            preset that references an undefined mode.
    """
    # If a graph is specified directly, use it
    if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):
        self._graph_def = graph_def
        self._name = name or graph_def.name

    # Otherwise fall back to legacy construction
    else:
        if name is None:
            check.failed("name must be provided")
        self._name = name

        if solid_defs is None:
            check.failed("solid_defs must be provided")

        self._graph_def = GraphDefinition(
            name=name,
            dependencies=dependencies,
            node_defs=solid_defs,
            input_mappings=None,
            output_mappings=None,
            config=None,
            description=None,
        )

    # tags and description can exist on graph as well, but since
    # same graph may be in multiple pipelines/jobs, keep separate layer
    self._description = check.opt_str_param(description, "description")
    self._tags = validate_tags(tags)

    self._current_level_node_defs = self._graph_def.node_defs

    mode_definitions = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)

    if not mode_definitions:
        mode_definitions = [ModeDefinition()]

    self._mode_definitions = mode_definitions

    # Mode names must be unique; the collected set is reused below to validate presets.
    seen_modes = set()
    for mode_def in mode_definitions:
        if mode_def.name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names."
                ).format(mode_name=mode_def.name, pipeline_name=self.name)
            )
        seen_modes.add(mode_def.name)

    self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)
    self._solid_retry_policy = check.opt_inst_param(
        solid_retry_policy, "solid_retry_policy", RetryPolicy
    )

    self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
    self._preset_dict: Dict[str, PresetDefinition] = {}
    for preset in self._preset_defs:
        if preset.name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names."
                ).format(name=preset.name, pipeline_name=self.name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset.name, pipeline_name=self.name, mode=preset.mode)
            )
        self._preset_dict[preset.name] = preset

    # Resource requirements are checked per mode and stored under the mode's name.
    self._resource_requirements = {
        mode_def.name: _checked_resource_reqs_for_mode(
            mode_def,
            self._current_level_node_defs,
            self._graph_def._dagster_type_dict,
            self._graph_def._node_dict,
            self._hook_defs,
            self._graph_def._dependency_structure,
        )
        for mode_def in self._mode_definitions
    }

    # Recursively explore all nodes in this pipeline
    self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)
    self._parent_pipeline_def = check.opt_inst_param(
        _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
    )
    self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
    self._cached_external_pipeline = None

    self.version_strategy = check.opt_inst_param(
        version_strategy, "version_strategy", VersionStrategy
    )

    if self.version_strategy is not None:
        experimental_class_warning("VersionStrategy")
def define_python_dagster_type(
    python_type,
    name=None,
    description=None,
    input_hydration_config=None,
    output_materialization_config=None,
    serialization_strategy=None,
    auto_plugins=None,
    type_check=None,
):
    '''Core machinery for defining a Dagster type corresponding to an existing python type.

    Users should generally use the :py:func:`@dagster_type` decorator or
    :py:func:`as_dagster_type`, both of which defer to this function.

    Args:
        python_type (cls): The python type to wrap as a Dagster type.
        name (Optional[str]): Name of the new Dagster type. If ``None``, the name
            (``__name__``) of the ``python_type`` will be used.
        description (Optional[str]): A user-readable description of the type.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class
            constructed using the :py:func:`@input_hydration_config <dagster.InputHydrationConfig>`
            decorator that can map config data to a value of this type.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a
            class constructed using the
            :py:func:`@output_materialization_config <dagster.output_materialization_config>`
            decorator that can persist values of this type.
        serialization_strategy (Optional[SerializationStrategy]): An instance of a class that
            inherits from :py:class:`SerializationStrategy`. The default strategy for
            serializing this value when automatically persisting it between execution steps.
            You should set this value if the ordinary serialization machinery (e.g., pickle)
            will not be adequate for this type. If ``None``, a
            ``PickleSerializationStrategy`` instance is used.
        auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized
            differently depending on the storage being used for intermediates, they should
            specify this argument. In these cases the serialization_strategy argument is not
            sufficient because serialization requires specialized API calls, e.g. to call an
            S3 API directly instead of using a generic file object. See
            ``dagster_pyspark.DataFrame`` for an example.
        type_check (Optional[Callable[[Any], Union[bool, TypeCheck]]]): If specified, this
            function will be called in place of the default isinstance type check. This
            function should return ``True`` if the type check succeeds, ``False`` if it fails,
            or, if additional metadata should be emitted along with the type check success or
            failure, an instance of :py:class:`TypeCheck` with the ``success`` field set
            appropriately.

    Returns:
        type: A new ``PythonObjectType`` subclass whose no-argument constructor is bound to
        the values given here.
    '''
    check.type_param(python_type, 'python_type')
    # Keep the value returned by the check so that the documented default actually applies;
    # the inner class below closes over this (rebound) name.
    name = check.opt_str_param(name, 'name', python_type.__name__)
    check.opt_str_param(description, 'description')
    check.opt_inst_param(input_hydration_config, 'input_hydration_config', InputHydrationConfig)
    check.opt_inst_param(
        output_materialization_config,
        'output_materialization_config',
        OutputMaterializationConfig,
    )
    # Same as for ``name``: without the assignment the pickle default was computed and dropped.
    serialization_strategy = check.opt_inst_param(
        serialization_strategy,
        'serialization_strategy',
        SerializationStrategy,
        default=PickleSerializationStrategy(),
    )

    auto_plugins = check.opt_list_param(auto_plugins, 'auto_plugins', of_type=type)
    # Plugins are passed as classes, and every one must derive from TypeStoragePlugin.
    check.param_invariant(
        all(issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins),
        'auto_plugins',
    )

    check.opt_callable_param(type_check, 'type_check')

    class _ObjectType(PythonObjectType):
        def __init__(self):
            super(_ObjectType, self).__init__(
                python_type=python_type,
                name=name,
                description=description,
                input_hydration_config=input_hydration_config,
                output_materialization_config=output_materialization_config,
                serialization_strategy=serialization_strategy,
                auto_plugins=auto_plugins,
                type_check=type_check,
            )

    return _ObjectType
def reexecute_pipeline(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: DagsterInstance = None,
    raise_on_error: bool = True,
) -> PipelineExecutionResult:
    """Reexecute an existing pipeline run.

    Users will typically call this API when testing pipeline reexecution, or running standalone
    scripts.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist in the
            instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including single
            step keys) to execute. For example:

            - ['some_solid']: select the execution step "some_solid" itself.
            - ['*some_solid']: select the step "some_solid" and all its ancestors
              (upstream dependencies).
            - ['*some_solid+++']: select the step "some_solid", all its ancestors,
              and its descendants (downstream dependencies) within 3 levels down.
            - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select
              "some_solid" and all its ancestors, "other_solid_a" itself, and
              "other_solid_b" and its direct child execution steps.

        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in test.

    Returns:
        :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.
    """
    check.opt_list_param(step_selection, "step_selection", of_type=str)
    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        # Only the first four normalized values are needed here.
        pipeline, run_config, mode, tags, _solids, _selection = _check_execute_pipeline_args(
            pipeline=pipeline,
            run_config=run_config,
            mode=mode,
            preset=preset,
            tags=tags,
        )

        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        missing_parent_message = "No parent run with id {parent_run_id} found in instance.".format(
            parent_run_id=parent_run_id
        )
        check.invariant(parent_pipeline_run, missing_parent_message)

        step_keys_to_execute = None
        if step_selection:
            # The selection queries are resolved against a plan built from the parent
            # run's run_config.
            full_plan = create_execution_plan(pipeline, parent_pipeline_run.run_config, mode)
            step_keys = parse_items_from_selection(step_selection)
            # resolve execution plan with any resolved dynamic step keys
            resolved_plan = full_plan.build_subset_plan(step_keys)
            # parse selection using all step deps
            step_keys_to_execute = parse_step_selection(
                resolved_plan.get_all_step_deps(), step_selection
            )

        # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
        if step_keys_to_execute:
            selected_step_keys = list(step_keys_to_execute)
        else:
            selected_step_keys = None

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            run_config=run_config,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            step_keys_to_execute=selected_step_keys,
            root_run_id=parent_pipeline_run.root_run_id or parent_pipeline_run.run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        return execute_run(
            pipeline, pipeline_run, execute_instance, raise_on_error=raise_on_error
        )
def _check_execute_pipeline_args(
    pipeline: Union[PipelineDefinition, IPipeline],
    run_config: Optional[dict],
    mode: Optional[str],
    preset: Optional[str],
    tags: Optional[Dict[str, Any]],
    solid_selection: Optional[List[str]] = None,
) -> Tuple[
    IPipeline,
    Optional[dict],
    Optional[str],
    Dict[str, Any],
    FrozenSet[str],
    Optional[List[str]],
]:
    """Validate and normalize the arguments shared by the pipeline execution entry points.

    When ``preset`` is given, its run_config, solid_selection, mode and tags are applied,
    and each explicitly passed value is checked for agreement with the preset. A missing
    mode falls back to the pipeline's default mode, which is only allowed for pipelines
    that are not multi-mode. Pipeline-level tags are merged in last, and a non-empty
    solid selection subsets the pipeline.

    Returns:
        A tuple ``(pipeline, run_config, mode, tags, solids_to_execute, solid_selection)``.

    Raises:
        DagsterInvariantViolationError: If ``mode`` is not defined on the pipeline, or no
            mode was given for a multi-mode pipeline.
    """
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(preset, "preset")
    check.invariant(
        mode is None or preset is None,
        "You may set only one of `mode` (got {mode}) or `preset` (got {preset}).".format(
            mode=mode, preset=preset
        ),
    )

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        preset_run_config = pipeline_preset.run_config
        if preset_run_config is not None:
            check.invariant(
                (not run_config) or (preset_run_config == run_config),
                "The environment set in preset '{preset}' does not agree with the environment "
                "passed in the `run_config` argument.".format(preset=preset),
            )
            run_config = preset_run_config

        # load solid_selection from preset
        preset_selection = pipeline_preset.solid_selection
        if preset_selection is not None:
            check.invariant(
                solid_selection is None or solid_selection == preset_selection,
                "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "
                "the `solid_selection` argument: {solid_selection}".format(
                    preset=preset,
                    preset_subset=preset_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = preset_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            "Mode {mode} does not agree with the mode set in preset '{preset}': "
            "('{preset_mode}')".format(
                preset=preset, preset_mode=pipeline_preset.mode, mode=mode
            ),
        )
        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is None:
        # Without an explicit mode only a pipeline that is not multi-mode can run.
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError(
                (
                    "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "
                    "attempted to execute it without specifying a mode. Set "
                    "mode property on the PipelineRun object."
                ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)
            )
        mode = pipeline_def.get_default_mode_name()
    elif not pipeline_def.has_mode_definition(mode):
        raise DagsterInvariantViolationError(
            (
                "You have attempted to execute pipeline {name} with mode {mode}. "
                "Available modes: {modes}"
            ).format(
                name=pipeline_def.name,
                mode=mode,
                modes=pipeline_def.available_modes,
            )
        )

    tags = merge_dicts(pipeline_def.tags, tags)

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
def get_step_input_source(plan_builder, solid, input_name, input_def,
                          dependency_structure, handle, parent_step_inputs):
    """Decide where the value for one solid input comes from.

    The candidates are tried in this order, and the first match is returned:

    1. a root input manager (the input has a ``root_manager_key`` and no dependencies);
    2. a single upstream output (unresolved, pending dynamic, or plain);
    3. multiple upstream outputs (fan-in), possibly mixed with mapped parent inputs;
    4. the ``inputs`` section of the solid's config;
    5. an input of the containing solid that is mapped onto this one;
    6. the input definition's default value;
    7. ``None`` for inputs of the Nothing type.

    Raises:
        DagsterInvariantViolationError: If none of the above applies.
    """
    check.inst_param(plan_builder, "plan_builder", _PlanBuilder)
    check.inst_param(solid, "solid", Solid)
    check.str_param(input_name, "input_name")
    check.inst_param(input_def, "input_def", InputDefinition)
    check.inst_param(dependency_structure, "dependency_structure", DependencyStructure)
    check.opt_inst_param(handle, "handle", SolidHandle)
    check.opt_list_param(parent_step_inputs, "parent_step_inputs", of_type=StepInput)

    def _parent_inputs_by_name():
        # Index the containing solid's step inputs by name.
        return {parent_input.name: parent_input for parent_input in parent_step_inputs}

    input_handle = solid.input_handle(input_name)
    solid_config = plan_builder.environment_config.solids.get(str(handle))
    if solid_config:
        input_config = solid_config.inputs.get(input_name)
    else:
        input_config = None

    # From here on the definition looked up on the solid is used, not the argument.
    input_def = solid.definition.input_def_named(input_name)

    if input_def.root_manager_key and not dependency_structure.has_deps(input_handle):
        return FromRootInputManager(input_def=input_def, config_data=input_config)

    if dependency_structure.has_singular_dep(input_handle):
        upstream_output_handle = dependency_structure.get_singular_dep(input_handle)
        upstream_step_output = plan_builder.get_output_handle(upstream_output_handle)

        if isinstance(upstream_step_output, UnresolvedStepOutputHandle):
            return FromUnresolvedStepOutput(
                unresolved_step_output_handle=upstream_step_output,
                input_def=input_def,
                config_data=input_config,
            )

        if upstream_output_handle.output_def.is_dynamic:
            return FromPendingDynamicStepOutput(
                step_output_handle=upstream_step_output,
                input_def=input_def,
                config_data=input_config,
            )

        return FromStepOutput(
            step_output_handle=upstream_step_output,
            input_def=input_def,
            config_data=input_config,
            fan_in=False,
        )

    if dependency_structure.has_multi_deps(input_handle):
        fan_in_sources = []
        multi_deps = dependency_structure.get_multi_deps(input_handle)
        for position, dep in enumerate(multi_deps):
            if dep is MappedInputPlaceholder:
                # This fan-in slot is fed by an input of the containing solid, so the
                # source already chosen for that parent input is reused.
                parent_name = solid.container_mapped_fan_in_input(
                    input_name, position
                ).definition.name
                fan_in_sources.append(_parent_inputs_by_name()[parent_name].source)
            else:
                fan_in_sources.append(
                    FromStepOutput(
                        step_output_handle=plan_builder.get_output_handle(dep),
                        input_def=input_def,
                        config_data=input_config,
                        fan_in=True,
                    )
                )
        return FromMultipleSources(fan_in_sources)

    if solid_config and input_name in solid_config.inputs:
        return FromConfig(
            solid_config.inputs[input_name],
            dagster_type=input_def.dagster_type,
            input_name=input_name,
        )

    if solid.container_maps_input(input_name):
        parent_name = solid.container_mapped_input(input_name).definition.name
        parent_inputs = _parent_inputs_by_name()
        if parent_name in parent_inputs:
            return parent_inputs[parent_name].source
        # else fall through to Nothing case or raise

    if solid.definition.input_has_default(input_name):
        return FromDefaultValue(solid.definition.default_value_for_input(input_name))

    # At this point we have an input that is not hooked up to
    # the output of another solid or provided via environment config.

    # We will allow this for "Nothing" type inputs and continue.
    if input_def.dagster_type.kind == DagsterTypeKind.NOTHING:
        return None

    # Otherwise we throw an error.
    raise DagsterInvariantViolationError(
        (
            "In pipeline {pipeline_name} solid {solid_name}, input {input_name} "
            "must get a value either (a) from a dependency or (b) from the "
            "inputs section of its configuration."
        ).format(
            pipeline_name=plan_builder.pipeline_name,
            solid_name=solid.name,
            input_name=input_name,
        )
    )
def __new__(
    cls,
    pipeline_name=None,
    run_id=None,
    environment_dict=None,
    mode=None,
    solid_subset=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    pipeline_snapshot_id=None,
    execution_plan_snapshot_id=None,
    ## GRAVEYARD BELOW
    # see https://github.com/dagster-io/dagster/issues/2372 for explanation
    previous_run_id=None,
    selector=None,
):
    '''Validate the arguments and build a PipelineRun.

    ``previous_run_id`` and ``selector`` are legacy parameters: the former fills in
    ``root_run_id`` and ``parent_run_id`` when they are not both set, and the latter
    supplies ``pipeline_name`` and ``solid_subset`` when those are missing.
    '''
    from dagster.core.definitions.pipeline import ExecutionSelector

    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)
    check.opt_str_param(root_run_id, 'root_run_id')
    check.opt_str_param(parent_run_id, 'parent_run_id')

    # A run either belongs to a run group (both ids set) or it does not (neither set).
    check.invariant(
        (root_run_id is None) == (parent_run_id is None),
        (
            'Must set both root_run_id and parent_run_id when creating a PipelineRun that '
            'belongs to a run group'
        ),
    )

    # Compatibility
    # ----------------------------------------------------------------------------------------
    # Historical runs may carry previous_run_id; unless both lineage ids are already
    # present, that id stands in for the root as well as the parent.
    if previous_run_id and not (parent_run_id and root_run_id):
        parent_run_id = previous_run_id
        root_run_id = previous_run_id

    check.opt_inst_param(selector, 'selector', ExecutionSelector)
    if selector:
        name_conflict_message = (
            'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
            'selector was passed with pipeline {selector_pipeline}'
        ).format(pipeline_name=pipeline_name, selector_pipeline=selector.name)
        check.invariant(
            pipeline_name is None or selector.name == pipeline_name,
            name_conflict_message,
        )
        if pipeline_name is None:
            pipeline_name = selector.name

        subset_conflict_message = (
            'Conflicting solid_subset {solid_subset} in arguments to PipelineRun: '
            'selector was passed with subset {selector_subset}'
        ).format(solid_subset=solid_subset, selector_subset=selector.solid_subset)
        check.invariant(
            solid_subset is None or selector.solid_subset == solid_subset,
            subset_conflict_message,
        )
        if solid_subset is None:
            solid_subset = selector.solid_subset
    # ----------------------------------------------------------------------------------------

    # Validate the remaining fields in declaration order before building the instance.
    checked_pipeline_name = check.opt_str_param(pipeline_name, 'pipeline_name')
    checked_run_id = check.opt_str_param(run_id, 'run_id', default=make_new_run_id())
    checked_environment_dict = check.opt_dict_param(
        environment_dict, 'environment_dict', key_type=str
    )
    checked_mode = check.opt_str_param(mode, 'mode')
    checked_status = check.opt_inst_param(
        status, 'status', PipelineRunStatus, PipelineRunStatus.NOT_STARTED
    )
    checked_tags = check.opt_dict_param(tags, 'tags', key_type=str)
    checked_pipeline_snapshot_id = check.opt_str_param(
        pipeline_snapshot_id, 'pipeline_snapshot_id'
    )
    checked_plan_snapshot_id = check.opt_str_param(
        execution_plan_snapshot_id, 'execution_plan_snapshot_id'
    )

    return super(PipelineRun, cls).__new__(
        cls,
        pipeline_name=checked_pipeline_name,
        run_id=checked_run_id,
        environment_dict=checked_environment_dict,
        mode=checked_mode,
        solid_subset=solid_subset,
        step_keys_to_execute=step_keys_to_execute,
        status=checked_status,
        tags=checked_tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        pipeline_snapshot_id=checked_pipeline_snapshot_id,
        execution_plan_snapshot_id=checked_plan_snapshot_id,
    )
def from_pkg_resources(name, pkg_resource_defs=None, solid_selection=None, mode=None, tags=None):
    '''Build a preset from YAML shipped as package resources.

    Each resource is read with :py:func:`pkg_resources.resource_string`.

    Example:

    .. code-block:: python

        PresetDefinition.from_pkg_resources(
            name='local',
            mode='local',
            pkg_resource_defs=[
                ('dagster_examples.airline_demo.environments', 'local_base.yaml'),
                ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),
            ],
        )

    Args:
        name (str): The name of this preset. Must be unique in the presets defined on a given
            pipeline.
        pkg_resource_defs (Optional[List[(str, str)]]): List of pkg_resource modules/files to
            load as environment config for this preset.
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute with this preset. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The mode to apply when executing this preset. (default:
            'default')
        tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.

    Returns:
        PresetDefinition: A PresetDefinition constructed from the provided YAML strings

    Raises:
        DagsterInvariantViolationError: When a resource cannot be located or decoded.
    '''
    pkg_resource_defs = check.opt_list_param(
        pkg_resource_defs, 'pkg_resource_defs', of_type=tuple
    )

    try:
        yaml_strings = []
        for pkg_resource_def in pkg_resource_defs:
            raw_resource = pkg_resources.resource_string(*pkg_resource_def)
            yaml_strings.append(six.ensure_str(raw_resource))
    except (ModuleNotFoundError, FileNotFoundError, UnicodeDecodeError) as err:
        # Re-raise as a Dagster error, keeping the original exception as the cause.
        six.raise_from(
            DagsterInvariantViolationError(
                'Encountered error attempting to parse yaml. Loading YAMLs from '
                'package resources {pkg_resource_defs} '
                'on preset "{name}".'.format(pkg_resource_defs=pkg_resource_defs, name=name)
            ),
            err,
        )

    return PresetDefinition.from_yaml_strings(name, yaml_strings, solid_selection, mode, tags)
def from_files(name,
               environment_files=None,
               config_files=None,
               solid_selection=None,
               mode=None,
               tags=None):
    '''Static constructor for presets from YAML files.

    Args:
        name (str): The name of this preset. Must be unique in the presets defined on a
            given pipeline.
        environment_files (Optional[List[str]]): Backcompat spelling of ``config_files``;
            reconciled with it through ``canonicalize_backcompat_args``.
        config_files (Optional[List[str]]): List of paths or glob patterns for yaml files
            to load and parse as the environment config for this preset.
        solid_selection (Optional[List[str]]): A list of solid subselection (including
            single solid names) to execute with the preset.
            e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The mode to apply when executing this preset.
            Falls back to ``DEFAULT_MODE_NAME``.
        tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.

    Returns:
        PresetDefinition: A PresetDefinition constructed from the merged YAML files.

    Raises:
        DagsterInvalidDefinitionError: When a path or glob pattern matches no file.
        DagsterInvariantViolationError: When merging the files raises a ``yaml.YAMLError``.
    '''
    check.str_param(name, 'name')

    config_files = canonicalize_backcompat_args(
        config_files, 'config_files', environment_files, 'environment_files', '0.9.0'
    )
    config_files = check.opt_list_param(config_files, 'config_files')
    solid_selection = check.opt_nullable_list_param(
        solid_selection, 'solid_selection', of_type=str
    )
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)

    # Expand every pattern to absolute, symlink-resolved paths, in pattern order.
    filenames = []
    for file_glob in config_files or []:
        matches = glob(file_glob)
        if not matches:
            raise DagsterInvalidDefinitionError(
                'File or glob pattern "{file_glob}" for "config_files" in preset '
                '"{name}" produced no results.'.format(name=name, file_glob=file_glob)
            )
        filenames.extend([os.path.realpath(match) for match in matches])

    try:
        merged = merge_yamls(filenames)
    except yaml.YAMLError as err:
        parse_error = DagsterInvariantViolationError(
            'Encountered error attempting to parse yaml. Parsing files {file_set} '
            'loaded by file/patterns {files} on preset "{name}".'.format(
                file_set=filenames, files=config_files, name=name
            )
        )
        six.raise_from(parse_error, err)

    return PresetDefinition(name, merged, solid_selection, mode, tags)
def create_run_for_pipeline(
    self,
    pipeline_def,
    execution_plan=None,
    run_id=None,
    run_config=None,
    mode=None,
    solids_to_execute=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    solid_selection=None,
):
    """Create a run for ``pipeline_def`` via ``self.create_run``.

    The pipeline is narrowed to ``solids_to_execute`` when given, a full execution
    plan is built unless one is passed in, and for memoized runs (as decided by
    ``_is_memoized_run(tags)``) the step keys are resolved with
    ``self.resolve_unmemoized_steps``. The snapshot stored with the run is taken from
    the plan subsetted to ``step_keys_to_execute`` when any are set.

    Raises:
        DagsterInvariantViolationError: If ``step_keys_to_execute`` is supplied for a
            memoized run.
    """
    from dagster.core.execution.api import create_execution_plan
    from dagster.core.execution.plan.plan import ExecutionPlan
    from dagster.core.snap import snapshot_from_execution_plan

    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # solids_to_execute (the frozenset successor of solid_subset) is what actually
    # drives subsetting here. solid_selection holds the raw user queries; it is not
    # resolved in this function and is only passed further down.
    check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if solids_to_execute:
        if not isinstance(pipeline_def, PipelineSubsetDefinition):
            # `create_run_for_pipeline` was called directly with a full definition
            pipeline_def = pipeline_def.get_pipeline_subset_def(
                solids_to_execute=solids_to_execute
            )
        else:
            # pipeline_def was already subsetted (created by IPipeline or ExternalPipeline)
            check.invariant(
                solids_to_execute == pipeline_def.solids_to_execute,
                "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                "that conflicts with solids_to_execute arg {solids_to_execute}".format(
                    pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),
                    solids_to_execute=str_format_list(solids_to_execute),
                ),
            )

    full_execution_plan = execution_plan
    if not full_execution_plan:
        full_execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )

    planned_key_count = len(full_execution_plan.step_keys_to_execute)
    check.invariant(planned_key_count == len(full_execution_plan.steps))

    if _is_memoized_run(tags):
        if step_keys_to_execute:
            raise DagsterInvariantViolationError(
                "step_keys_to_execute parameter cannot be used in conjunction with memoized "
                "pipeline runs."
            )
        step_keys_to_execute = self.resolve_unmemoized_steps(
            full_execution_plan,
            run_config=run_config,
            mode=mode,
        )

    # TODO: tighter integration with existing step_keys_to_execute functionality
    if step_keys_to_execute:
        subsetted_execution_plan = full_execution_plan.build_subset_plan(step_keys_to_execute)
    else:
        subsetted_execution_plan = full_execution_plan

    resolved_mode = check.opt_str_param(
        mode, "mode", default=pipeline_def.get_default_mode_name()
    )
    pipeline_snapshot = pipeline_def.get_pipeline_snapshot()
    plan_snapshot = snapshot_from_execution_plan(
        subsetted_execution_plan, pipeline_def.get_pipeline_snapshot_id()
    )
    parent_snapshot = pipeline_def.get_parent_pipeline_snapshot()

    return self.create_run(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        run_config=run_config,
        mode=resolved_mode,
        solid_selection=solid_selection,
        solids_to_execute=solids_to_execute,
        step_keys_to_execute=step_keys_to_execute,
        status=status,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        pipeline_snapshot=pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=parent_snapshot,
    )
def __new__(cls, partition_names=None):
    '''Create the record, validating ``partition_names`` as an optional list of str.'''
    checked_names = check.opt_list_param(partition_names, 'partition_names', str)
    return super(ExternalPartitionNamesData, cls).__new__(
        cls,
        partition_names=checked_names,
    )
def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":
    """Turn the decorated function ``fn`` into an ``OpDefinition``.

    Inputs come from ``self.ins`` when set, otherwise from ``self.input_defs``.
    Outputs come from ``self.out`` (resolved together with the properties inferred
    from ``fn``), otherwise from ``self.output_defs``, otherwise from inference alone.
    When the decorator was given no name, ``fn.__name__`` is stored on ``self.name``.

    Args:
        fn: The function being decorated.

    Returns:
        The constructed ``OpDefinition``, updated via ``update_wrapper`` from the
        decorated function.

    Raises:
        Whatever ``check.failed`` raises when both ``input_defs`` and ``ins``, or both
        ``output_defs`` and ``out``, were provided.
    """
    from ..op_definition import OpDefinition

    if self.input_defs is not None and self.ins is not None:
        check.failed("Values cannot be provided for both the 'input_defs' and 'ins' arguments")

    if self.output_defs is not None and self.out is not None:
        check.failed("Values cannot be provided for both the 'output_defs' and 'out' arguments")

    # Infer the output properties of fn once and reuse the result below instead of
    # re-inspecting the function for every consumer.
    inferred_out = infer_output_props(fn)

    if self.ins is not None:
        input_defs = [inp.to_definition(name) for name, inp in self.ins.items()]
    else:
        input_defs = check.opt_list_param(self.input_defs, "input_defs", of_type=InputDefinition)

    output_defs_from_out = _resolve_output_defs_from_outs(
        inferred_out=inferred_out, out=self.out
    )
    resolved_output_defs = (
        output_defs_from_out if output_defs_from_out is not None else self.output_defs
    )

    if not self.name:
        self.name = fn.__name__

    if resolved_output_defs is None:
        # Nothing declared explicitly: rely entirely on inference.
        resolved_output_defs = [OutputDefinition.create_from_inferred(inferred_out)]
    elif len(resolved_output_defs) == 1:
        # A single declared output is combined with the inferred properties.
        resolved_output_defs = [resolved_output_defs[0].combine_with_inferred(inferred_out)]

    compute_fn = (
        DecoratedSolidFunction(decorated_fn=fn)
        if self.decorator_takes_context
        else NoContextDecoratedSolidFunction(decorated_fn=fn)
    )

    resolved_input_defs = resolve_checked_solid_fn_inputs(
        decorator_name="@op",
        fn_name=self.name,
        compute_fn=compute_fn,
        explicit_input_defs=input_defs,
        exclude_nothing=True,
    )

    op_def = OpDefinition(
        name=self.name,
        input_defs=resolved_input_defs,
        output_defs=resolved_output_defs,
        compute_fn=compute_fn,
        config_schema=self.config_schema,
        description=self.description or format_docstring_for_description(fn),
        required_resource_keys=self.required_resource_keys,
        tags=self.tags,
        version=self.version,
        retry_policy=self.retry_policy,
    )
    update_wrapper(op_def, compute_fn.decorated_fn)
    return op_def
def __init__(self, name, solid_subset=None):
    '''Store the (required) name and the optional list of solid names.

    ``solid_subset`` stays ``None`` when not given; otherwise it is validated as a
    list of str.
    '''
    self.name = check.str_param(name, 'name')
    self.solid_subset = (
        None
        if solid_subset is None
        else check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    )
def __init__(
    self,
    solid_defs,
    name=None,
    description=None,
    dependencies=None,
    mode_defs=None,
    preset_defs=None,
):
    '''Validate the arguments and build the pipeline's internal lookup structures.

    Raises:
        DagsterInvalidDefinitionError: If two modes or two presets share a name, or a
            preset references a mode that is not defined on this pipeline.
    '''
    self.name = check.opt_str_param(name, 'name', '<<unnamed>>')
    self.description = check.opt_str_param(description, 'description')

    # With no modes supplied, the pipeline gets a single default ModeDefinition.
    modes = check.opt_list_param(mode_defs, 'mode_defs', of_type=ModeDefinition)
    if not modes:
        modes = [ModeDefinition()]
    self.mode_definitions = modes

    current_level_solid_defs = check.list_param(
        _check_solids_arg(self.name, solid_defs), 'solid_defs', of_type=ISolidDefinition
    )

    seen_modes = set()
    for mode_def in modes:
        mode_name = mode_def.name
        if mode_name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.'
                ).format(mode_name=mode_name, pipeline_name=self.name)
            )
        seen_modes.add(mode_name)

    self.dependencies = validate_dependency_dict(dependencies)

    structure_and_solids = create_execution_structure(
        current_level_solid_defs, self.dependencies, container_definition=None
    )
    self._dependency_structure, self._solid_dict = structure_and_solids

    self._runtime_type_dict = construct_runtime_type_dictionary(current_level_solid_defs)

    self._preset_dict = {}
    for preset in check.opt_list_param(preset_defs, 'preset_defs', PresetDefinition):
        preset_name = preset.name
        if preset_name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.'
                ).format(name=preset_name, pipeline_name=self.name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset_name, pipeline_name=self.name, mode=preset.mode)
            )
        self._preset_dict[preset_name] = preset

    # Validate solid resource dependencies
    _validate_resource_dependencies(self.mode_definitions, current_level_solid_defs)

    # Index every solid definition reachable from the top level by name.
    self._all_solid_defs = {}
    for top_level_def in current_level_solid_defs:
        self._all_solid_defs.update(
            (solid_def.name, solid_def) for solid_def in top_level_def.iterate_solid_defs()
        )
def _subset(recon_pipeline, solid_subset):
    '''Return ``recon_pipeline`` narrowed to ``solid_subset``.

    A missing or empty ``solid_subset`` returns the pipeline unchanged.
    '''
    check.inst_param(recon_pipeline, 'recon_pipeline', ReconstructablePipeline)
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)

    if not solid_subset:
        return recon_pipeline
    return recon_pipeline.subset_for_execution(solid_subset)
def bash_script_solid(bash_script_path, name='bash_script_solid', input_defs=None, **kwargs):
    '''Factory for a solid that runs the Bash script stored at ``bash_script_path``.

    Extra keyword arguments are forwarded to the underlying
    :func:`@solid <dagster.solid>` decorator. Overriding ``config`` or ``output_defs``
    is not supported; consider wrapping the solid with
    :func:`@composite_solid <dagster.composite_solid>` when you need different config
    fields.

    Examples:

    .. literalinclude:: ../../../../../python_modules/libraries/dagster-bash/dagster_bash_tests/example_bash_script_solid.py
       :language: python

    Args:
        bash_script_path (str): The script file to execute.
        name (str, optional): The name of this solid. Defaults to "bash_script_solid".
        input_defs (List[InputDefinition], optional): input definitions for the solid.
            Defaults to a single Nothing input named ``start``.

    Raises:
        TypeError: If ``output_defs`` or ``config`` is passed in ``kwargs``.
        Failure: Raised by the solid at execution time when the script returns a
            non-zero exit code.

    Returns:
        SolidDefinition: Returns the constructed solid definition.
    '''
    check.str_param(bash_script_path, 'bash_script_path')
    name = check.str_param(name, 'name')
    check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)

    if 'output_defs' in kwargs:
        raise TypeError('Overriding output_defs for bash solid is not supported.')

    if 'config' in kwargs:
        raise TypeError('Overriding config for bash solid is not supported.')

    # Pull description out of kwargs first so it is not passed to @solid twice.
    description = kwargs.pop('description', 'A solid to invoke a bash command.')
    resolved_input_defs = input_defs or [InputDefinition('start', Nothing)]
    result_output_defs = [OutputDefinition(str, 'result')]

    @solid(
        name=name,
        description=description,
        input_defs=resolved_input_defs,
        output_defs=result_output_defs,
        config=bash_solid_config(),
        **kwargs
    )
    def _bash_script_solid(context):
        output, return_code = execute_script_file(
            bash_script_path=bash_script_path, log=context.log, **context.solid_config
        )

        if not return_code:
            return output

        raise Failure(
            description='Bash command execution failed with output: {output}'.format(
                output=output
            )
        )

    return _bash_script_solid
def reexecute_pipeline_iterator(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: DagsterInstance = None,
) -> Iterator[DagsterEvent]:
    """Reexecute a pipeline iteratively.

    Unlike :py:func:`reexecute_pipeline`, which packages the outcome into a single
    result object, this function hands back the stream of events produced by the
    reexecution so the caller can process them as they arrive.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist
            in the instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this
            run, as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including
            single step keys) to execute. For example:

            - ['some_solid']: select the execution step "some_solid" itself.
            - ['*some_solid']: select the step "some_solid" and all its ancestors
              (upstream dependencies).
            - ['*some_solid+++']: select the step "some_solid", all its ancestors,
              and its descendants (downstream dependencies) within 3 levels down.
            - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select
              "some_solid" and all its ancestors, "other_solid_a" itself, and
              "other_solid_b" and its direct child execution steps.

        mode (Optional[str]): The name of the pipeline mode to use. You may not set both
            ``mode`` and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set
            both ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to
            pipeline logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is
            ``None``, an ephemeral instance will be used, and no artifacts will be
            persisted from the run.

    Returns:
        Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.
    """
    check.opt_list_param(step_selection, "step_selection", of_type=str)
    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        (pipeline, run_config, mode, tags, _, _) = _check_execute_pipeline_args(
            pipeline=pipeline,
            run_config=run_config,
            mode=mode,
            preset=preset,
            tags=tags,
            solid_selection=None,
        )

        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        missing_parent_msg = "No parent run with id {parent_run_id} found in instance.".format(
            parent_run_id=parent_run_id
        )
        check.invariant(parent_pipeline_run, missing_parent_msg)

        # Step selection DSL queries are resolved against the parent run's execution
        # plan snapshot; without a selection no step keys are pinned.
        step_keys_to_execute = None
        if step_selection:
            parent_execution_plan_snapshot = execute_instance.get_execution_plan_snapshot(
                parent_pipeline_run.execution_plan_snapshot_id
            )
            step_keys_to_execute = parse_step_selection(
                parent_execution_plan_snapshot.step_deps, step_selection
            )

        # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
        if step_keys_to_execute:
            step_key_list = list(step_keys_to_execute)
        else:
            step_key_list = None

        # The new run joins the parent's run group: reuse its root, or start one at it.
        root_run_id = parent_pipeline_run.root_run_id or parent_pipeline_run.run_id

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            run_config=run_config,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            step_keys_to_execute=step_key_list,
            root_run_id=root_run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        # NOTE(review): the iterator is returned from inside the `with` block. If
        # execute_run_iterator is lazy, an ephemeral instance would be released before
        # the caller consumes any event -- confirm against execute_run_iterator.
        return execute_run_iterator(pipeline, pipeline_run, execute_instance)
def __init__(
    self,
    type_check_fn: TypeCheckFn,
    key: t.Optional[str] = None,
    name: t.Optional[str] = None,
    is_builtin: bool = False,
    description: t.Optional[str] = None,
    loader: t.Optional[DagsterTypeLoader] = None,
    materializer: t.Optional[DagsterTypeMaterializer] = None,
    required_resource_keys: t.Set[str] = None,
    kind: DagsterTypeKind = DagsterTypeKind.REGULAR,
    typing_type: t.Any = None,
    metadata_entries: t.Optional[t.List[MetadataEntry]] = None,
    metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None,
):
    """Validate and store the pieces that make up a Dagster type.

    At least one of ``key`` and ``name`` is required. With only ``key``, the type has
    no name; with only ``name``, the name doubles as the key.
    """
    check.opt_str_param(key, "key")
    check.opt_str_param(name, "name")
    check.invariant(name is not None or key is not None, "Must set key or name")

    if name is None:
        key = check.not_none(
            key,
            "If name is not provided, must provide key.",
        )
        self.key = key
        self._name = None
    elif key is None:
        name = check.not_none(
            name,
            "If key is not provided, must provide name.",
        )
        self.key = name
        self._name = name
    else:
        check.invariant(key and name)
        self.key = key
        self._name = name

    self.description = check.opt_str_param(description, "description")
    self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
    self.materializer = check.opt_inst_param(
        materializer, "materializer", DagsterTypeMaterializer
    )
    self.required_resource_keys = check.opt_set_param(
        required_resource_keys,
        "required_resource_keys",
    )

    checked_type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
    self._type_check_fn = checked_type_check_fn
    _validate_type_check_fn(checked_type_check_fn, self._name)

    self.is_builtin = check.bool_param(is_builtin, "is_builtin")

    # display_name is read from the instance, so key/_name must already be set here.
    check.invariant(
        self.display_name is not None,
        "All types must have a valid display name, got None for key {}".format(key),
    )

    self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
    self.typing_type = typing_type

    checked_entries = check.opt_list_param(
        metadata_entries, "metadata_entries", of_type=MetadataEntry
    )
    checked_metadata = check.opt_dict_param(metadata, "metadata", key_type=str)
    self._metadata_entries = normalize_metadata(checked_metadata, checked_entries)
def __init__(
    self,
    solid_defs,
    name=None,
    description=None,
    dependencies=None,
    mode_defs=None,
    preset_defs=None,
    tags=None,
    _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
    _hook_defs=None,
):
    """Validate the arguments and build the pipeline's internal structures.

    Raises:
        DagsterInvalidDefinitionError: If two modes or two presets share a name, or a
            preset references a mode that is not defined on this pipeline.
    """
    self._name = check.opt_str_param(name, "name", "<<unnamed>>")
    self._description = check.opt_str_param(description, "description")

    # With no modes supplied, the pipeline gets a single default ModeDefinition.
    modes = check.opt_list_param(mode_defs, "mode_defs", of_type=ModeDefinition)
    if not modes:
        modes = [ModeDefinition()]
    self._mode_definitions = modes

    self._current_level_solid_defs = check.list_param(
        _check_solids_arg(self._name, solid_defs), "solid_defs", of_type=ISolidDefinition
    )
    self._tags = validate_tags(tags)

    seen_modes = set()
    for mode_def in modes:
        mode_name = mode_def.name
        if mode_name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names."
                ).format(mode_name=mode_name, pipeline_name=self._name)
            )
        seen_modes.add(mode_name)

    self._dependencies = validate_dependency_dict(dependencies)

    structure_and_solids = create_execution_structure(
        self._current_level_solid_defs, self._dependencies, container_definition=None
    )
    self._dependency_structure, self._solid_dict = structure_and_solids

    # eager toposort solids to detect cycles
    self.solids_in_topological_order = self._solids_in_topological_order()

    self._dagster_type_dict = construct_dagster_type_dictionary(
        self._current_level_solid_defs
    )

    self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
    self._preset_dict = {}
    for preset in self._preset_defs:
        preset_name = preset.name
        if preset_name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names."
                ).format(name=preset_name, pipeline_name=self._name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset_name, pipeline_name=self._name, mode=preset.mode)
            )
        self._preset_dict[preset_name] = preset

    # Validate solid resource dependencies
    _validate_resource_dependencies(
        self._mode_definitions, self._current_level_solid_defs, self._solid_dict
    )

    # Validate unsatisfied inputs can be materialized from config
    _validate_inputs(self._dependency_structure, self._solid_dict)

    self._all_solid_defs = _build_all_solid_defs(self._current_level_solid_defs)
    self._parent_pipeline_def = check.opt_inst_param(
        _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
    )

    # Caches start out empty.
    self._cached_run_config_schemas = {}
    self._cached_external_pipeline = None

    self._hook_defs = check.opt_set_param(_hook_defs, "_hook_defs", of_type=HookDefinition)
def create_run_for_pipeline(
    self,
    pipeline_def,
    execution_plan=None,
    run_id=None,
    run_config=None,
    mode=None,
    solids_to_execute=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    solid_selection=None,
):
    """Create a run for ``pipeline_def`` via ``self.create_run``.

    The pipeline is narrowed to ``solids_to_execute`` when given, and an execution
    plan is created (honouring ``step_keys_to_execute``) unless one is passed in.
    Snapshots of the pipeline, its parent and the plan are stored with the run.
    """
    from dagster.core.execution.api import create_execution_plan
    from dagster.core.execution.plan.plan import ExecutionPlan
    from dagster.core.snap import snapshot_from_execution_plan

    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

    # solids_to_execute (the frozenset successor of solid_subset) is what actually
    # drives subsetting here. solid_selection holds the raw user queries; it is not
    # resolved in this function and is only passed further down.
    check.opt_set_param(solids_to_execute, "solids_to_execute", of_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if solids_to_execute:
        if not isinstance(pipeline_def, PipelineSubsetDefinition):
            # `create_run_for_pipeline` was called directly with a full definition
            pipeline_def = pipeline_def.get_pipeline_subset_def(
                solids_to_execute=solids_to_execute
            )
        else:
            # pipeline_def was already subsetted (created by ExecutablePipeline or
            # ExternalPipeline)
            check.invariant(
                solids_to_execute == pipeline_def.solids_to_execute,
                "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                "that conflicts with solids_to_execute arg {solids_to_execute}".format(
                    pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),
                    solids_to_execute=str_format_list(solids_to_execute),
                ),
            )

    if execution_plan is None:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
            step_keys_to_execute=step_keys_to_execute,
        )

    resolved_mode = check.opt_str_param(
        mode, "mode", default=pipeline_def.get_default_mode_name()
    )
    pipeline_snapshot = pipeline_def.get_pipeline_snapshot()
    plan_snapshot = snapshot_from_execution_plan(
        execution_plan, pipeline_def.get_pipeline_snapshot_id()
    )
    parent_snapshot = pipeline_def.get_parent_pipeline_snapshot()

    return self.create_run(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        run_config=run_config,
        mode=resolved_mode,
        solid_selection=solid_selection,
        solids_to_execute=solids_to_execute,
        step_keys_to_execute=step_keys_to_execute,
        status=status,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        pipeline_snapshot=pipeline_snapshot,
        execution_plan_snapshot=plan_snapshot,
        parent_pipeline_snapshot=parent_snapshot,
    )
def _checked_input_resource_reqs_for_mode(
    dependency_structure: DependencyStructure,
    node_dict: Dict[str, Node],
    mode_def: ModeDefinition,
    outer_dependency_structures: Optional[List[DependencyStructure]] = None,
    outer_solids: Optional[List[Node]] = None,
) -> Set[str]:
    """Validate how every input in ``node_dict`` is loaded and collect resource keys.

    Walks each node's input handles (recursing into graph nodes first) and, per input:

    * if it has upstream outputs -- in this dependency structure or, through input
      mappings, in an enclosing one -- checks that each upstream output's IO manager
      resource in ``mode_def`` is an ``IInputManagerDefinition``;
    * otherwise treats it as unconnected, requiring a dagster type loader, a Nothing
      type or a root manager key, and records the root manager key when present.

    Args:
        dependency_structure: Dependency structure of the current graph level.
        node_dict: Nodes of the current graph level, keyed by str.
        mode_def: Mode whose ``resource_defs`` are consulted.
        outer_dependency_structures: Dependency structures of the enclosing graphs;
            each recursion level appends its own structure. Defaults to an empty list.
        outer_solids: Enclosing graph nodes, parallel to
            ``outer_dependency_structures``. Defaults to an empty list.

    Returns:
        The set of root input manager keys required by unconnected inputs, including
        those found in nested graphs.

    Raises:
        DagsterInvalidDefinitionError: If an upstream output's manager cannot load
            inputs, if an unconnected input has no way of being loaded, or if a
            required root input manager key is missing from ``mode_def.resource_defs``.
    """
    outer_dependency_structures = check.opt_list_param(
        outer_dependency_structures, "outer_dependency_structures", DependencyStructure
    )
    outer_solids = check.opt_list_param(outer_solids, "outer_solids", Node)

    resource_reqs = set()
    # Keys of the mode's resources that are root input managers; only passed to the
    # missing-resource error message below.
    mode_root_input_managers = set(
        key
        for key, resource_def in mode_def.resource_defs.items()
        if isinstance(resource_def, RootInputManagerDefinition)
    )

    for node in node_dict.values():
        if node.is_graph:
            graph_def = node.definition.ensure_graph_def()
            # check inner solids
            resource_reqs.update(
                _checked_input_resource_reqs_for_mode(
                    dependency_structure=graph_def.dependency_structure,
                    node_dict=graph_def.node_dict,
                    mode_def=mode_def,
                    outer_dependency_structures=outer_dependency_structures
                    + [dependency_structure],
                    outer_solids=outer_solids + [node],
                )
            )
        for handle in node.input_handles():
            source_output_handles = None
            if dependency_structure.has_deps(handle):
                # input is connected to outputs from the same dependency structure
                source_output_handles = dependency_structure.get_deps_list(handle)
            else:
                # input is connected to outputs from outer dependency structure, e.g. first solids
                # in a composite
                curr_node = node
                curr_handle = handle
                # Start at the innermost enclosing graph and walk outwards.
                curr_index = len(outer_solids) - 1

                # Checks to see if input is mapped to an outer dependency structure
                while curr_index >= 0 and curr_node.container_maps_input(curr_handle.input_name):
                    # Re-express the input as the mapped input on the enclosing node.
                    curr_handle = SolidInputHandle(
                        solid=outer_solids[curr_index],
                        input_def=curr_node.container_mapped_input(
                            curr_handle.input_name
                        ).definition,
                    )

                    if outer_dependency_structures[curr_index].has_deps(curr_handle):
                        source_output_handles = outer_dependency_structures[
                            curr_index
                        ].get_deps_list(curr_handle)
                        break

                    # No deps at this level: move one enclosing level further out.
                    curr_node = outer_solids[curr_index]
                    curr_index -= 1

            if source_output_handles:
                # input is connected to source output handles within the graph
                for source_output_handle in source_output_handles:
                    output_manager_key = source_output_handle.output_def.io_manager_key
                    # NOTE(review): indexes resource_defs directly, so the key is assumed
                    # to be present in the mode -- presumably validated elsewhere; confirm.
                    output_manager_def = mode_def.resource_defs[output_manager_key]
                    if not isinstance(output_manager_def, IInputManagerDefinition):
                        raise DagsterInvalidDefinitionError(
                            f'Input "{handle.input_def.name}" of {node.describe_node()} is '
                            f'connected to output "{source_output_handle.output_def.name}" '
                            f"of {source_output_handle.solid.describe_node()}. That output does not "
                            "have an output "
                            f"manager that knows how to load inputs, so we don't know how "
                            f"to load the input. To address this, assign an IOManager to "
                            f"the upstream output."
                        )
            else:
                # input is unconnected
                input_def = handle.input_def
                if (
                    not input_def.dagster_type.loader
                    and not input_def.dagster_type.kind == DagsterTypeKind.NOTHING
                    and not input_def.root_manager_key
                ):
                    raise DagsterInvalidDefinitionError(
                        "Input '{input_name}' in {described_node} is not connected to "
                        "the output of a previous node and can not be loaded from configuration, "
                        "making it impossible to execute. "
                        "Possible solutions are:\n"
                        "  * add a dagster_type_loader for the type '{dagster_type}'\n"
                        "  * connect '{input_name}' to the output of another node\n".format(
                            described_node=node.describe_node(),
                            input_name=input_def.name,
                            dagster_type=input_def.dagster_type.display_name,
                        )
                    )

                # If a root manager is provided, it's always used. I.e. it has priority over
                # the other ways of loading unsatisfied inputs - dagster type loaders and
                # default values.
                if input_def.root_manager_key:
                    resource_reqs.add(input_def.root_manager_key)
                    if input_def.root_manager_key not in mode_def.resource_defs:
                        error_msg = _get_missing_resource_error_msg(
                            resource_type="root input manager",
                            resource_key=input_def.root_manager_key,
                            descriptor=f"unsatisfied input '{input_def.name}' of {node.describe_node()}",
                            mode_def=mode_def,
                            resource_defs_of_type=mode_root_input_managers,
                        )
                        raise DagsterInvalidDefinitionError(error_msg)

    return resource_reqs
def __init__(self, sequence):
    '''Store ``sequence`` after validating it as an optional list of ints/floats.'''
    numeric_types = (int, float)
    self.sequence = check.opt_list_param(sequence, 'sequence', of_type=numeric_types)
def __init__(self, description=None, metadata_entries=None):
    """Initialize the exception with an optional description and metadata entries.

    Args:
        description (Optional[str]): Passed to the base exception and stored on
            ``self.description``.
        metadata_entries (Optional[List[EventMetadataEntry]]): Stored on
            ``self.metadata_entries`` after validation.
    """
    super(Failure, self).__init__(description)

    checked_description = check.opt_str_param(description, "description")
    checked_entries = check.opt_list_param(
        metadata_entries, "metadata_entries", of_type=EventMetadataEntry
    )

    self.description = checked_description
    self.metadata_entries = checked_entries
def __init__(
    self,
    solid_defs,
    name=None,
    description=None,
    dependencies=None,
    mode_defs=None,
    preset_defs=None,
):
    '''Validate the arguments and build the pipeline's internal structures.

    Raises:
        DagsterInvalidDefinitionError: If two modes or two presets share a name, or a
            preset references a mode that is not defined on this pipeline.
    '''
    self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
    self._description = check.opt_str_param(description, 'description')

    # With no modes supplied, the pipeline gets a single default ModeDefinition.
    modes = check.opt_list_param(mode_defs, 'mode_defs', of_type=ModeDefinition)
    if not modes:
        modes = [ModeDefinition()]
    self._mode_definitions = modes

    self._current_level_solid_defs = check.list_param(
        _check_solids_arg(self._name, solid_defs), 'solid_defs', of_type=ISolidDefinition
    )

    seen_modes = set()
    for mode_def in modes:
        mode_name = mode_def.name
        if mode_name in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.'
                ).format(mode_name=mode_name, pipeline_name=self._name)
            )
        seen_modes.add(mode_name)

    self._dependencies = validate_dependency_dict(dependencies)

    dependency_structure, solid_dict = create_execution_structure(
        self._current_level_solid_defs, self._dependencies, container_definition=None
    )
    self._solid_dict = solid_dict
    self._dependency_structure = dependency_structure

    self._runtime_type_dict = construct_dagster_type_dictionary(
        self._current_level_solid_defs
    )

    self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs', PresetDefinition)
    self._preset_dict = {}
    for preset in self._preset_defs:
        preset_name = preset.name
        if preset_name in self._preset_dict:
            raise DagsterInvalidDefinitionError(
                (
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.'
                ).format(name=preset_name, pipeline_name=self._name)
            )
        if preset.mode not in seen_modes:
            raise DagsterInvalidDefinitionError(
                (
                    'PresetDefinition "{name}" in "{pipeline_name}" '
                    'references mode "{mode}" which is not defined.'
                ).format(name=preset_name, pipeline_name=self._name, mode=preset.mode)
            )
        self._preset_dict[preset_name] = preset

    # Validate solid resource dependencies
    _validate_resource_dependencies(self._mode_definitions, self._current_level_solid_defs)

    # Validate unsatisfied inputs can be materialized from config
    _validate_inputs(self._dependency_structure, self._solid_dict)

    self._all_solid_defs = _build_all_solid_defs(self._current_level_solid_defs)

    # The default selector names every top-level solid of the pipeline.
    top_level_solid_names = list(solid_dict.keys())
    self._selector = ExecutionSelector(self.name, top_level_solid_names)
    self._cached_enviroment_schemas = {}
def get_step_input(plan_builder, solid, input_name, input_def, dependency_structure, handle,
                   parent_step_inputs):
    '''Resolve where the value for one solid input comes from.

    Sources are tried in this order: the ``inputs`` section of the solid's config, a
    single upstream output, multiple upstream outputs, and finally the matching input
    of the enclosing composite (looked up in ``parent_step_inputs``).

    Args:
        plan_builder (_PlanBuilder): Supplies the environment config, output handles
            and the pipeline name.
        solid (Solid): The solid whose input is being resolved.
        input_name (str): Name of the input on ``solid``.
        input_def (InputDefinition): Definition of that input.
        dependency_structure (DependencyStructure): Dependencies at the solid's level.
        handle (Optional[SolidHandle]): Handle used (as ``str(handle)``) to look up the
            solid's config.
        parent_step_inputs (Optional[List[StepInput]]): Step inputs of the enclosing
            composite, if any. ``None`` is treated as an empty list.

    Returns:
        Optional[StepInput]: The resolved step input, or ``None`` for an unconnected
        input whose runtime type is "Nothing".

    Raises:
        DagsterInvariantViolationError: If the input has no source and is not of a
            "Nothing" type.
    '''
    check.inst_param(plan_builder, 'plan_builder', _PlanBuilder)
    check.inst_param(solid, 'solid', Solid)
    check.str_param(input_name, 'input_name')
    check.inst_param(input_def, 'input_def', InputDefinition)
    check.inst_param(dependency_structure, 'dependency_structure', DependencyStructure)
    check.opt_inst_param(handle, 'handle', SolidHandle)
    # Keep the normalized value: the parameter is optional, and the parent-input lookup
    # below iterates it, which would raise TypeError on a raw None.
    parent_step_inputs = check.opt_list_param(
        parent_step_inputs, 'parent_step_inputs', of_type=StepInput
    )

    # 1. A value supplied through the solid's config takes precedence.
    solid_config = plan_builder.environment_config.solids.get(str(handle))
    if solid_config and input_name in solid_config.inputs:
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.CONFIG,
            config_data=solid_config.inputs[input_name],
        )

    input_handle = solid.input_handle(input_name)

    # 2. Exactly one upstream output.
    if dependency_structure.has_singular_dep(input_handle):
        solid_output_handle = dependency_structure.get_singular_dep(input_handle)
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.SINGLE_OUTPUT,
            [plan_builder.get_output_handle(solid_output_handle)],
        )

    # 3. Several upstream outputs fanned into this input.
    if dependency_structure.has_multi_deps(input_handle):
        solid_output_handles = dependency_structure.get_multi_deps(input_handle)
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.MULTIPLE_OUTPUTS,
            [
                plan_builder.get_output_handle(solid_output_handle)
                for solid_output_handle in solid_output_handles
            ],
        )

    # 4. The enclosing composite maps one of its own inputs onto this one: reuse the
    #    source already resolved for the parent input.
    if solid.container_maps_input(input_name):
        parent_name = solid.container_mapped_input(input_name).definition.name
        parent_inputs = {step_input.name: step_input for step_input in parent_step_inputs}
        if parent_name in parent_inputs:
            parent_input = parent_inputs[parent_name]
            return StepInput(
                input_name,
                input_def.runtime_type,
                parent_input.source_type,
                parent_input.source_handles,
                parent_input.config_data,
            )

    # At this point we have an input that is not hooked up to
    # the output of another solid or provided via environment config.

    # We will allow this for "Nothing" type inputs and continue.
    if input_def.runtime_type.is_nothing:
        return None

    # Otherwise we throw an error.
    raise DagsterInvariantViolationError(
        (
            'In pipeline {pipeline_name} solid {solid_name}, input {input_name} '
            'must get a value either (a) from a dependency or (b) from the '
            'inputs section of its configuration.'
        ).format(
            pipeline_name=plan_builder.pipeline_name,
            solid_name=solid.name,
            input_name=input_name,
        )
    )
def build_sub_pipeline(self, solid_subset):
    '''Return this pipeline restricted to ``solid_subset``.

    Args:
        solid_subset (Optional[List[str]]): Names of the solids to keep. When ``None``,
            no subsetting is performed.

    Returns:
        ``self`` when ``solid_subset`` is ``None``; otherwise the result of
        ``_build_sub_pipeline(self, solid_subset)``.
    '''
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)

    # Only None means "no subset"; an empty list is still forwarded to the builder.
    if solid_subset is None:
        return self
    return _build_sub_pipeline(self, solid_subset)
def _check_execute_pipeline_args(
    pipeline, run_config, mode, preset, tags, instance, solid_selection=None
):
    '''Validate and normalize the arguments for executing a pipeline.

    When ``preset`` is given, its run config, solid selection, mode and tags are
    loaded after checking that they agree with any explicitly passed values.
    Otherwise the mode is validated against the pipeline definition, or defaulted
    when the pipeline is not multi-mode.

    Args:
        pipeline: The pipeline to execute; normalized through ``_check_pipeline``.
        run_config (Optional[dict]): The run configuration.
        mode (Optional[str]): The mode to execute in. Mutually exclusive with ``preset``.
        preset (Optional[str]): The name of a preset defined on the pipeline.
        tags (Optional[Dict[str, Any]]): Tags; they take precedence over preset tags
            and pipeline definition tags when merged.
        instance (Optional[DagsterInstance]): The instance to use. An ephemeral
            instance is created when none is supplied.
        solid_selection (Optional[List[str]]): Solid selection used to subset the pipeline.

    Returns:
        tuple: ``(pipeline, run_config, instance, mode, tags, solids_to_execute,
        solid_selection)``, where ``solids_to_execute`` is read from the (possibly
        subsetted) pipeline.

    Raises:
        DagsterInvariantViolationError: If ``mode`` is not defined on the pipeline, or
            if no mode is given for a multi-mode pipeline.
    '''
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)

    run_config = check.opt_dict_param(run_config, 'run_config')
    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        mode is None or preset is None,
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'.format(
            mode=mode, preset=preset
        ),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        # load run_config from preset
        if pipeline_preset.run_config is not None:
            run_config_agrees = (not run_config) or (pipeline_preset.run_config == run_config)
            check.invariant(
                run_config_agrees,
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `run_config` argument.'.format(preset=preset),
            )
            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            selection_agrees = (
                solid_selection is None or solid_selection == pipeline_preset.solid_selection
            )
            check.invariant(
                selection_agrees,
                'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
                'the `solid_selection` argument: {solid_selection}'.format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        # load mode from preset
        mode_agrees = mode is None or mode == pipeline_preset.mode
        check.invariant(
            mode_agrees,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(
                preset=preset, preset_mode=pipeline_preset.mode, mode=mode
            ),
        )
        mode = pipeline_preset.mode

        # explicitly passed tags win over preset tags
        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is None:
        # No mode requested: only unambiguous for single-mode pipelines.
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError(
                (
                    'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                    'attempted to execute it without specifying a mode. Set '
                    'mode property on the PipelineRun object.'
                ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)
            )
        mode = pipeline_def.get_default_mode_name()
    elif not pipeline_def.has_mode_definition(mode):
        raise DagsterInvariantViolationError(
            (
                'You have attempted to execute pipeline {name} with mode {mode}. '
                'Available modes: {modes}'
            ).format(
                name=pipeline_def.name,
                mode=mode,
                modes=pipeline_def.available_modes,
            )
        )

    # tags accumulated so far win over tags set on the pipeline definition
    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    if not instance:
        instance = DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
def _from_storage(
    cls,
    pipeline_name=None,
    run_id=None,
    run_config=None,
    mode=None,
    solid_selection=None,
    solids_to_execute=None,
    step_keys_to_execute=None,
    status=None,
    tags=None,
    root_run_id=None,
    parent_run_id=None,
    pipeline_snapshot_id=None,
    execution_plan_snapshot_id=None,
    # backcompat
    environment_dict=None,
    previous_run_id=None,
    selector=None,
    solid_subset=None,
    reexecution_config=None,  # pylint: disable=unused-argument
    **kwargs
):
    '''Build an instance from stored PipelineRun fields, translating legacy names.

    Legacy keyword arguments are mapped onto their current equivalents before the
    instance is created through ``cls.__new__``:

    * ``environment_dict`` -> ``run_config``
    * ``previous_run_id`` -> ``parent_run_id`` and ``root_run_id``
    * ``selector`` -> ``pipeline_name`` and ``solids_to_execute``
    * ``solid_subset`` -> ``solids_to_execute``

    ``reexecution_config`` is accepted and ignored. Any other unknown keyword
    argument triggers a warning and is dropped.

    Returns:
        The new instance created by ``cls.__new__``.
    '''
    # serdes log
    # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve
    # * added pipeline_snapshot_id
    # * renamed previous_run_id -> parent_run_id, added root_run_id
    # * added execution_plan_snapshot_id
    # * removed selector
    # * added solid_subset
    # * renamed solid_subset -> solid_selection, added solids_to_execute
    # * renamed environment_dict -> run_config

    # back compat for environment dict => run_config
    if environment_dict:
        check.invariant(
            not run_config,
            'Cannot set both run_config and environment_dict. Use run_config parameter.',
        )
        run_config = environment_dict

    # back compat for previous_run_id => parent_run_id, root_run_id
    # (applies whenever either of the two newer ids is missing)
    if previous_run_id and (not parent_run_id or not root_run_id):
        parent_run_id = root_run_id = previous_run_id

    # back compat for selector => pipeline_name, solids_to_execute
    selector = check.opt_inst_param(selector, 'selector', ExecutionSelector)
    if selector:
        name_agrees = pipeline_name is None or selector.name == pipeline_name
        check.invariant(
            name_agrees,
            (
                'Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                'selector was passed with pipeline {selector_pipeline}'.format(
                    pipeline_name=pipeline_name, selector_pipeline=selector.name
                )
            ),
        )
        if pipeline_name is None:
            pipeline_name = selector.name

        subset_agrees = (
            solids_to_execute is None or set(selector.solid_subset) == solids_to_execute
        )
        check.invariant(
            subset_agrees,
            (
                'Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                'selector was passed with subset {selector_subset}'.format(
                    solids_to_execute=solids_to_execute, selector_subset=selector.solid_subset
                )
            ),
        )
        # for old runs that only have selector but no solids_to_execute
        if solids_to_execute is None and selector.solid_subset:
            solids_to_execute = frozenset(selector.solid_subset)

    # back compat for solid_subset => solids_to_execute
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    if solid_subset:
        solids_to_execute = frozenset(solid_subset)

    # warn about unused arguments
    if kwargs:
        warnings.warn(
            'Found unhandled arguments from stored PipelineRun: {args}'.format(
                args=kwargs.keys()
            )
        )

    return cls.__new__(  # pylint: disable=redundant-keyword-arg
        cls,
        pipeline_name=pipeline_name,
        run_id=run_id,
        run_config=run_config,
        mode=mode,
        solid_selection=solid_selection,
        solids_to_execute=solids_to_execute,
        step_keys_to_execute=step_keys_to_execute,
        status=status,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        pipeline_snapshot_id=pipeline_snapshot_id,
        execution_plan_snapshot_id=execution_plan_snapshot_id,
    )