def from_dagster_event_record(event_record, pipeline_name, external_execution_plan):
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, 'event_record', EventRecord)
    check.param_invariant(event_record.is_dagster_event, 'event_record')
    check.str_param(pipeline_name, 'pipeline_name')
    check.opt_inst_param(
        external_execution_plan, 'external_execution_plan', ExternalExecutionPlan
    )

    # circular ref at module scope
    from .errors import DauphinPythonError

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(event_record, external_execution_plan)

    if dagster_event.event_type == DagsterEventType.STEP_START:
        return DauphinExecutionStepStartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return DauphinExecutionStepSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        return DauphinExecutionStepUpForRetryEvent(
            error=dagster_event.step_retry_data.error,
            secondsToWait=dagster_event.step_retry_data.seconds_to_wait,
            **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.STEP_RESTARTED:
        return DauphinExecutionStepRestartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return DauphinExecutionStepSuccessEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return DauphinExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return DauphinExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return DauphinStepMaterializationEvent(materialization=materialization, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return DauphinStepExpectationResultEvent(
            expectation_result=expectation_result, **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return DauphinExecutionStepFailureEvent(
            error=DauphinPythonError(dagster_event.step_failure_data.error),
            failureMetadata=dagster_event.step_failure_data.user_failure_data,
            **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return DauphinPipelineStartEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return DauphinPipelineSuccessEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return DauphinPipelineFailureEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return DauphinPipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=DauphinPythonError(dagster_event.pipeline_init_failure_data.error),
            **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return DauphinObjectStoreOperationEvent(operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return DauphinEngineEvent(
            metadataEntries=_to_dauphin_metadata_entries(
                dagster_event.engine_event_data.metadata_entries
            ),
            error=DauphinPythonError(dagster_event.engine_event_data.error)
            if dagster_event.engine_event_data.error
            else None,
            marker_start=dagster_event.engine_event_data.marker_start,
            marker_end=dagster_event.engine_event_data.marker_end,
            **basic_params
        )
    else:
        raise Exception(
            'Unknown DAGSTER_EVENT type {inner_type} found in logs'.format(
                inner_type=dagster_event.event_type
            )
        )
def __init__(
    self,
    type_check_fn,
    key=None,
    name=None,
    is_builtin=False,
    description=None,
    loader=None,
    materializer=None,
    serialization_strategy=None,
    auto_plugins=None,
    required_resource_keys=None,
    kind=DagsterTypeKind.REGULAR,
):
    check.opt_str_param(key, "key")
    check.opt_str_param(name, "name")
    check.invariant(not (name is None and key is None), "Must set key or name")

    if name is None:
        check.param_invariant(
            bool(key),
            "key",
            "If name is not provided, must provide key.",
        )
        self.key, self._name = key, None
    elif key is None:
        check.param_invariant(
            bool(name),
            "name",
            "If key is not provided, must provide name.",
        )
        self.key, self._name = name, name
    else:
        check.invariant(key and name)
        self.key, self._name = key, name

    self.description = check.opt_str_param(description, "description")
    self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
    self.materializer = check.opt_inst_param(
        materializer, "materializer", DagsterTypeMaterializer
    )
    self.serialization_strategy = check.opt_inst_param(
        serialization_strategy,
        "serialization_strategy",
        SerializationStrategy,
        PickleSerializationStrategy(),
    )
    self.required_resource_keys = check.opt_set_param(
        required_resource_keys,
        "required_resource_keys",
    )
    self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
    _validate_type_check_fn(self._type_check_fn, self._name)

    auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type)
    check.param_invariant(
        all(issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins),
        "auto_plugins",
    )
    self.auto_plugins = auto_plugins

    self.is_builtin = check.bool_param(is_builtin, "is_builtin")
    check.invariant(
        self.display_name is not None,
        "All types must have a valid display name, got None for key {}".format(key),
    )

    self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
def __init__(self, target_folder, overwrite=False):
    super(FSFileCache, self).__init__(overwrite=overwrite)
    check.str_param(target_folder, 'target_folder')
    check.param_invariant(os.path.isdir(target_folder), 'target_folder')

    self.target_folder = target_folder
def __init__(self, config_type):
    check.inst_param(config_type, 'config_type', ConfigType)
    check.param_invariant(config_type.kind == ConfigTypeKind.ENUM, 'config_type')
    self._config_type = config_type
    super(DauphinEnumConfigType, self).__init__(**_ctor_kwargs(config_type))
def ensure_single_item(ddict):
    check.dict_param(ddict, 'ddict')
    check.param_invariant(len(ddict) == 1, 'ddict', 'Expected dict with single item')
    return list(ddict.items())[0]
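# Usage sketch for ensure_single_item (illustrative, not part of the original source):
# the invariant admits exactly one entry, so callers can unpack the pair directly.
example_key, example_value = ensure_single_item({'solids': {'config': 3}})
# example_key == 'solids'; example_value == {'config': 3}
# ensure_single_item({'a': 1, 'b': 2}) would fail the invariant, since the dict has two items.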
def resolve_to_config_type(dagster_type):
    from .field_utils import convert_fields_to_dict_type

    # Short circuit if it's already a Config Type
    if isinstance(dagster_type, ConfigType):
        return dagster_type

    if isinstance(dagster_type, dict):
        return convert_fields_to_dict_type(dagster_type)

    if isinstance(dagster_type, list):
        if len(dagster_type) != 1:
            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")

        inner_type = resolve_to_config_type(dagster_type[0])

        if not inner_type:
            raise DagsterInvalidDefinitionError(
                "Invalid member of array specification: {value} in list {the_list}".format(
                    value=repr(dagster_type[0]), the_list=dagster_type
                )
            )
        return Array(inner_type)

    from dagster.core.types.dagster_type import DagsterType, List, ListType
    from dagster.core.types.python_set import Set, _TypedPythonSet
    from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple

    if _is_config_type_class(dagster_type):
        check.param_invariant(
            False,
            "dagster_type",
            f"Cannot pass config type class {dagster_type} to resolve_to_config_type. "
            "This error usually occurs when you pass a dagster config type class instead of a class instance into "
            'another dagster config type. E.g. "Noneable(Permissive)" should instead be "Noneable(Permissive())".',
        )

    if isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType):
        raise DagsterInvalidDefinitionError(
            "You have passed a DagsterType class {dagster_type} to the config system. "
            "The DagsterType and config schema systems are separate. "
            "Valid config values are:\n{desc}".format(
                dagster_type=repr(dagster_type),
                desc=VALID_CONFIG_DESC,
            )
        )

    if is_closed_python_optional_type(dagster_type):
        raise DagsterInvalidDefinitionError(
            "Cannot use typing.Optional as a config type. If you want this field to be "
            "optional, please use Field(<type>, is_required=False), and if you want this field to "
            "be required, but accept a value of None, use dagster.Noneable(<type>)."
        )

    if is_typing_type(dagster_type):
        raise DagsterInvalidDefinitionError(
            (
                "You have passed in {dagster_type} to the config system. Types from "
                "the typing module in python are not allowed in the config system. "
                "You must use types that are imported from dagster or primitive types "
                "such as bool, int, etc."
            ).format(dagster_type=dagster_type)
        )

    if dagster_type is List or isinstance(dagster_type, ListType):
        raise DagsterInvalidDefinitionError(
            "Cannot use List in the context of config. " + helpful_list_error_string()
        )

    if dagster_type is Set or isinstance(dagster_type, _TypedPythonSet):
        raise DagsterInvalidDefinitionError(
            "Cannot use Set in the context of a config field. " + helpful_list_error_string()
        )

    if dagster_type is Tuple or isinstance(dagster_type, _TypedPythonTuple):
        raise DagsterInvalidDefinitionError(
            "Cannot use Tuple in the context of a config field. " + helpful_list_error_string()
        )

    if isinstance(dagster_type, DagsterType):
        raise DagsterInvalidDefinitionError(
            (
                "You have passed an instance of DagsterType {type_name} to the config "
                "system (Repr of type: {dagster_type}). "
                "The DagsterType and config schema systems are separate. "
                "Valid config values are:\n{desc}"
            ).format(
                type_name=dagster_type.display_name,
                dagster_type=repr(dagster_type),
                desc=VALID_CONFIG_DESC,
            ),
        )

    # If we have reached this point, then either:
    # 1) We have been passed a python builtin
    # 2) We have been passed a dagster wrapping type that needs to be converted to its config
    #    variant, e.g. dagster.List
    # 3) We have been passed an invalid thing. We return False to signify this. It is
    #    up to callers to report a reasonable error.

    from dagster.primitive_mapping import (
        remap_python_builtin_for_config,
        is_supported_config_python_builtin,
    )

    if is_supported_config_python_builtin(dagster_type):
        return remap_python_builtin_for_config(dagster_type)

    if dagster_type is None:
        return ConfigAnyInstance

    if BuiltinEnum.contains(dagster_type):
        return ConfigType.from_builtin_enum(dagster_type)

    # This means the input is an error, and we return False to the callsite.
    # We do the error reporting there because those callsites have more context.
    return False
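# Illustrative calls against resolve_to_config_type (hedged; the exact classes returned
# depend on the dagster version this snippet was taken from, so only broad behavior is shown):
#
#     resolve_to_config_type([str])                  # single-element list -> Array wrapping the String config type
#     resolve_to_config_type(None)                   # -> ConfigAnyInstance (the Any config type)
#     resolve_to_config_type(object())               # unsupported value -> returns False
#     resolve_to_config_type(typing.Optional[int])   # raises DagsterInvalidDefinitionError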
def __new__(cls, dagster_type, incoming_fields):
    check.param_invariant(dagster_type.is_selector, 'dagster_type')
    return super(SelectorTypeErrorData, cls).__new__(
        cls,
        dagster_type,
        check.list_param(incoming_fields, 'incoming_fields', of_type=str),
    )
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    step_key_set = None if step_keys_to_execute is None else set(step_keys_to_execute)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config
            ),
        )

        failed_or_skipped_steps = set()

        step_levels = execution_plan.topological_step_levels()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for step_level in step_levels:
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                failed_inputs = []
                for step_input in step.step_inputs:
                    failed_inputs.extend(
                        failed_or_skipped_steps.intersection(step_input.dependency_keys)
                    )

                if failed_inputs:
                    step_context.log.info(
                        (
                            'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                        ).format(step=step.key, failed_inputs=failed_inputs)
                    )
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                    step_context, step
                )
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having validated the
                    # missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                    step_context.log.info(
                        (
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}'
                        ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                    )
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                for step_event in check.generator(dagster_event_sequence_for_step(step_context)):
                    check.inst(step_event, DagsterEvent)
                    if step_event.is_step_failure:
                        failed_or_skipped_steps.add(step.key)

                    yield step_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)
        ),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
def uri_for_paths(self, paths, protocol=None):
    check.list_param(paths, "paths", of_type=str)
    check.param_invariant(len(paths) > 0, "paths")
    key = self.key_for_paths(paths)
    return self.object_store.uri_for_key(key, protocol)
def define_python_dagster_type(
    python_type,
    name=None,
    description=None,
    input_hydration_config=None,
    output_materialization_config=None,
    serialization_strategy=None,
    auto_plugins=None,
    type_check=None,
):
    '''Core machinery for defining a Dagster type corresponding to an existing python type.

    Users should generally use the :py:func:`@dagster_type` decorator or
    :py:func:`as_dagster_type`, both of which defer to this function.

    Args:
        python_type (cls): The python type to wrap as a Dagster type.
        name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``)
            of the ``python_type`` will be used.
        description (Optional[str]): A user-readable description of the type.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class constructed
            using the :py:func:`@input_hydration_config <dagster.InputHydrationConfig>` decorator
            that can map config data to a value of this type.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a
            class constructed using the
            :py:func:`@output_materialization_config <dagster.output_materialization_config>`
            decorator that can persist values of this type.
        serialization_strategy (Optional[SerializationStrategy]): An instance of a class that
            inherits from :py:class:`SerializationStrategy`. The default strategy for serializing
            this value when automatically persisting it between execution steps. You should set
            this value if the ordinary serialization machinery (e.g., pickle) will not be adequate
            for this type.
        auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized differently
            depending on the storage being used for intermediates, they should specify this
            argument. In these cases the serialization_strategy argument is not sufficient because
            serialization requires specialized API calls, e.g. to call an S3 API directly instead
            of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.
        type_check (Optional[Callable[[Any], Union[bool, TypeCheck]]]): If specified, this function
            will be called in place of the default isinstance type check. This function should
            return ``True`` if the type check succeeds, ``False`` if it fails, or, if additional
            metadata should be emitted along with the type check success or failure, an instance of
            :py:class:`TypeCheck` with the ``success`` field set appropriately.
    '''
    check.type_param(python_type, 'python_type')
    check.opt_str_param(name, 'name', python_type.__name__)
    check.opt_str_param(description, 'description')
    check.opt_inst_param(input_hydration_config, 'input_hydration_config', InputHydrationConfig)
    check.opt_inst_param(
        output_materialization_config,
        'output_materialization_config',
        OutputMaterializationConfig,
    )
    check.opt_inst_param(
        serialization_strategy,
        'serialization_strategy',
        SerializationStrategy,
        default=PickleSerializationStrategy(),
    )

    auto_plugins = check.opt_list_param(auto_plugins, 'auto_plugins', of_type=type)
    check.param_invariant(
        all(issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins),
        'auto_plugins',
    )

    check.opt_callable_param(type_check, 'type_check')

    class _ObjectType(PythonObjectType):
        def __init__(self):
            super(_ObjectType, self).__init__(
                python_type=python_type,
                name=name,
                description=description,
                input_hydration_config=input_hydration_config,
                output_materialization_config=output_materialization_config,
                serialization_strategy=serialization_strategy,
                auto_plugins=auto_plugins,
                type_check=type_check,
            )

    return _ObjectType
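# Hedged usage sketch for define_python_dagster_type (the class and names below are made up
# for illustration; per the code above, the return value is a PythonObjectType subclass):
class ExampleRecord:
    """A plain Python class we want the Dagster type system to recognize."""


ExampleRecordDagsterType = define_python_dagster_type(
    python_type=ExampleRecord,
    name='ExampleRecord',
    description='Values passed between solids must be ExampleRecord instances.',
)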
def has_object(self, key):
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')
    return os.path.exists(key)
def __init__(
    self,
    type_check_fn,
    key=None,
    name=None,
    is_builtin=False,
    description=None,
    loader=None,
    materializer=None,
    required_resource_keys=None,
    kind=DagsterTypeKind.REGULAR,
    typing_type=None,
):
    check.opt_str_param(key, "key")
    check.opt_str_param(name, "name")
    check.invariant(not (name is None and key is None), "Must set key or name")

    if name is None:
        check.param_invariant(
            bool(key),
            "key",
            "If name is not provided, must provide key.",
        )
        self.key, self._name = key, None
    elif key is None:
        check.param_invariant(
            bool(name),
            "name",
            "If key is not provided, must provide name.",
        )
        self.key, self._name = name, name
    else:
        check.invariant(key and name)
        self.key, self._name = key, name

    self.description = check.opt_str_param(description, "description")
    self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
    self.materializer = check.opt_inst_param(
        materializer, "materializer", DagsterTypeMaterializer
    )
    self.required_resource_keys = check.opt_set_param(
        required_resource_keys,
        "required_resource_keys",
    )
    self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
    _validate_type_check_fn(self._type_check_fn, self._name)

    self.is_builtin = check.bool_param(is_builtin, "is_builtin")
    check.invariant(
        self.display_name is not None,
        "All types must have a valid display name, got None for key {}".format(key),
    )

    self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
    self.typing_type = typing_type
def __getitem__(self, *args):
    check.param_invariant(len(args[0]) == 2, "args", "Must be two parameters")
    return create_typed_runtime_dict(args[0][0], args[0][1])
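# Hedged sketch of how the __getitem__ hook above is exercised: subscripting the Dict-like
# type object with exactly two parameters (names illustrative only, not the original source):
#
#     StrIntDict = Dict[str, int]   # calls __getitem__ with args[0] == (str, int)
#     Dict[str]                     # would fail the "Must be two parameters" invariant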
def execute(pipeline_context, execution_plan):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    dask_config = pipeline_context.executor_config

    check.param_invariant(
        isinstance(pipeline_context.executor_config, DaskConfig),
        'pipeline_context',
        'Expected executor_config to be DaskConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.environment_dict.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    check.invariant(
        pipeline_context.instance.is_persistent,
        'Dask execution requires a persistent DagsterInstance',
    )

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Dask, use filesystem, S3, or GCS',
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    with dask.distributed.Client(**dask_config.build_dict(pipeline_name)) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                environment_dict = dict(
                    pipeline_context.environment_dict, execution={'in_process': {}}
                )
                variables = {
                    'executionParams': {
                        'selector': {'name': pipeline_name},
                        'environmentConfigData': environment_dict,
                        'mode': pipeline_context.mode_def.name,
                        'executionMetadata': {'runId': pipeline_context.pipeline_run.run_id},
                        'stepKeys': [step.key],
                    }
                }

                dask_task_name = '%s.%s' % (pipeline_name, step.key)

                future = client.submit(
                    query_on_dask_worker,
                    pipeline_context.pipeline.get_reconstructable_repository(),
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)
                yield step_event
def _input_schema(_context, value):
    check.dict_param(value, "value")
    check.param_invariant(set(value.keys()) == field_names, "value")
    return value
def _t_fn(compute_context, inputs):
    check.inst_param(compute_context, "compute_context", SolidExecutionContext)
    check.param_invariant(
        isinstance(compute_context.run_config, dict),
        "context",
        "SystemComputeExecutionContext must have valid run_config",
    )

    system_compute_context = compute_context.get_system_context()

    with seven.TemporaryDirectory() as output_notebook_dir:
        with safe_tempfile_path() as output_log_path:

            parameterized_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-inter.ipynb".format(prefix=str(uuid.uuid4()))
            )

            executed_notebook_path = os.path.join(
                output_notebook_dir, "{prefix}-out.ipynb".format(prefix=str(uuid.uuid4()))
            )

            # Scaffold the registration here
            nb = load_notebook_node(notebook_path)
            nb_no_parameters = replace_parameters(
                system_compute_context,
                nb,
                get_papermill_parameters(system_compute_context, inputs, output_log_path),
            )
            write_ipynb(nb_no_parameters, parameterized_notebook_path)

            with user_code_error_boundary(
                DagstermillExecutionError,
                lambda: (
                    "Error occurred during the execution of Dagstermill solid "
                    "{solid_name}: {notebook_path}".format(
                        solid_name=name, notebook_path=notebook_path
                    )
                ),
            ):
                try:
                    papermill_engines.register("dagstermill", DagstermillNBConvertEngine)
                    papermill.execute_notebook(
                        input_path=parameterized_notebook_path,
                        output_path=executed_notebook_path,
                        engine_name="dagstermill",
                        log_output=True,
                    )
                except Exception as exc:  # pylint: disable=broad-except
                    try:
                        with open(executed_notebook_path, "rb") as fd:
                            executed_notebook_file_handle = compute_context.file_manager.write(
                                fd, mode="wb", ext="ipynb"
                            )
                            executed_notebook_materialization_path = (
                                executed_notebook_file_handle.path_desc
                            )
                    except Exception as exc_inner:  # pylint: disable=broad-except
                        compute_context.log.warning(
                            "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                                exc=exc_inner
                            )
                        )
                        executed_notebook_materialization_path = executed_notebook_path

                    yield AssetMaterialization(
                        asset_key=executed_notebook_materialization_path,
                        description="Location of output notebook in file manager",
                        metadata_entries=[
                            EventMetadataEntry.fspath(executed_notebook_materialization_path)
                        ],
                    )
                    raise exc

            system_compute_context.log.debug(
                "Notebook execution complete for {name} at {executed_notebook_path}.".format(
                    name=name,
                    executed_notebook_path=executed_notebook_path,
                )
            )

            executed_notebook_file_handle = None
            try:
                # use binary mode when moving the file since certain file_managers such as S3
                # may try to hash the contents
                with open(executed_notebook_path, "rb") as fd:
                    executed_notebook_file_handle = compute_context.file_manager.write(
                        fd, mode="wb", ext="ipynb"
                    )
                    executed_notebook_materialization_path = (
                        executed_notebook_file_handle.path_desc
                    )
            except Exception as exc:  # pylint: disable=broad-except
                compute_context.log.warning(
                    "Error when attempting to materialize executed notebook using file manager (falling back to local): {exc}".format(
                        exc=str(exc)
                    )
                )
                executed_notebook_materialization_path = executed_notebook_path

            yield AssetMaterialization(
                asset_key=executed_notebook_materialization_path,
                description="Location of output notebook in file manager",
                metadata_entries=[
                    EventMetadataEntry.fspath(executed_notebook_materialization_path)
                ],
            )

            if output_notebook is not None:
                yield Output(executed_notebook_file_handle, output_notebook)

            # deferred import for perf
            import scrapbook

            output_nb = scrapbook.read_notebook(executed_notebook_path)

            for (output_name, output_def) in system_compute_context.solid_def.output_dict.items():
                data_dict = output_nb.scraps.data_dict
                if output_name in data_dict:
                    value = read_value(output_def.dagster_type, data_dict[output_name])

                    yield Output(value, output_name)

            for key, value in output_nb.scraps.items():
                if key.startswith("event-"):
                    with open(value.data, "rb") as fd:
                        yield pickle.loads(fd.read())
def __init__(
    self,
    config,
    default_value=FIELD_NO_DEFAULT_PROVIDED,
    is_required=None,
    description=None,
):
    from .validate import validate_config
    from .post_process import resolve_defaults

    self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)

    self.description = check.opt_str_param(description, "description")

    check.opt_bool_param(is_required, "is_required")

    if default_value != FIELD_NO_DEFAULT_PROVIDED:
        check.param_invariant(
            not (callable(default_value)), "default_value", "default_value cannot be a callable"
        )

    if is_required is True:
        check.param_invariant(
            default_value == FIELD_NO_DEFAULT_PROVIDED,
            "default_value",
            "required arguments should not specify default values",
        )

    self._default_value = default_value

    # check explicit default value
    if self.default_provided:
        if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value):
            raise DagsterInvalidDefinitionError(
                (
                    "You have passed in a python enum value as the default value "
                    "for the config enum type {name}. You must pass in the underlying "
                    "string representation as the default value. One of {value_set}."
                ).format(
                    value_set=[ev.config_value for ev in self.config_type.enum_values],
                    name=self.config_type.given_name,
                )
            )

        evr = validate_config(self.config_type, default_value)
        if not evr.success:
            raise DagsterInvalidConfigError(
                "Invalid default_value for Field.",
                evr.errors,
                default_value,
            )

    if is_required is None:
        is_optional = has_implicit_default(self.config_type) or self.default_provided
        is_required = not is_optional

        # on implicitly optional - set the default value
        # by resolving the defaults of the type
        if is_optional and not self.default_provided:
            evr = resolve_defaults(self.config_type, None)
            if not evr.success:
                raise DagsterInvalidConfigError(
                    "Unable to resolve implicit default_value for Field.",
                    evr.errors,
                    None,
                )
            self._default_value = evr.value

    self._is_required = is_required
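# Hedged usage sketch for the Field constructor above (illustrative only; assumes a dagster
# version where Field accepts raw python types as config, as _resolve_config_arg implies):
#
#     Field(int, default_value=3)                       # default provided -> implicitly not required
#     Field(str, is_required=True)                      # explicitly required, no default
#     Field(str, is_required=True, default_value="x")   # fails the param_invariant above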
def __getitem__(self, *args):
    from .python_dict import create_typed_runtime_dict

    check.param_invariant(len(args[0]) == 2, 'args', 'Must be two parameters')
    return create_typed_runtime_dict(args[0][0], args[0][1])
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    step_key_set = None if step_keys_to_execute is None else set(step_keys_to_execute)

    check.param_invariant(
        isinstance(pipeline_context.executor_config, ExecutorConfig),
        'pipeline_context',
        'Expected executor_config to be ExecutorConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    failed_or_skipped_steps = set()

    step_levels = execution_plan.topological_step_levels()

    intermediates_manager = pipeline_context.intermediates_manager

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    for step_level in step_levels:
        for step in step_level:
            if step_key_set and step.key not in step_key_set:
                continue

            step_context = pipeline_context.for_step(step)

            failed_inputs = [
                step_input.prev_output_handle.step_key
                for step_input in step.step_inputs
                if step_input.prev_output_handle.step_key in failed_or_skipped_steps
            ]
            if failed_inputs:
                step_context.log.info(
                    (
                        'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                    ).format(step=step.key, failed_inputs=failed_inputs)
                )
                failed_or_skipped_steps.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            uncovered_inputs = intermediates_manager.uncovered_inputs(step_context, step)
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                step_context.log.info(
                    (
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}'
                    ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                )
                failed_or_skipped_steps.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            input_values = _create_input_values(step_context, intermediates_manager)

            for step_event in check.generator(
                execute_step_in_memory(step_context, input_values, intermediates_manager)
            ):
                check.inst(step_event, DagsterEvent)
                if step_event.is_step_failure:
                    failed_or_skipped_steps.add(step.key)

                yield step_event
def evaluate_composite_config(context):
    check.inst_param(context, 'context', TraversalContext)
    check.param_invariant(context.config_type.is_composite, 'composite_type')

    fields = context.config_type.fields

    if context.config_value and not isinstance(context.config_value, dict):
        return EvaluateValueResult.for_error(create_composite_type_mismatch_error(context))

    evaluate_value_result = _evaluate_composite_solid_config(context)
    if evaluate_value_result.errors or evaluate_value_result.value:
        return evaluate_value_result

    # ASK: this can crash on user error
    config_value = check.opt_dict_param(context.config_value, 'incoming_value', key_type=str)

    defined_fields = set(fields.keys())
    incoming_fields = set(config_value.keys())
    extra_fields = list(incoming_fields - defined_fields)

    # We'll build up a dict of processed config values below
    errors = []
    output_config_value = {}

    # Here, we support permissive composites. In cases where we know the set of permissible keys a
    # priori, we validate against the config:
    if not context.config_type.is_permissive_composite:
        if extra_fields:
            if len(extra_fields) == 1:
                errors.append(create_field_not_defined_error(context, extra_fields[0]))
            else:
                errors.append(create_fields_not_defined_error(context, extra_fields))

    # And for permissive fields, we just pass along to the output without further validation
    else:
        for field_name in extra_fields:
            output_config_value[field_name] = config_value[field_name]

    # ...However, for any fields the user *has* told us about, we validate against their config
    # specifications
    missing_fields = []
    for key, field_def in fields.items():
        if key in incoming_fields:
            evaluate_value_result = _evaluate_config(
                context.for_field(field_def, key, context.config_value[key])
            )
            if evaluate_value_result.errors:
                errors += evaluate_value_result.errors
            else:
                output_config_value[key] = evaluate_value_result.value

        elif field_def.is_optional:
            # Try to see if this is a composite solid
            speculative_composite_solid_result = _evaluate_composite_solid_config(
                context.for_field(
                    field_def, key, field_def.default_value if field_def.default_provided else {}
                )
            )
            if speculative_composite_solid_result.value is not None:
                output_config_value[key] = speculative_composite_solid_result.value
            else:
                if field_def.default_provided:
                    output_config_value[key] = field_def.default_value

        else:
            check.invariant(not field_def.default_provided)
            missing_fields.append(key)

    if missing_fields:
        if len(missing_fields) == 1:
            errors.append(create_missing_required_field_error(context, missing_fields[0]))
        else:
            errors.append(create_missing_required_fields_error(context, missing_fields))

    if errors:
        return EvaluateValueResult.for_errors(errors)
    else:
        return EvaluateValueResult.for_value(output_config_value)
def __init__(self, config_type):
    self._config_type = check.inst_param(config_type, 'config_type', ConfigType)
    check.param_invariant(config_type.kind == ConfigTypeKind.SCALAR_UNION, 'config_type')
    super(DauphinScalarUnionConfigType, self).__init__(**_ctor_kwargs(config_type))
def has_object(self, key):
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')

    key_count = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=key)['KeyCount']
    return bool(key_count > 0)
def __init__(self, config_type):
    check.inst_param(config_type, 'config_type', ConfigType)
    check.param_invariant(ConfigTypeKind.has_fields(config_type.kind), 'config_type')
    self._config_type = config_type
    super(DauphinCompositeConfigType, self).__init__(**_ctor_kwargs(config_type))
def ensure_single_item(ddict): check.dict_param(ddict, "ddict") check.param_invariant( len(ddict) == 1, "ddict", "Expected dict with single item") return list(ddict.items())[0]
def __init__(self, inner_dagster_type):
    self._inner_dagster_type = check.inst_param(
        inner_dagster_type, "inner_dagster_type", DagsterType
    )
    check.param_invariant(inner_dagster_type.loader, "inner_dagster_type")
    self._schema_type = Array(inner_dagster_type.loader.schema_type)
def __new__(cls, config_type, list_index):
    check.int_param(list_index, 'list_index')
    check.param_invariant(list_index >= 0, 'list_index')
    return super(EvaluationStackListItemEntry, cls).__new__(
        cls, check.inst_param(config_type, 'config_type', ConfigType), list_index
    )
def from_dagster_event_record(event_record, pipeline_name):
    from ..schema.errors import GraphenePythonError
    from ..schema.logs.events import (
        GrapheneEngineEvent,
        GrapheneExecutionStepFailureEvent,
        GrapheneExecutionStepInputEvent,
        GrapheneExecutionStepOutputEvent,
        GrapheneExecutionStepRestartEvent,
        GrapheneExecutionStepSkippedEvent,
        GrapheneExecutionStepStartEvent,
        GrapheneExecutionStepSuccessEvent,
        GrapheneExecutionStepUpForRetryEvent,
        GrapheneHandledOutputEvent,
        GrapheneHookCompletedEvent,
        GrapheneHookErroredEvent,
        GrapheneHookSkippedEvent,
        GrapheneLoadedInputEvent,
        GrapheneObjectStoreOperationEvent,
        GraphenePipelineCanceledEvent,
        GraphenePipelineCancelingEvent,
        GraphenePipelineDequeuedEvent,
        GraphenePipelineEnqueuedEvent,
        GraphenePipelineFailureEvent,
        GraphenePipelineInitFailureEvent,
        GraphenePipelineStartEvent,
        GraphenePipelineStartingEvent,
        GraphenePipelineSuccessEvent,
        GrapheneStepExpectationResultEvent,
        GrapheneStepMaterializationEvent,
    )

    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, "event_record", EventRecord)
    check.param_invariant(event_record.is_dagster_event, "event_record")
    check.str_param(pipeline_name, "pipeline_name")

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(event_record)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return GrapheneExecutionStepStartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return GrapheneExecutionStepSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        return GrapheneExecutionStepUpForRetryEvent(
            error=dagster_event.step_retry_data.error,
            secondsToWait=dagster_event.step_retry_data.seconds_to_wait,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_RESTARTED:
        return GrapheneExecutionStepRestartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return GrapheneExecutionStepSuccessEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return GrapheneExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return GrapheneExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return GrapheneStepMaterializationEvent(materialization=materialization, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return GrapheneStepExpectationResultEvent(
            expectation_result=expectation_result, **basic_params
        )
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return GrapheneExecutionStepFailureEvent(
            error=GraphenePythonError(dagster_event.step_failure_data.error),
            failureMetadata=dagster_event.step_failure_data.user_failure_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_ENQUEUED:
        return GraphenePipelineEnqueuedEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_DEQUEUED:
        return GraphenePipelineDequeuedEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_STARTING:
        return GraphenePipelineStartingEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELING:
        return GraphenePipelineCancelingEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELED:
        return GraphenePipelineCanceledEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return GraphenePipelineStartEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return GraphenePipelineSuccessEvent(pipelineName=pipeline_name, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return GraphenePipelineFailureEvent(
            pipelineName=pipeline_name,
            error=GraphenePythonError(dagster_event.pipeline_failure_data.error)
            if (dagster_event.pipeline_failure_data and dagster_event.pipeline_failure_data.error)
            else None,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return GraphenePipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=GraphenePythonError(dagster_event.pipeline_init_failure_data.error),
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HANDLED_OUTPUT:
        return GrapheneHandledOutputEvent(
            output_name=dagster_event.event_specific_data.output_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.LOADED_INPUT:
        return GrapheneLoadedInputEvent(
            input_name=dagster_event.event_specific_data.input_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            upstream_output_name=dagster_event.event_specific_data.upstream_output_name,
            upstream_step_key=dagster_event.event_specific_data.upstream_step_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return GrapheneObjectStoreOperationEvent(operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return GrapheneEngineEvent(
            metadataEntries=_to_metadata_entries(
                dagster_event.engine_event_data.metadata_entries
            ),
            error=GraphenePythonError(dagster_event.engine_event_data.error)
            if dagster_event.engine_event_data.error
            else None,
            marker_start=dagster_event.engine_event_data.marker_start,
            marker_end=dagster_event.engine_event_data.marker_end,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HOOK_COMPLETED:
        return GrapheneHookCompletedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_SKIPPED:
        return GrapheneHookSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_ERRORED:
        return GrapheneHookErroredEvent(
            error=GraphenePythonError(dagster_event.hook_errored_data.error), **basic_params
        )
    else:
        raise Exception(
            "Unknown DAGSTER_EVENT type {inner_type} found in logs".format(
                inner_type=dagster_event.event_type
            )
        )
def validate_selector_config_value(selector_type, config_value, stack):
    check.param_invariant(selector_type.is_selector, 'selector_type')
    check.inst_param(stack, 'stack', EvaluationStack)

    if config_value and not isinstance(config_value, dict):
        yield EvaluationError(
            stack=stack,
            reason=DagsterEvaluationErrorReason.RUNTIME_TYPE_MISMATCH,
            message='Value for selector type {type_name} must be a dict got {value}'.format(
                type_name=selector_type.name, value=config_value
            ),
            error_data=RuntimeMismatchErrorData(
                config_type=selector_type, value_rep=repr(config_value)
            ),
        )
        return

    if config_value and len(config_value) > 1:
        incoming_fields = sorted(list(config_value.keys()))
        defined_fields = sorted(list(selector_type.fields.keys()))
        yield EvaluationError(
            stack=stack,
            reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
            message=(
                'You can only specify a single field. You specified {incoming_fields}. '
                'The available fields are {defined_fields}'
            ).format(incoming_fields=incoming_fields, defined_fields=defined_fields),
            error_data=SelectorTypeErrorData(
                dagster_type=selector_type, incoming_fields=incoming_fields
            ),
        )
        return

    elif not config_value:
        defined_fields = sorted(list(selector_type.fields.keys()))
        if len(selector_type.fields) > 1:
            yield EvaluationError(
                stack=stack,
                reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
                message=(
                    'Must specify a field if more than one is defined. Defined fields: '
                    '{defined_fields}'
                ).format(defined_fields=defined_fields),
                error_data=SelectorTypeErrorData(dagster_type=selector_type, incoming_fields=[]),
            )
            return

        field_name, field_def = single_item(selector_type.fields)

        if not field_def.is_optional:
            yield EvaluationError(
                stack=stack,
                reason=DagsterEvaluationErrorReason.SELECTOR_FIELD_ERROR,
                message=(
                    'Must specify the required field. Defined fields: {defined_fields}'
                ).format(defined_fields=defined_fields),
                error_data=SelectorTypeErrorData(dagster_type=selector_type, incoming_fields=[]),
            )
            return

        incoming_field_value = field_def.default_value if field_def.default_provided else None

    else:
        check.invariant(config_value and len(config_value) == 1)

        field_name, incoming_field_value = single_item(config_value)

    if field_name not in selector_type.fields:
        yield create_field_not_defined_error(selector_type, stack, field_name)
        return

    parent_field = selector_type.fields[field_name]
    for error in _validate_config(
        parent_field.config_type,
        incoming_field_value,
        stack_with_field(stack, field_name, parent_field),
    ):
        yield error
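# Hedged sketch of the selector contract enforced above (field names are hypothetical,
# for illustration only): at most one branch of a selector may be supplied at a time.
#
#     valid_selector_value = {'filesystem': {'base_dir': '/tmp/dagster'}}
#     invalid_selector_value = {'filesystem': {}, 'in_memory': {}}  # two fields -> SELECTOR_FIELD_ERROR
#     empty_selector_value = {}  # only allowed when a single optional field is defined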
def __init__(
    self,
    config,
    default_value=FIELD_NO_DEFAULT_PROVIDED,
    is_optional=None,
    is_required=None,
    description=None,
):
    from .validate import validate_config
    from .post_process import post_process_config

    self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)

    self.description = check.opt_str_param(description, 'description')

    check.opt_bool_param(is_optional, 'is_optional')
    check.opt_bool_param(is_required, 'is_required')

    canonical_is_required = canonicalize_backcompat_args(
        new_val=is_required,
        new_arg='is_required',
        old_val=is_optional,
        old_arg='is_optional',
        coerce_old_to_new=lambda val: not val,
        additional_warn_txt='"is_optional" deprecated in 0.7.0 and will be removed in 0.8.0. Users should use "is_required" instead.',
    )

    if canonical_is_required is True:
        check.param_invariant(
            default_value == FIELD_NO_DEFAULT_PROVIDED,
            'default_value',
            'required arguments should not specify default values',
        )
    self._default_value = default_value

    # check explicit default value
    if self.default_provided:
        # invoke through property in case it is callable
        value = self.default_value
        evr = validate_config(self.config_type, value)
        if not evr.success:
            raise DagsterInvalidConfigError(
                'Invalid default_value for Field.',
                evr.errors,
                default_value,
            )

    if canonical_is_required is None:
        # neither is_required nor is_optional were specified
        canonical_is_required = not all_optional_type(self.config_type)

        # on implicitly optional - set the default value
        # by resolving the defaults of the type
        if not canonical_is_required and self._default_value == FIELD_NO_DEFAULT_PROVIDED:
            evr = post_process_config(self.config_type, None)
            if not evr.success:
                raise DagsterInvalidConfigError(
                    'Unable to resolve implicit default_value for Field.',
                    evr.errors,
                    None,
                )
            self._default_value = evr.value
    self._is_required = canonical_is_required
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn):
    from .tasks import make_app

    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.callable_param(step_execution_fn, 'step_execution_fn')

    check.param_invariant(
        isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)),
        'pipeline_context',
        'Expected executor_config to be Celery config got {}'.format(
            pipeline_context.executor_config
        ),
    )

    celery_config = pipeline_context.executor_config

    # https://github.com/dagster-io/dagster/issues/2440
    check.invariant(
        pipeline_context.system_storage_def.is_persistent,
        'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or '
        'similar system that allows files to be available to all nodes), S3, or GCS',
    )

    app = make_app(celery_config)

    priority_for_step = lambda step: (
        -1 * int(step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority))
        + -1 * _get_run_priority(pipeline_context)
    )
    priority_for_key = lambda step_key: (
        priority_for_step(execution_plan.get_step_by_key(step_key))
    )
    _warn_on_priority_misuse(pipeline_context, execution_plan)

    step_results = {}  # Dict[ExecutionStep, celery.AsyncResult]
    step_errors = {}
    completed_steps = set({})  # Set[step_key]
    active_execution = execution_plan.start(
        retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step
    )
    stopping = False

    while (not active_execution.is_complete and not stopping) or step_results:

        results_to_pop = []
        for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])):
            if result.ready():
                try:
                    step_events = result.get()
                except Exception:  # pylint: disable=broad-except
                    # We will want to do more to handle the exception here... maybe subclass Task
                    # Certainly yield an engine or pipeline event
                    step_events = []
                    step_errors[step_key] = serializable_error_info_from_exc_info(sys.exc_info())
                    stopping = True
                for step_event in step_events:
                    event = deserialize_json_to_dagster_namedtuple(step_event)
                    yield event
                    active_execution.handle_event(event)

                results_to_pop.append(step_key)
                completed_steps.add(step_key)

        for step_key in results_to_pop:
            if step_key in step_results:
                del step_results[step_key]
                active_execution.verify_complete(pipeline_context, step_key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(pipeline_context):
            yield event

        # don't add any new steps if we are stopping
        if stopping:
            continue

        # This is a slight refinement. If we have n workers idle and schedule m > n steps for
        # execution, the first n steps will be picked up by the idle workers in the order in
        # which they are scheduled (and the following m-n steps will be executed in priority
        # order, provided that it takes longer to execute a step than to schedule it). The test
        # case has m >> n to exhibit this behavior in the absence of this sort step.
        for step in active_execution.get_steps_to_execute():
            try:
                queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue)
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Submitting celery task for step "{step_key}" to queue "{queue}".'.format(
                        step_key=step.key, queue=queue
                    ),
                    EngineEventData(marker_start=DELEGATE_MARKER),
                    step_key=step.key,
                )

                # Get the Celery priority for this step
                priority = _get_step_priority(pipeline_context, step)

                # Submit the Celery tasks
                step_results[step.key] = step_execution_fn(
                    app, pipeline_context, step, queue, priority
                )

            except Exception:
                yield DagsterEvent.engine_event(
                    pipeline_context,
                    'Encountered error during celery task submission.',
                    event_specific_data=EngineEventData.engine_error(
                        serializable_error_info_from_exc_info(sys.exc_info()),
                    ),
                )
                raise

        time.sleep(TICK_SECONDS)

    if step_errors:
        raise DagsterSubprocessError(
            'During celery execution errors occurred in workers:\n{error_list}'.format(
                error_list='\n'.join(
                    [
                        '[{step}]: {err}'.format(step=key, err=err.to_string())
                        for key, err in step_errors.items()
                    ]
                )
            ),
            subprocess_error_infos=list(step_errors.values()),
        )