def create_schedule_definition( self, schedule_name, cron_schedule, partition_selector, should_execute=None, environment_vars=None, execution_timezone=None, ): """Create a ScheduleDefinition from a PartitionSetDefinition. Arguments: schedule_name (str): The name of the schedule. cron_schedule (str): A valid cron string for the schedule partition_selector (Callable[ScheduleExecutionContext, PartitionSetDefinition], Partition): Function that determines the partition to use at a given execution time. For time-based partition sets, will likely be either `identity_partition_selector` or a selector returned by `create_offset_partition_selector`. should_execute (Optional[function]): Function that runs at schedule execution time that determines whether a schedule should execute. Defaults to a function that always returns ``True``. environment_vars (Optional[dict]): The environment variables to set for the schedule. execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works with DagsterDaemonScheduler, and must be set when using that scheduler. Returns: ScheduleDefinition: The generated ScheduleDefinition for the partition selector """ check.str_param(schedule_name, "schedule_name") check.str_param(cron_schedule, "cron_schedule") check.opt_callable_param(should_execute, "should_execute") check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str) check.callable_param(partition_selector, "partition_selector") check.opt_str_param(execution_timezone, "execution_timezone") def _should_execute_wrapper(context): check.inst_param(context, "context", ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition or not selected_partition.name in self.get_partition_names(): return False elif not should_execute: return True else: return should_execute(context) def _run_config_fn_wrapper(context): check.inst_param(context, "context", ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition or not selected_partition.name in self.get_partition_names(): raise DagsterInvariantViolationError( "The partition selection function `{selector}` did not return " "a partition from PartitionSet {partition_set}".format( selector=getattr(partition_selector, "__name__", repr(partition_selector)), partition_set=self.name, ) ) return self.run_config_for_partition(selected_partition) def _tags_fn_wrapper(context): check.inst_param(context, "context", ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition: raise DagsterInvariantViolationError( "The partition selection function `{selector}` did not return " "a partition from PartitionSet {partition_set}".format( selector=getattr(partition_selector, "__name__", repr(partition_selector)), partition_set=self.name, ) ) return self.tags_for_partition(selected_partition) return PartitionScheduleDefinition( name=schedule_name, cron_schedule=cron_schedule, pipeline_name=self.pipeline_name, run_config_fn=_run_config_fn_wrapper, tags_fn=_tags_fn_wrapper, solid_selection=self.solid_selection, mode=self.mode, should_execute=_should_execute_wrapper, environment_vars=environment_vars, partition_set=self, execution_timezone=execution_timezone, )
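# Hedged usage sketch for create_schedule_definition above. The partition set, its
# partition function, and its run-config function are hypothetical stand-ins, and the
# dagster names (PartitionSetDefinition, Partition, identity_partition_selector) are
# assumed to be imported; exact constructor arguments may differ by version.
example_partition_set = PartitionSetDefinition(
    name="example_partition_set",
    pipeline_name="example_pipeline",
    partition_fn=lambda: [Partition("a"), Partition("b")],
    run_config_fn_for_partition=lambda partition: {},
)

example_schedule = example_partition_set.create_schedule_definition(
    schedule_name="example_daily_schedule",
    cron_schedule="0 10 * * *",
    partition_selector=identity_partition_selector,
    execution_timezone="US/Central",
)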
def _create_solid_compute_wrapper(fn, input_defs, output_defs): check.callable_param(fn, 'fn') check.list_param(input_defs, 'input_defs', of_type=InputDefinition) check.list_param(output_defs, 'output_defs', of_type=OutputDefinition) input_names = [ input_def.name for input_def in input_defs if not input_def.runtime_type.is_nothing ] @wraps(fn) def compute(context, input_defs): kwargs = {} for input_name in input_names: kwargs[input_name] = input_defs[input_name] result = fn(context, **kwargs) if inspect.isgenerator(result): for item in result: yield item else: if isinstance(result, (Materialization, ExpectationResult)): raise DagsterInvariantViolationError( ( 'Error in solid {solid_name}: If you are returning a Materialization ' 'or an ExpectationResult from solid you must yield them to avoid ' 'ambiguity with an implied result from returning a value.'.format( solid_name=context.solid.name ) ) ) if isinstance(result, Output): yield result elif len(output_defs) == 1: yield Output(value=result, output_name=output_defs[0].name) elif result is not None: if not output_defs: raise DagsterInvariantViolationError( ( 'Error in solid {solid_name}: Unexpectedly returned output {result} ' 'of type {type_}. Solid is explicitly defined to return no ' 'results.' ).format(solid_name=context.solid.name, result=result, type_=type(result)) ) raise DagsterInvariantViolationError( ( 'Error in solid {solid_name}: Solid unexpectedly returned ' 'output {result} of type {type_}. Should ' 'be a generator, containing or yielding ' '{n_results} results: {{{expected_results}}}.' ).format( solid_name=context.solid.name, result=result, type_=type(result), n_results=len(output_defs), expected_results=', '.join( [ '\'{result_name}\': {runtime_type}'.format( result_name=output_def.name, runtime_type=output_def.runtime_type, ) for output_def in output_defs ] ), ) ) return compute
def __call__(self, fn): check.callable_param(fn, 'fn') if not self.name: self.name = fn.__name__ input_defs = ( self.input_defs if self.input_defs is not None else infer_input_definitions_for_composite_solid(self.name, fn) ) explicit_outputs = False if self.output_defs is not None: explicit_outputs = True output_defs = self.output_defs else: explicit_outputs = has_explicit_return_type(fn) output_defs = infer_output_definitions('@composite_solid', self.name, fn) positional_inputs = validate_solid_fn( '@composite_solid', self.name, fn, input_defs, exclude_nothing=False ) kwargs = {input_def.name: InputMappingNode(input_def) for input_def in input_defs} output = None mapping = None enter_composition(self.name, '@composite_solid') try: output = fn(**kwargs) mapping = composite_mapping_from_output(output, output_defs, self.name) finally: context = exit_composition(mapping) check.invariant( context.name == self.name, 'Composition context stack desync: received context for ' '"{context.name}" expected "{self.name}"'.format(context=context, self=self), ) # line up mappings in definition order input_mappings = [] for defn in input_defs: mappings = [ mapping for mapping in context.input_mappings if mapping.definition.name == defn.name ] if len(mappings) == 0: raise DagsterInvalidDefinitionError( "@composite_solid '{solid_name}' has unmapped input '{input_name}'. " "Remove it or pass it to the appropriate solid invocation.".format( solid_name=self.name, input_name=defn.name ) ) input_mappings += mappings output_mappings = [] for defn in output_defs: mapping = context.output_mapping_dict.get(defn.name) if mapping is None: # if we inferred output_defs we will be flexible and either take a mapping or not if not explicit_outputs: continue raise DagsterInvalidDefinitionError( "@composite_solid '{solid_name}' has unmapped output '{output_name}'. " "Remove it or return a value from the appropriate solid invocation.".format( solid_name=self.name, output_name=defn.name ) ) output_mappings.append(mapping) config_mapping = _get_validated_config_mapping(self.name, self.config, self.config_fn) return CompositeSolidDefinition( name=self.name, input_mappings=input_mappings, output_mappings=output_mappings, dependencies=context.dependencies, solid_defs=context.solid_defs, description=self.description, config_mapping=config_mapping, positional_inputs=positional_inputs, )
def __init__(self, resource_fn, config_field=None, description=None):
    self._resource_fn = check.callable_param(resource_fn, 'resource_fn')
    self._config_field = check_user_facing_opt_field_param(
        config_field, 'config_field', 'of a ResourceDefinition or @resource'
    )
    self._description = check.opt_str_param(description, 'description')
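# Hedged sketch of constructing the resource definition above; the resource function
# returns a plain dict standing in for a real client object, and ResourceDefinition is
# assumed to be imported from dagster.
def _example_resource_fn(init_context):
    # init_context is the resource initialization context supplied by the framework
    return {"connection": "example"}

example_resource = ResourceDefinition(
    resource_fn=_example_resource_fn,
    description="A toy resource used for illustration",
)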
def __init__(self, manager_fn, marks):
    self.manager_fn = check.callable_param(manager_fn, "manager_fn")
    self.marks = check.list_param(marks, "marks")
def create_offset_partition_selector(execution_time_to_partition_fn):
    """Utility function for supplying a partition selector when creating a schedule from a
    partition set made of `datetime`s, assuming a fixed time offset between the partition
    time and the time at which the schedule executes.

    It's important to keep the cron string that's supplied to
    `PartitionSetDefinition.create_schedule_definition` in sync with the offset that's
    supplied to this function. For example, a schedule created from a partition set with
    partitions for each day at midnight that fills in the partition for day N at day N+1
    at 10:00AM would create the partition selector as follows:

    .. code-block:: python

        partition_set = PartitionSetDefinition(
            name='hello_world_partition_set',
            pipeline_name='hello_world_pipeline',
            partition_fn=date_partition_range(
                start=datetime.datetime(2021, 1, 1),
                delta_range="days",
                timezone="US/Central",
            ),
            run_config_fn_for_partition=my_run_config_fn,
        )

        schedule_definition = partition_set.create_schedule_definition(
            "daily_10am_schedule",
            "0 10 * * *",
            partition_selector=create_offset_partition_selector(
                lambda d: d.subtract(hours=10, days=1)
            ),
            execution_timezone="US/Central",
        )

    Args:
        execution_time_to_partition_fn (Callable[[datetime.datetime], datetime.datetime]): A
            function that maps the execution time of the schedule to the partition time.
    """

    check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")

    def offset_partition_selector(context, partition_set_def):
        check.inst_param(context, "context", ScheduleExecutionContext)
        check.inst_param(partition_set_def, "partition_set_def", PartitionSetDefinition)

        if not context.scheduled_execution_time:
            partitions = partition_set_def.get_partitions()
            if not partitions:
                return None
            return partitions[-1]

        partition_time = execution_time_to_partition_fn(context.scheduled_execution_time)

        for partition in reversed(
            partition_set_def.get_partitions(context.scheduled_execution_time)
        ):
            if partition.value.isoformat() == partition_time.isoformat():
                return partition

            if partition.value < partition_time:
                break

        return None

    return offset_partition_selector
def create_schedule_definition( self, schedule_name, cron_schedule, should_execute=None, partition_selector=last_partition, environment_vars=None, ): '''Create a ScheduleDefinition from a PartitionSetDefinition. Arguments: schedule_name (str): The name of the schedule. cron_schedule (str): A valid cron string for the schedule should_execute (Optional[function]): Function that runs at schedule execution time that determines whether a schedule should execute. Defaults to a function that always returns ``True``. partition_selector (Callable[PartitionSet], Partition): A partition selector for the schedule environment_vars (Optional[dict]): The environment variables to set for the schedule Returns: ScheduleDefinition -- The generated ScheduleDefinition for the partition selector ''' check.str_param(schedule_name, 'schedule_name') check.str_param(cron_schedule, 'cron_schedule') check.opt_callable_param(should_execute, 'should_execute') check.opt_dict_param(environment_vars, 'environment_vars', key_type=str, value_type=str) check.callable_param(partition_selector, 'partition_selector') def _should_execute_wrapper(context): check.inst_param(context, 'context', ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition: return False elif not should_execute: return True else: return should_execute(context) def _environment_dict_fn_wrapper(context): check.inst_param(context, 'context', ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition: raise DagsterInvariantViolationError( "The partition selection function `{selector}` did not return " "a partition from PartitionSet {partition_set}".format( selector=getattr(partition_selector, '__name__', repr(partition_selector)), partition_set=self.name, ) ) return self.environment_dict_for_partition(selected_partition) def _tags_fn_wrapper(context): check.inst_param(context, 'context', ScheduleExecutionContext) selected_partition = partition_selector(context, self) if not selected_partition: raise DagsterInvariantViolationError( "The partition selection function `{selector}` did not return " "a partition from PartitionSet {partition_set}".format( selector=getattr(partition_selector, '__name__', repr(partition_selector)), partition_set=self.name, ) ) return self.tags_for_partition(selected_partition) return PartitionScheduleDefinition( name=schedule_name, cron_schedule=cron_schedule, pipeline_name=self.pipeline_name, environment_dict_fn=_environment_dict_fn_wrapper, tags_fn=_tags_fn_wrapper, solid_subset=self.solid_subset, mode=self.mode, should_execute=_should_execute_wrapper, environment_vars=environment_vars, partition_set=self, )
def __call__(self, fn: Callable[[], Any]) -> RepositoryDefinition: from dagster.core.asset_defs import AssetGroup check.callable_param(fn, "fn") if not self.name: self.name = fn.__name__ repository_definitions = fn() if not (isinstance(repository_definitions, list) or isinstance(repository_definitions, dict) or isinstance(repository_definitions, RepositoryData)): raise DagsterInvalidDefinitionError( "Bad return value of type {type_} from repository construction function: must " "return list, dict, or RepositoryData. See the @repository decorator docstring for " "details and examples".format( type_=type(repository_definitions)), ) if isinstance(repository_definitions, list): bad_definitions = [] for i, definition in enumerate(repository_definitions): if not (isinstance(definition, PipelineDefinition) or isinstance(definition, PartitionSetDefinition) or isinstance(definition, ScheduleDefinition) or isinstance(definition, SensorDefinition) or isinstance(definition, GraphDefinition) or isinstance(definition, AssetGroup)): bad_definitions.append((i, type(definition))) if bad_definitions: bad_definitions_str = ", ".join([ "value of type {type_} at index {i}".format(type_=type_, i=i) for i, type_ in bad_definitions ]) raise DagsterInvalidDefinitionError( "Bad return value from repository construction function: all elements of list " "must be of type JobDefinition, GraphDefinition, PipelineDefinition, " "PartitionSetDefinition, ScheduleDefinition, or SensorDefinition. " f"Got {bad_definitions_str}.") repository_data = CachingRepositoryData.from_list( repository_definitions) elif isinstance(repository_definitions, dict): if not set(repository_definitions.keys()).issubset( VALID_REPOSITORY_DATA_DICT_KEYS): raise DagsterInvalidDefinitionError( "Bad return value from repository construction function: dict must not contain " "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found " "{bad_keys}".format(bad_keys=", ".join([ "'{key}'".format(key=key) for key in repository_definitions.keys() if key not in VALID_REPOSITORY_DATA_DICT_KEYS ]))) repository_data = CachingRepositoryData.from_dict( repository_definitions) elif isinstance(repository_definitions, RepositoryData): repository_data = repository_definitions repository_def = RepositoryDefinition(name=self.name, description=self.description, repository_data=repository_data) update_wrapper(repository_def, fn) return repository_def
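# Hedged sketch of a function the decorator __call__ above would wrap: returning a list
# of definitions. The names below are hypothetical and assumed to be defined elsewhere.
@repository
def example_repository():
    return [example_pipeline, example_partition_set, example_schedule]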
def split_function_parameters(fn, expected_positionals):
    check.callable_param(fn, 'fn')
    check.list_param(expected_positionals, 'expected_positionals', str)
    fn_params = list(funcsigs.signature(fn).parameters.values())
    return fn_params[0:len(expected_positionals)], fn_params[len(expected_positionals):]
def __init__( self, name: str, pipeline_run_status: PipelineRunStatus, run_status_sensor_fn: Callable[[RunStatusSensorContext], Union[SkipReason, PipelineRunReaction]], pipeline_selection: Optional[List[str]] = None, minimum_interval_seconds: Optional[int] = None, description: Optional[str] = None, job_selection: Optional[List[Union[PipelineDefinition, GraphDefinition]]] = None, ): from dagster.core.storage.event_log.base import RunShardedEventsCursor, EventRecordsFilter check.str_param(name, "name") check.inst_param(pipeline_run_status, "pipeline_run_status", PipelineRunStatus) check.callable_param(run_status_sensor_fn, "run_status_sensor_fn") check.opt_list_param(pipeline_selection, "pipeline_selection", str) check.opt_int_param(minimum_interval_seconds, "minimum_interval_seconds") check.opt_str_param(description, "description") check.opt_list_param(job_selection, "job_selection", (PipelineDefinition, GraphDefinition)) def _wrapped_fn(context: SensorEvaluationContext): # initiate the cursor to (most recent event id, current timestamp) when: # * it's the first time starting the sensor # * or, the cursor isn't in valid format (backcompt) if context.cursor is None or not RunStatusSensorCursor.is_valid( context.cursor): most_recent_event_records = list( context.instance.get_event_records(ascending=False, limit=1)) most_recent_event_id = (most_recent_event_records[0].storage_id if len(most_recent_event_records) == 1 else -1) new_cursor = RunStatusSensorCursor( update_timestamp=pendulum.now("UTC").isoformat(), record_id=most_recent_event_id, ) context.update_cursor(new_cursor.to_json()) yield SkipReason( f"Initiating {name}. Set cursor to {new_cursor}") return record_id, update_timestamp = RunStatusSensorCursor.from_json( context.cursor) # Fetch events after the cursor id # * we move the cursor forward to the latest visited event's id to avoid revisits # * when the daemon is down, bc we persist the cursor info, we can go back to where we # left and backfill alerts for the qualified events (up to 5 at a time) during the downtime # Note: this is a cross-run query which requires extra handling in sqlite, see details in SqliteEventLogStorage. event_records = context.instance.get_event_records( EventRecordsFilter( after_cursor=RunShardedEventsCursor( id=record_id, run_updated_after=cast( datetime, pendulum.parse(update_timestamp)), ), event_type=PIPELINE_RUN_STATUS_TO_EVENT_TYPE[ pipeline_run_status], ), ascending=True, limit=5, ) for event_record in event_records: event_log_entry = event_record.event_log_entry storage_id = event_record.storage_id # get run info run_records = context.instance.get_run_records( filters=PipelineRunsFilter( run_ids=[event_log_entry.run_id])) # skip if we couldn't find the right run if len(run_records) != 1: # bc we couldn't find the run, we use the event timestamp as the approximate # run update timestamp approximate_update_timestamp = utc_datetime_from_timestamp( event_log_entry.timestamp) context.update_cursor( RunStatusSensorCursor( record_id=storage_id, update_timestamp=approximate_update_timestamp. isoformat(), ).to_json()) continue pipeline_run = run_records[0].pipeline_run update_timestamp = run_records[0].update_timestamp # skip if any of of the followings happens: if ( # the pipeline does not have a repository (manually executed) not pipeline_run.external_pipeline_origin or # the pipeline does not belong to the current repository pipeline_run.external_pipeline_origin. 
external_repository_origin.repository_name != context.repository_name or # if pipeline is not selected (pipeline_selection and pipeline_run.pipeline_name not in pipeline_selection) or # if job not selected (job_selection and pipeline_run.pipeline_name not in map( lambda x: x.name, job_selection))): context.update_cursor( RunStatusSensorCursor( record_id=storage_id, update_timestamp=update_timestamp.isoformat()). to_json()) continue serializable_error = None try: with user_code_error_boundary( RunStatusSensorExecutionError, lambda: f'Error occurred during the execution sensor "{name}".', ): # one user code invocation maps to one failure event run_status_sensor_fn( RunStatusSensorContext( sensor_name=name, dagster_run=pipeline_run, dagster_event=event_log_entry.dagster_event, instance=context.instance, )) except RunStatusSensorExecutionError as run_status_sensor_execution_error: # When the user code errors, we report error to the sensor tick not the original run. serializable_error = serializable_error_info_from_exc_info( run_status_sensor_execution_error.original_exc_info) context.update_cursor( RunStatusSensorCursor(record_id=storage_id, update_timestamp=update_timestamp. isoformat()).to_json()) # Yield PipelineRunReaction to indicate the execution success/failure. # The sensor machinery would # * report back to the original run if success # * update cursor and job state yield PipelineRunReaction( pipeline_run=pipeline_run, error=serializable_error, ) super(RunStatusSensorDefinition, self).__init__( name=name, evaluation_fn=_wrapped_fn, minimum_interval_seconds=minimum_interval_seconds, description=description, )
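# Hedged sketch of constructing the run status sensor definition above directly; the
# callback simply prints, and most user code would reach this via a decorator such as
# @run_status_sensor instead. RunStatusSensorDefinition and PipelineRunStatus are
# assumed to be imported from dagster.
def _notify_on_success(context):
    # context is a RunStatusSensorContext for the completed run
    print(f"Run {context.dagster_run.run_id} finished successfully")

success_sensor = RunStatusSensorDefinition(
    name="success_sensor",
    pipeline_run_status=PipelineRunStatus.SUCCESS,
    run_status_sensor_fn=_notify_on_success,
    minimum_interval_seconds=30,
)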
def create_schedule_definition( self, schedule_name, cron_schedule, partition_selector, should_execute=None, environment_vars=None, execution_timezone=None, description=None, ): """Create a ScheduleDefinition from a PartitionSetDefinition. Arguments: schedule_name (str): The name of the schedule. cron_schedule (str): A valid cron string for the schedule partition_selector (Callable[ScheduleExecutionContext, PartitionSetDefinition], Partition): Function that determines the partition to use at a given execution time. For time-based partition sets, will likely be either `identity_partition_selector` or a selector returned by `create_offset_partition_selector`. should_execute (Optional[function]): Function that runs at schedule execution time that determines whether a schedule should execute. Defaults to a function that always returns ``True``. environment_vars (Optional[dict]): The environment variables to set for the schedule. execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works with DagsterDaemonScheduler, and must be set when using that scheduler. description (Optional[str]): A human-readable description of the schedule. Returns: ScheduleDefinition: The generated ScheduleDefinition for the partition selector """ check.str_param(schedule_name, "schedule_name") check.str_param(cron_schedule, "cron_schedule") check.opt_callable_param(should_execute, "should_execute") check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str) check.callable_param(partition_selector, "partition_selector") check.opt_str_param(execution_timezone, "execution_timezone") check.opt_str_param(description, "description") def _execution_fn(context): check.inst_param(context, "context", ScheduleExecutionContext) with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the execution of partition_selector for schedule {schedule_name}", ): selected_partition = partition_selector(context, self) if not selected_partition: yield SkipReason( "Partition selector did not return a partition. Make sure that the timezone " "on your partition set matches your execution timezone.") return if selected_partition.name not in self.get_partition_names( context.scheduled_execution_time): yield SkipReason( f"Partition selector returned a partition {selected_partition.name} not in the partition set." ) return with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}", ): if should_execute and not should_execute(context): yield SkipReason( "should_execute function for {schedule_name} returned false." 
.format(schedule_name=schedule_name)) return with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the execution of run_config_fn for schedule {schedule_name}", ): run_config = self.run_config_for_partition(selected_partition) with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the execution of tags_fn for schedule {schedule_name}", ): tags = self.tags_for_partition(selected_partition) yield RunRequest( run_key=None, run_config=run_config, tags=tags, ) return PartitionScheduleDefinition( name=schedule_name, cron_schedule=cron_schedule, pipeline_name=self.pipeline_name, tags_fn=None, solid_selection=self.solid_selection, mode=self.mode, should_execute=None, environment_vars=environment_vars, partition_set=self, execution_timezone=execution_timezone, execution_fn=_execution_fn, description=description, )
def validate_solid_fn( decorator_name: str, fn_name: str, compute_fn: Callable[..., Any], input_defs: List[InputDefinition], expected_positionals: Optional[List[str]] = None, exclude_nothing: Optional[bool] = True, ) -> List[str]: check.str_param(decorator_name, "decorator_name") check.str_param(fn_name, "fn_name") check.callable_param(compute_fn, "compute_fn") check.list_param(input_defs, "input_defs", of_type=InputDefinition) expected_positionals = check.opt_list_param(expected_positionals, "expected_positionals", of_type=str) if exclude_nothing: names = set(inp.name for inp in input_defs if not inp.dagster_type.kind == DagsterTypeKind.NOTHING) nothing_names = set( inp.name for inp in input_defs if inp.dagster_type.kind == DagsterTypeKind.NOTHING) else: names = set(inp.name for inp in input_defs) nothing_names = set() # Currently being super strict about naming. Might be a good idea to relax. Starting strict. fn_positionals, input_args = split_function_parameters( compute_fn, expected_positionals) # Validate Positional Parameters missing_positional = validate_decorated_fn_positionals( fn_positionals, expected_positionals) if missing_positional: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function does not have required positional " "parameter '{missing_param}'. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'." .format(decorator_name=decorator_name, solid_name=fn_name, missing_param=missing_positional)) # Validate non positional parameters invalid_function_info = validate_decorated_fn_input_args(names, input_args) if invalid_function_info: if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[ "vararg"]: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has positional vararg parameter " "'{param}'. Solid functions should only have keyword arguments that match " "input names and a first positional parameter named 'context'." .format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, )) elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[ "missing_name"]: if invalid_function_info.param in nothing_names: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is " "one of the solid input_defs of type 'Nothing' which should not be included since " "no data will be passed for it. ".format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, )) else: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not " "one of the solid input_defs. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'." .format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, )) elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES[ "extra"]: undeclared_inputs_printed = ", '".join( invalid_function_info.missing_names) raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function does not have parameter(s) " "'{undeclared_inputs_printed}', which are in solid's input_defs. 
Solid functions " "should only have keyword arguments that match input names and a first positional " "parameter named 'context'.".format( decorator_name=decorator_name, solid_name=fn_name, undeclared_inputs_printed=undeclared_inputs_printed, )) return positional_arg_name_list(input_args)
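# Hedged sketch of a compute function that satisfies the validation performed above:
# a leading positional parameter named 'context' followed by keyword-style parameters
# whose names match the declared input definitions (names here are hypothetical).
def example_compute_fn(context, num_a, num_b):
    return num_a + num_b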
def solid_execution_error_boundary(error_cls, msg_fn, step_context, **kwargs): """ A specialization of user_code_error_boundary for the steps involved in executing a solid. This variant supports the control flow exceptions RetryRequested and Failure as well as respecting the RetryPolicy if present. """ from dagster.core.execution.context.system import StepExecutionContext check.callable_param(msg_fn, "msg_fn") check.class_param(error_cls, "error_cls", superclass=DagsterUserCodeExecutionError) check.inst_param(step_context, "step_context", StepExecutionContext) with raise_execution_interrupts(): step_context.log.begin_python_log_capture() retry_policy = step_context.solid_retry_policy try: yield except DagsterError as de: # The system has thrown an error that is part of the user-framework contract raise de except Exception as e: # pylint: disable=W0703 # An exception has been thrown by user code and computation should cease # with the error reported further up the stack # Directly thrown RetryRequested escalate before evaluating the retry policy. if isinstance(e, RetryRequested): raise e if retry_policy: raise RetryRequested( max_retries=retry_policy.max_retries, seconds_to_wait=retry_policy.calculate_delay( step_context.previous_attempt_count + 1), ) from e # Failure exceptions get re-throw without wrapping if isinstance(e, Failure): raise e # Otherwise wrap the user exception with context raise error_cls( msg_fn(), user_exception=e, original_exc_info=sys.exc_info(), **kwargs, ) from e except (DagsterExecutionInterruptedError, KeyboardInterrupt) as ie: # respect retry policy when interrupts occur if retry_policy: raise RetryRequested( max_retries=retry_policy.max_retries, seconds_to_wait=retry_policy.calculate_delay( step_context.previous_attempt_count + 1), ) from ie else: raise ie finally: step_context.log.end_python_log_capture()
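# Hedged illustration of how the boundary above is typically entered around a user
# compute call; `step_context` and `do_compute` are hypothetical, and the error class
# is one plausible DagsterUserCodeExecutionError subclass.
with solid_execution_error_boundary(
    DagsterExecutionStepExecutionError,
    lambda: "Error occurred while executing the solid's compute function",
    step_context,
):
    do_compute()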
def __init__(self, config_type, func, required_resource_keys):
    self._config_type = check.inst_param(config_type, 'config_type', ConfigType)
    self._func = check.callable_param(func, 'func')
    self._required_resource_keys = check.opt_set_param(
        required_resource_keys, 'required_resource_keys', of_type=str
    )
def __init__( self, type_check_fn, key=None, name=None, is_builtin=False, description=None, loader=None, materializer=None, serialization_strategy=None, auto_plugins=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, ): check.opt_str_param(key, "key") check.opt_str_param(name, "name") check.invariant(not (name is None and key is None), "Must set key or name") if name is None: check.param_invariant( bool(key), "key", "If name is not provided, must provide key.", ) self.key, self._name = key, None elif key is None: check.param_invariant( bool(name), "name", "If key is not provided, must provide name.", ) self.key, self._name = name, name else: check.invariant(key and name) self.key, self._name = key, name self.description = check.opt_str_param(description, "description") self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader) self.materializer = check.opt_inst_param( materializer, "materializer", DagsterTypeMaterializer ) self.serialization_strategy = check.opt_inst_param( serialization_strategy, "serialization_strategy", SerializationStrategy, PickleSerializationStrategy(), ) self.required_resource_keys = check.opt_set_param( required_resource_keys, "required_resource_keys", ) self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn") _validate_type_check_fn(self._type_check_fn, self._name) auto_plugins = check.opt_list_param(auto_plugins, "auto_plugins", of_type=type) check.param_invariant( all( issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins ), "auto_plugins", ) self.auto_plugins = auto_plugins self.is_builtin = check.bool_param(is_builtin, "is_builtin") check.invariant( self.display_name is not None, "All types must have a valid display name, got None for key {}".format(key), ) self.kind = check.inst_param(kind, "kind", DagsterTypeKind)
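# Hedged sketch of constructing the type above: the type check function receives a
# context and the value and returns a boolean (or a TypeCheck), per the validation in
# _validate_type_check_fn. DagsterType is assumed to be imported from dagster.
def _is_even(_context, value):
    return isinstance(value, int) and value % 2 == 0

EvenDagsterType = DagsterType(
    name="EvenDagsterType",
    type_check_fn=_is_even,
    description="An even integer",
)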
def inner( fn: Callable[ ..., Union[RunRequest, SkipReason, RunConfig, RunRequestGenerator], ] ) -> ScheduleDefinition: check.callable_param(fn, "fn") schedule_name = name or fn.__name__ # perform upfront validation of schedule tags _tags_fn: Optional[Callable[["ScheduleEvaluationContext"], Dict[str, str]]] = None if tags_fn and tags: raise DagsterInvalidDefinitionError( "Attempted to provide both tags_fn and tags as arguments" " to ScheduleDefinition. Must provide only one of the two." ) elif tags: check_tags(tags, "tags") _tags_fn = cast(Callable[["ScheduleEvaluationContext"], Dict[str, str]], lambda _: tags) elif tags_fn: _tags_fn = cast( Callable[["ScheduleEvaluationContext"], Dict[str, str]], lambda context: tags_fn(context) or {}, ) def _wrapped_fn(context: "ScheduleEvaluationContext"): if should_execute: with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the execution of should_execute for schedule {schedule_name}", ): if not should_execute(context): yield SkipReason( f"should_execute function for {schedule_name} returned false." ) return with user_code_error_boundary( ScheduleExecutionError, lambda: f"Error occurred during the evaluation of schedule {schedule_name}", ): result = fn(context) if has_context_arg else fn() if isinstance(result, dict): # this is the run-config based decorated function, wrap the evaluated run config # and tags in a RunRequest evaluated_run_config = copy.deepcopy(result) evaluated_tags = _tags_fn(context) if _tags_fn else None yield RunRequest( run_key=None, run_config=evaluated_run_config, tags=evaluated_tags, ) else: # this is a run-request based decorated function yield from ensure_gen(result) has_context_arg = is_context_provided(get_function_params(fn)) evaluation_fn = DecoratedScheduleFunction( decorated_fn=fn, wrapped_fn=_wrapped_fn, has_context_arg=has_context_arg, ) schedule_def = ScheduleDefinition( name=schedule_name, cron_schedule=cron_schedule, pipeline_name=pipeline_name, solid_selection=solid_selection, mode=mode, environment_vars=environment_vars, execution_timezone=execution_timezone, description=description, execution_fn=evaluation_fn, job=job, default_status=default_status, ) update_wrapper(schedule_def, wrapped=fn) return schedule_def
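# Hedged example of a run-config based function that the wrapper above handles,
# assuming this is the inner function of dagster's @schedule decorator factory:
# returning a dict of run config causes it to be wrapped in a RunRequest. The pipeline
# and solid names below are hypothetical.
@schedule(cron_schedule="0 0 * * *", pipeline_name="example_pipeline")
def example_nightly_schedule(context):
    partition_date = context.scheduled_execution_time.strftime("%Y-%m-%d")
    return {"solids": {"example_solid": {"config": {"date": partition_date}}}}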
def schedule_partition_range(
    start,
    end,
    cron_schedule,
    fmt,
    timezone,
    execution_time_to_partition_fn,
):
    check.inst_param(start, "start", datetime.datetime)
    check.opt_inst_param(end, "end", datetime.datetime)
    check.str_param(cron_schedule, "cron_schedule")
    check.str_param(fmt, "fmt")
    check.opt_str_param(timezone, "timezone")
    check.callable_param(execution_time_to_partition_fn, "execution_time_to_partition_fn")

    if end and start > end:
        raise DagsterInvariantViolationError(
            'Selected date range start "{start}" is after date range end "{end}"'.format(
                start=start.strftime(fmt),
                end=end.strftime(fmt),
            )
        )

    def get_schedule_range_partitions(current_time=None):
        check.opt_inst_param(current_time, "current_time", datetime.datetime)
        tz = timezone if timezone else pendulum.now().timezone.name
        _start = (
            start.in_tz(tz)
            if isinstance(start, pendulum.Pendulum)
            else pendulum.instance(start, tz=tz)
        )

        if end:
            _end = end
        elif current_time:
            _end = current_time
        else:
            _end = pendulum.now(tz)

        # coerce to the definition timezone
        if isinstance(_end, pendulum.Pendulum):
            _end = _end.in_tz(tz)
        else:
            _end = pendulum.instance(_end, tz=tz)

        end_timestamp = _end.timestamp()

        partitions = []
        for next_time in schedule_execution_time_iterator(_start.timestamp(), cron_schedule, tz):
            partition_time = execution_time_to_partition_fn(next_time)

            if partition_time.timestamp() > end_timestamp:
                break

            if partition_time.timestamp() < _start.timestamp():
                continue

            partitions.append(
                Partition(value=partition_time, name=partition_time.strftime(fmt))
            )

        return partitions[:-1]

    return get_schedule_range_partitions
def event_generator( self, execution_plan, run_config, pipeline_run, instance, scoped_resources_builder_cm, intermediate_storage=None, raise_on_error=False, resource_instances_to_override=None, ): execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan) pipeline_def = execution_plan.pipeline.get_definition() run_config = check.dict_param(run_config, "run_config", key_type=str) pipeline_run = check.inst_param(pipeline_run, "pipeline_run", PipelineRun) instance = check.inst_param(instance, "instance", DagsterInstance) scoped_resources_builder_cm = check.callable_param( scoped_resources_builder_cm, "scoped_resources_builder_cm") intermediate_storage = check.opt_inst_param( intermediate_storage, "intermediate_storage_data", IntermediateStorage) raise_on_error = check.bool_param(raise_on_error, "raise_on_error") resource_instances_to_override = check.opt_dict_param( resource_instances_to_override, "resource_instances_to_override") execution_context = None resources_manager = None try: context_creation_data = create_context_creation_data( execution_plan, run_config, pipeline_run, instance, ) log_manager = create_log_manager(context_creation_data) resources_manager = scoped_resources_builder_cm( execution_plan, context_creation_data.environment_config, context_creation_data.pipeline_run, log_manager, context_creation_data.resource_keys_to_init, instance, resource_instances_to_override, ) yield from resources_manager.generate_setup_events() scoped_resources_builder = check.inst( resources_manager.get_object(), ScopedResourcesBuilder) intermediate_storage = create_intermediate_storage( context_creation_data, intermediate_storage, scoped_resources_builder, ) execution_context = self.construct_context( context_creation_data=context_creation_data, scoped_resources_builder=scoped_resources_builder, log_manager=log_manager, intermediate_storage=intermediate_storage, raise_on_error=raise_on_error, ) _validate_plan_with_context(execution_context, execution_plan) yield execution_context yield from resources_manager.generate_teardown_events() except DagsterError as dagster_error: if execution_context is None: user_facing_exc_info = ( # pylint does not know original_exc_info exists is is_user_code_error is true # pylint: disable=no-member dagster_error.original_exc_info if dagster_error.is_user_code_error else sys.exc_info()) error_info = serializable_error_info_from_exc_info( user_facing_exc_info) yield DagsterEvent.pipeline_init_failure( pipeline_name=pipeline_def.name, failure_data=PipelineInitFailureData(error=error_info), log_manager=_create_context_free_log_manager( instance, pipeline_run, pipeline_def), ) if resources_manager: yield from resources_manager.generate_teardown_events() else: # pipeline teardown failure raise dagster_error if raise_on_error: raise dagster_error
def _core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn): from .tasks import make_app check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext) check.inst_param(execution_plan, 'execution_plan', ExecutionPlan) check.callable_param(step_execution_fn, 'step_execution_fn') check.param_invariant( isinstance(pipeline_context.executor_config, (CeleryConfig, CeleryK8sJobConfig)), 'pipeline_context', 'Expected executor_config to be Celery config got {}'.format( pipeline_context.executor_config), ) celery_config = pipeline_context.executor_config storage = pipeline_context.environment_dict.get('storage') # https://github.com/dagster-io/dagster/issues/2440 check.invariant( pipeline_context.system_storage_def.is_persistent, 'Cannot use in-memory storage with Celery, use filesystem (on top of NFS or ' 'similar system that allows files to be available to all nodes), S3, or GCS', ) app = make_app(celery_config) priority_for_step = lambda step: (-1 * int( step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority) ) + -1 * _get_run_priority(pipeline_context)) priority_for_key = lambda step_key: (priority_for_step( execution_plan.get_step_by_key(step_key))) _warn_on_priority_misuse(pipeline_context, execution_plan) step_results = {} # Dict[ExecutionStep, celery.AsyncResult] step_errors = {} completed_steps = set({}) # Set[step_key] active_execution = execution_plan.start( retries=pipeline_context.executor_config.retries, sort_key_fn=priority_for_step) stopping = False while (not active_execution.is_complete and not stopping) or step_results: results_to_pop = [] for step_key, result in sorted(step_results.items(), key=lambda x: priority_for_key(x[0])): if result.ready(): try: step_events = result.get() except Exception as e: # pylint: disable=broad-except # We will want to do more to handle the exception here.. maybe subclass Task # Certainly yield an engine or pipeline event step_events = [] step_errors[ step_key] = serializable_error_info_from_exc_info( sys.exc_info()) stopping = True for step_event in step_events: event = deserialize_json_to_dagster_namedtuple(step_event) yield event active_execution.handle_event(event) results_to_pop.append(step_key) completed_steps.add(step_key) for step_key in results_to_pop: if step_key in step_results: del step_results[step_key] active_execution.verify_complete(pipeline_context, step_key) # process skips from failures or uncovered inputs for event in active_execution.skipped_step_events_iterator( pipeline_context): yield event # don't add any new steps if we are stopping if stopping: continue # This is a slight refinement. If we have n workers idle and schedule m > n steps for # execution, the first n steps will be picked up by the idle workers in the order in # which they are scheduled (and the following m-n steps will be executed in priority # order, provided that it takes longer to execute a step than to schedule it). The test # case has m >> n to exhibit this behavior in the absence of this sort step. for step in active_execution.get_steps_to_execute(): try: queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue) yield DagsterEvent.engine_event( pipeline_context, 'Submitting celery task for step "{step_key}" to queue "{queue}".' 
.format(step_key=step.key, queue=queue), EngineEventData(marker_start=DELEGATE_MARKER), step_key=step.key, ) # Get the Celery priority for this step priority = _get_step_priority(pipeline_context, step) # Submit the Celery tasks step_results[step.key] = step_execution_fn( app, pipeline_context, step, queue, priority) except Exception: yield DagsterEvent.engine_event( pipeline_context, 'Encountered error during celery task submission.'.format( ), event_specific_data=EngineEventData.engine_error( serializable_error_info_from_exc_info( sys.exc_info()), ), ) raise time.sleep(TICK_SECONDS) if step_errors: raise DagsterSubprocessError( 'During celery execution errors occurred in workers:\n{error_list}' .format(error_list='\n'.join([ '[{step}]: {err}'.format(step=key, err=err.to_string()) for key, err in step_errors.items() ])), subprocess_error_infos=list(step_errors.values()), )
def _create_solid_compute_wrapper(fn, input_defs, output_defs): check.callable_param(fn, "fn") check.list_param(input_defs, "input_defs", of_type=InputDefinition) check.list_param(output_defs, "output_defs", of_type=OutputDefinition) input_names = [ input_def.name for input_def in input_defs if not input_def.dagster_type.kind == DagsterTypeKind.NOTHING ] @wraps(fn) def compute(context, input_defs): kwargs = {} for input_name in input_names: kwargs[input_name] = input_defs[input_name] result = fn(context, **kwargs) if inspect.isgenerator(result): for item in result: yield item else: if isinstance( result, (AssetMaterialization, Materialization, ExpectationResult)): raise DagsterInvariantViolationError(( "Error in solid {solid_name}: If you are returning an AssetMaterialization " "or an ExpectationResult from solid you must yield them to avoid " "ambiguity with an implied result from returning a value.". format(solid_name=context.solid.name))) if isinstance(result, Output): yield result elif len(output_defs) == 1: yield Output(value=result, output_name=output_defs[0].name) elif result is not None: if not output_defs: raise DagsterInvariantViolationError(( "Error in solid {solid_name}: Unexpectedly returned output {result} " "of type {type_}. Solid is explicitly defined to return no " "results.").format(solid_name=context.solid.name, result=result, type_=type(result))) raise DagsterInvariantViolationError(( "Error in solid {solid_name}: Solid unexpectedly returned " "output {result} of type {type_}. Should " "be a generator, containing or yielding " "{n_results} results: {{{expected_results}}}.").format( solid_name=context.solid.name, result=result, type_=type(result), n_results=len(output_defs), expected_results=", ".join([ "'{result_name}': {dagster_type}".format( result_name=output_def.name, dagster_type=output_def.dagster_type, ) for output_def in output_defs ]), )) return compute
def pipeline_initialization_event_generator( execution_plan, run_config, pipeline_run, instance, scoped_resources_builder_cm, system_storage_data=None, intermediate_storage=None, raise_on_error=False, ): execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan) pipeline_def = execution_plan.pipeline.get_definition() run_config = check.dict_param(run_config, 'run_config', key_type=str) pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun) instance = check.inst_param(instance, 'instance', DagsterInstance) scoped_resources_builder_cm = check.callable_param( scoped_resources_builder_cm, 'scoped_resources_builder_cm') system_storage_data = check.opt_inst_param(system_storage_data, 'system_storage_data', SystemStorageData) intermediate_storage = check.opt_inst_param(intermediate_storage, 'intermediate_storage_data', IntermediateStorage) raise_on_error = check.bool_param(raise_on_error, 'raise_on_error') pipeline_context = None resources_manager = None try: context_creation_data = create_context_creation_data( execution_plan, run_config, pipeline_run, instance, ) executor = check.inst(create_executor(context_creation_data), Executor, 'Must return an Executor') log_manager = create_log_manager(context_creation_data) resources_manager = scoped_resources_builder_cm( execution_plan, context_creation_data.environment_config, context_creation_data.pipeline_run, log_manager, context_creation_data.resource_keys_to_init, ) for event in resources_manager.generate_setup_events(): yield event scoped_resources_builder = check.inst(resources_manager.get_object(), ScopedResourcesBuilder) system_storage_data = create_system_storage_data( context_creation_data, system_storage_data, scoped_resources_builder) if intermediate_storage or context_creation_data.intermediate_storage_def: intermediate_storage = create_intermediate_storage( context_creation_data, intermediate_storage, scoped_resources_builder, ) else: # remove this as part of https://github.com/dagster-io/dagster/issues/2705 intermediate_storage = system_storage_data.intermediates_manager pipeline_context = construct_pipeline_execution_context( context_creation_data=context_creation_data, scoped_resources_builder=scoped_resources_builder, system_storage_data=system_storage_data, intermediate_storage=intermediate_storage, log_manager=log_manager, executor=executor, raise_on_error=raise_on_error, ) _validate_plan_with_context(pipeline_context, execution_plan) yield pipeline_context for event in resources_manager.generate_teardown_events(): yield event except DagsterError as dagster_error: if pipeline_context is None: user_facing_exc_info = ( # pylint does not know original_exc_info exists is is_user_code_error is true # pylint: disable=no-member dagster_error.original_exc_info if dagster_error.is_user_code_error else sys.exc_info()) error_info = serializable_error_info_from_exc_info( user_facing_exc_info) yield DagsterEvent.pipeline_init_failure( pipeline_name=pipeline_def.name, failure_data=PipelineInitFailureData(error=error_info), log_manager=_create_context_free_log_manager( instance, pipeline_run, pipeline_def), ) if resources_manager: for event in resources_manager.generate_teardown_events(): yield event else: # pipeline teardown failure raise dagster_error if raise_on_error: raise dagster_error
def __new__(cls, config_fn, config_schema=None):
    return super(ConfigMapping, cls).__new__(
        cls,
        config_fn=check.callable_param(config_fn, 'config_fn'),
        config_schema=check_user_facing_opt_config_param(config_schema, 'config_schema'),
    )
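# Hedged sketch of the config mapping above: the mapping function translates a
# simplified outer config into the inner config expected downstream. The field and
# solid names are hypothetical, and ConfigMapping is assumed to be imported from dagster.
def _map_config(cfg):
    return {"solids": {"inner_solid": {"config": {"value": cfg["simplified_value"]}}}}

example_config_mapping = ConfigMapping(
    config_fn=_map_config,
    config_schema={"simplified_value": str},
)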
def __init__( self, type_check_fn: TypeCheckFn, key: t.Optional[str] = None, name: t.Optional[str] = None, is_builtin: bool = False, description: t.Optional[str] = None, loader: t.Optional[DagsterTypeLoader] = None, materializer: t.Optional[DagsterTypeMaterializer] = None, required_resource_keys: t.Optional[t.Set[str]] = None, kind: DagsterTypeKind = DagsterTypeKind.REGULAR, typing_type: t.Any = None, metadata_entries: t.Optional[t.List[MetadataEntry]] = None, metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None, ): check.opt_str_param(key, "key") check.opt_str_param(name, "name") check.invariant(not (name is None and key is None), "Must set key or name") if name is None: key = check.not_none( key, "If name is not provided, must provide key.", ) self.key, self._name = key, None elif key is None: name = check.not_none( name, "If key is not provided, must provide name.", ) self.key, self._name = name, name else: check.invariant(key and name) self.key, self._name = key, name self.description = check.opt_str_param(description, "description") self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader) self.materializer = check.opt_inst_param(materializer, "materializer", DagsterTypeMaterializer) self.required_resource_keys = check.opt_set_param( required_resource_keys, "required_resource_keys", ) self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn") _validate_type_check_fn(self._type_check_fn, self._name) self.is_builtin = check.bool_param(is_builtin, "is_builtin") check.invariant( self.display_name is not None, "All types must have a valid display name, got None for key {}". format(key), ) self.kind = check.inst_param(kind, "kind", DagsterTypeKind) self.typing_type = typing_type metadata_entries = check.opt_list_param(metadata_entries, "metadata_entries", of_type=MetadataEntry) metadata = check.opt_dict_param(metadata, "metadata", key_type=str) self._metadata_entries = normalize_metadata(metadata, metadata_entries)
def __init__(self, indent_level=2, printer=print):
    self.current_indent = 0
    self.indent_level = check.int_param(indent_level, 'indent_level')
    self.printer = check.callable_param(printer, 'printer')
    self._line_so_far = ''
def core_celery_execution_loop(pipeline_context, execution_plan, step_execution_fn): check.inst_param(pipeline_context, "pipeline_context", SystemPipelineExecutionContext) check.inst_param(execution_plan, "execution_plan", ExecutionPlan) check.callable_param(step_execution_fn, "step_execution_fn") executor = pipeline_context.executor # https://github.com/dagster-io/dagster/issues/2440 check.invariant( execution_plan.artifacts_persisted, "Cannot use in-memory storage with Celery, use filesystem (on top of NFS or " "similar system that allows files to be available to all nodes), S3, or GCS", ) app = make_app(executor.app_args()) priority_for_step = lambda step: (-1 * int( step.tags.get(DAGSTER_CELERY_STEP_PRIORITY_TAG, task_default_priority) ) + -1 * _get_run_priority(pipeline_context)) priority_for_key = lambda step_key: (priority_for_step( execution_plan.get_step_by_key(step_key))) _warn_on_priority_misuse(pipeline_context, execution_plan) step_results = {} # Dict[ExecutionStep, celery.AsyncResult] step_errors = {} with execution_plan.start( retries=pipeline_context.executor.retries, sort_key_fn=priority_for_step, ) as active_execution: stopping = False while (not active_execution.is_complete and not stopping) or step_results: if active_execution.check_for_interrupts(): yield DagsterEvent.engine_event( pipeline_context, "Celery executor: received termination signal - revoking active tasks from workers", EngineEventData.interrupted(list(step_results.keys())), ) stopping = True for key, result in step_results.items(): result.revoke() active_execution.mark_interrupted(key) results_to_pop = [] for step_key, result in sorted( step_results.items(), key=lambda x: priority_for_key(x[0])): if result.ready(): try: step_events = result.get() except TaskRevokedError: step_events = [] yield DagsterEvent.engine_event( pipeline_context, 'celery task for running step "{step_key}" was revoked.' .format(step_key=step_key, ), EngineEventData(marker_end=DELEGATE_MARKER), step_key=step_key, ) except Exception: # pylint: disable=broad-except # We will want to do more to handle the exception here.. maybe subclass Task # Certainly yield an engine or pipeline event step_events = [] step_errors[ step_key] = serializable_error_info_from_exc_info( sys.exc_info()) for step_event in step_events: event = deserialize_json_to_dagster_namedtuple( step_event) yield event active_execution.handle_event(event) results_to_pop.append(step_key) for step_key in results_to_pop: if step_key in step_results: del step_results[step_key] active_execution.verify_complete(pipeline_context, step_key) # process skips from failures or uncovered inputs for event in active_execution.plan_events_iterator( pipeline_context): yield event # don't add any new steps if we are stopping if stopping or step_errors: continue # This is a slight refinement. If we have n workers idle and schedule m > n steps for # execution, the first n steps will be picked up by the idle workers in the order in # which they are scheduled (and the following m-n steps will be executed in priority # order, provided that it takes longer to execute a step than to schedule it). The test # case has m >> n to exhibit this behavior in the absence of this sort step. for step in active_execution.get_steps_to_execute(): try: queue = step.tags.get(DAGSTER_CELERY_QUEUE_TAG, task_default_queue) yield DagsterEvent.engine_event( pipeline_context, 'Submitting celery task for step "{step_key}" to queue "{queue}".' 
.format(step_key=step.key, queue=queue), EngineEventData(marker_start=DELEGATE_MARKER), step_key=step.key, ) # Get the Celery priority for this step priority = _get_step_priority(pipeline_context, step) # Submit the Celery tasks step_results[step.key] = step_execution_fn( app, pipeline_context, step, queue, priority) except Exception: yield DagsterEvent.engine_event( pipeline_context, "Encountered error during celery task submission.". format(), event_specific_data=EngineEventData.engine_error( serializable_error_info_from_exc_info( sys.exc_info()), ), ) raise time.sleep(TICK_SECONDS) if step_errors: raise DagsterSubprocessError( "During celery execution errors occurred in workers:\n{error_list}" .format(error_list="\n".join([ "[{step}]: {err}".format(step=key, err=err.to_string()) for key, err in step_errors.items() ])), subprocess_error_infos=list(step_errors.values()), )
def __init__( self, type_check_fn, key=None, name=None, is_builtin=False, description=None, input_hydration_config=None, output_materialization_config=None, serialization_strategy=None, auto_plugins=None, required_resource_keys=None, kind=DagsterTypeKind.REGULAR, ): check.opt_str_param(key, 'key') check.opt_str_param(name, 'name') check.invariant(not (name is None and key is None), 'Must set key or name') if name is None: check.param_invariant( bool(key), 'key', 'If name is not provided, must provide key.', ) self.key, self.name = key, None elif key is None: check.param_invariant( bool(name), 'name', 'If key is not provided, must provide name.', ) self.key, self.name = name, name else: check.invariant(key and name) self.key, self.name = key, name self.description = check.opt_str_param(description, 'description') self.input_hydration_config = check.opt_inst_param( input_hydration_config, 'input_hydration_config', InputHydrationConfig) self.output_materialization_config = check.opt_inst_param( output_materialization_config, 'output_materialization_config', OutputMaterializationConfig, ) self.serialization_strategy = check.opt_inst_param( serialization_strategy, 'serialization_strategy', SerializationStrategy, PickleSerializationStrategy(), ) self.required_resource_keys = check.opt_set_param( required_resource_keys, 'required_resource_keys', ) self._type_check_fn = check.callable_param(type_check_fn, 'type_check_fn') _validate_type_check_fn(self._type_check_fn, self.name) auto_plugins = check.opt_list_param(auto_plugins, 'auto_plugins', of_type=type) check.param_invariant( all( issubclass(auto_plugin_type, TypeStoragePlugin) for auto_plugin_type in auto_plugins), 'auto_plugins', ) self.auto_plugins = auto_plugins self.is_builtin = check.bool_param(is_builtin, 'is_builtin') check.invariant( self.display_name is not None, 'All types must have a valid display name, got None for key {}'. format(key), ) self.kind = check.inst_param(kind, 'kind', DagsterTypeKind)
def validate_solid_fn( decorator_name, fn_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True ): check.str_param(decorator_name, 'decorator_name') check.str_param(fn_name, 'fn_name') check.callable_param(compute_fn, 'compute_fn') check.list_param(input_defs, 'input_defs', of_type=InputDefinition) expected_positionals = check.opt_list_param( expected_positionals, 'expected_positionals', of_type=str ) if exclude_nothing: names = set(inp.name for inp in input_defs if not inp.runtime_type.is_nothing) nothing_names = set(inp.name for inp in input_defs if inp.runtime_type.is_nothing) else: names = set(inp.name for inp in input_defs) nothing_names = set() # Currently being super strict about naming. Might be a good idea to relax. Starting strict. fn_positionals, input_args = split_function_parameters(compute_fn, expected_positionals) # Validate Positional Parameters missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals) if missing_positional: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function does not have required positional " "parameter '{missing_param}'. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'.".format( decorator_name=decorator_name, solid_name=fn_name, missing_param=missing_positional ) ) # Validate non positional parameters invalid_function_info = validate_decorated_fn_input_args(names, input_args) if invalid_function_info: if invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['vararg']: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has positional vararg parameter " "'{param}'. Solid functions should only have keyword arguments that match " "input names and a first positional parameter named 'context'.".format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, ) ) elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['missing_name']: if invalid_function_info.param in nothing_names: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is " "one of the solid input_defs of type 'Nothing' which should not be included since " "no data will be passed for it. ".format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, ) ) else: raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not " "one of the solid input_defs. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'.".format( decorator_name=decorator_name, solid_name=fn_name, param=invalid_function_info.param, ) ) elif invalid_function_info.error_type == InvalidDecoratedFunctionInfo.TYPES['extra']: undeclared_inputs_printed = ", '".join(invalid_function_info.missing_names) raise DagsterInvalidDefinitionError( "{decorator_name} '{solid_name}' decorated function does not have parameter(s) " "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions " "should only have keyword arguments that match input names and a first positional " "parameter named 'context'.".format( decorator_name=decorator_name, solid_name=fn_name, undeclared_inputs_printed=undeclared_inputs_printed, ) ) return positional_arg_name_list(input_args)
def __init__(self, config_type, func, required_resource_keys):
    self._config_type = check.inst_param(config_type, "config_type", ConfigType)
    self._func = check.callable_param(func, "func")
    self._required_resource_keys = check.opt_set_param(
        required_resource_keys, "required_resource_keys", of_type=str
    )
def _validate_solid_fn(solid_name, compute_fn, input_defs, expected_positionals=None, exclude_nothing=True): check.str_param(solid_name, 'solid_name') check.callable_param(compute_fn, 'compute_fn') check.list_param(input_defs, 'input_defs', of_type=InputDefinition) expected_positionals = check.opt_list_param(expected_positionals, 'expected_positionals', of_type=(str, tuple)) if exclude_nothing: names = set(inp.name for inp in input_defs if not inp.runtime_type.is_nothing) nothing_names = set(inp.name for inp in input_defs if inp.runtime_type.is_nothing) else: names = set(inp.name for inp in input_defs) nothing_names = set() # Currently being super strict about naming. Might be a good idea to relax. Starting strict. try: _validate_decorated_fn(compute_fn, names, expected_positionals) except FunctionValidationError as e: if e.error_type == FunctionValidationError.TYPES['vararg']: raise DagsterInvalidDefinitionError( "solid '{solid_name}' decorated function has positional vararg parameter " "'{e.param}'. Solid functions should only have keyword arguments that match " "input names and a first positional parameter named 'context'." .format(solid_name=solid_name, e=e)) elif e.error_type == FunctionValidationError.TYPES['missing_name']: if e.param in nothing_names: raise DagsterInvalidDefinitionError( "solid '{solid_name}' decorated function has parameter '{e.param}' that is " "one of the solid input_defs of type 'Nothing' which should not be included since " "no data will be passed for it. ".format( solid_name=solid_name, e=e)) else: raise DagsterInvalidDefinitionError( "solid '{solid_name}' decorated function has parameter '{e.param}' that is not " "one of the solid input_defs. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'." .format(solid_name=solid_name, e=e)) elif e.error_type == FunctionValidationError.TYPES[ 'missing_positional']: raise DagsterInvalidDefinitionError( "solid '{solid_name}' decorated function does not have required positional " "parameter '{e.param}'. Solid functions should only have keyword arguments " "that match input names and a first positional parameter named 'context'." .format(solid_name=solid_name, e=e)) elif e.error_type == FunctionValidationError.TYPES['extra']: undeclared_inputs_printed = ", '".join(e.missing_names) raise DagsterInvalidDefinitionError( "solid '{solid_name}' decorated function does not have parameter(s) " "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions " "should only have keyword arguments that match input names and a first positional " "parameter named 'context'.".format( solid_name=solid_name, undeclared_inputs_printed=undeclared_inputs_printed)) else: raise e
def __init__(self, resource_fn, config_field=None, description=None):
    self.resource_fn = check.callable_param(resource_fn, 'resource_fn')
    self.config_field = check_opt_field_param(config_field, 'config_field')
    self.description = check.opt_str_param(description, 'description')