Esempio n. 1
0
    def __call__(self, fn: Callable[[], Any]) -> RepositoryDefinition:
        """Turn the decorated construction function into a ``RepositoryDefinition``.

        ``fn`` is called with no arguments. Its return value must be a list of
        definitions, a dict, or a ``RepositoryData`` instance.

        Args:
            fn: Zero-argument callable producing the repository contents. When
                ``self.name`` is falsy, ``fn.__name__`` is used as the repository name.

        Returns:
            The new ``RepositoryDefinition``; ``update_wrapper`` is applied to it so it
            carries ``fn``'s metadata.

        Raises:
            DagsterInvalidDefinitionError: If ``fn`` returns something other than a list,
                dict, or ``RepositoryData``; if a returned list contains an element of an
                unsupported type; or if a returned dict has a key that is not in
                ``VALID_REPOSITORY_DATA_DICT_KEYS``.
        """
        check.callable_param(fn, "fn")

        if not self.name:
            self.name = fn.__name__

        repository_definitions = fn()

        if isinstance(repository_definitions, list):
            allowed_types = (
                PipelineDefinition,
                PartitionSetDefinition,
                ScheduleDefinition,
                SensorDefinition,
                GraphDefinition,
            )
            # Collect every offending element so the error reports all of them at once.
            bad_definitions = [
                (i, type(definition))
                for i, definition in enumerate(repository_definitions)
                if not isinstance(definition, allowed_types)
            ]
            if bad_definitions:
                bad_definitions_str = ", ".join(
                    "value of type {type_} at index {i}".format(type_=type_, i=i)
                    for i, type_ in bad_definitions
                )
                # GraphDefinition is accepted by the check above, so the message lists it too.
                raise DagsterInvalidDefinitionError(
                    "Bad return value from repository construction function: all elements of list "
                    "must be of type PipelineDefinition, PartitionSetDefinition, "
                    "ScheduleDefinition, SensorDefinition, or GraphDefinition. "
                    f"Got {bad_definitions_str}."
                )
            repository_data = CachingRepositoryData.from_list(repository_definitions)

        elif isinstance(repository_definitions, dict):
            bad_keys = [
                key
                for key in repository_definitions.keys()
                if key not in VALID_REPOSITORY_DATA_DICT_KEYS
            ]
            if bad_keys:
                raise DagsterInvalidDefinitionError(
                    "Bad return value from repository construction function: dict must not contain "
                    "keys other than {{'pipelines', 'partition_sets', 'schedules', 'jobs'}}: found "
                    "{bad_keys}".format(
                        bad_keys=", ".join("'{key}'".format(key=key) for key in bad_keys)
                    )
                )
            repository_data = CachingRepositoryData.from_dict(repository_definitions)

        elif isinstance(repository_definitions, RepositoryData):
            # Already fully-formed repository data: use it as-is.
            repository_data = repository_definitions

        else:
            raise DagsterInvalidDefinitionError(
                "Bad return value of type {type_} from repository construction function: must "
                "return list, dict, or RepositoryData. See the @repository decorator docstring for "
                "details and examples".format(type_=type(repository_definitions)),
            )

        repository_def = RepositoryDefinition(
            name=self.name, description=self.description, repository_data=repository_data
        )

        update_wrapper(repository_def, fn)
        return repository_def
Esempio n. 2
0
def composite_mapping_from_output(output, output_defs, solid_name):
    """Convert the value returned from a @composite_solid body into output mappings.

    Args:
        output: The returned value. Handled shapes are a single
            ``InvokedSolidOutputHandle``, a tuple consisting only of such handles, a dict
            from output definition name to ``InvokedSolidOutputHandle`` or
            ``InvokedSolidDynamicOutputWrapper``, a bare
            ``InvokedSolidDynamicOutputWrapper``, or ``None``.
        output_defs (List[OutputDefinition]): Output definitions declared on the composite.
        solid_name (str): Name of the composite solid; only used in error messages.

    Returns:
        A dict from output definition name to the result of calling ``mapping_from`` on
        the matching output definition, or ``None`` when ``output`` is ``None``.

    Raises:
        DagsterInvalidDefinitionError: If a single handle is returned but the number of
            output definitions is not one, if a returned name does not match any output
            definition, if a dict value is not an invoked-solid output, or if ``output``
            is of any other non-``None`` type.
    """
    # output can be different types
    check.list_param(output_defs, "output_defs", OutputDefinition)
    check.str_param(solid_name, "solid_name")

    # single output
    if isinstance(output, InvokedSolidOutputHandle):
        if len(output_defs) != 1:
            raise DagsterInvalidDefinitionError(
                "Returned a single output ({solid_name}.{output_name}) in "
                "@composite_solid {name} but {num} outputs are defined. "
                "Return a dict to map defined outputs.".format(
                    solid_name=output.solid_name,
                    output_name=output.output_name,
                    name=solid_name,
                    num=len(output_defs),
                )
            )
        defn = output_defs[0]
        return {defn.name: defn.mapping_from(output.solid_name, output.output_name)}

    output_mapping_dict = {}
    output_def_dict = {output_def.name: output_def for output_def in output_defs}

    # tuple returned directly: each handle is matched to a definition by its output name
    if isinstance(output, tuple) and all(
        isinstance(item, InvokedSolidOutputHandle) for item in output
    ):
        for handle in output:
            if handle.output_name not in output_def_dict:
                raise DagsterInvalidDefinitionError(
                    "Output name mismatch returning output tuple in @composite_solid {name}. "
                    "No matching OutputDefinition named {output_name} for {solid_name}.{output_name}. "
                    "Return a dict to map to the desired OutputDefinition".format(
                        name=solid_name,
                        output_name=handle.output_name,
                        solid_name=handle.solid_name,
                    )
                )
            output_mapping_dict[handle.output_name] = output_def_dict[
                handle.output_name
            ].mapping_from(handle.solid_name, handle.output_name)

        return output_mapping_dict

    # mapping dict
    if isinstance(output, dict):
        for name, handle in output.items():
            if name not in output_def_dict:
                raise DagsterInvalidDefinitionError(
                    "@composite_solid {name} referenced key {key} which does not match any "
                    "OutputDefinitions. Valid options are: {options}".format(
                        name=solid_name, key=name, options=list(output_def_dict.keys())
                    )
                )

            if isinstance(handle, InvokedSolidOutputHandle):
                source = handle
            elif isinstance(handle, InvokedSolidDynamicOutputWrapper):
                # dynamic outputs are unwrapped before being mapped
                source = handle.unwrap_for_composite_mapping()
            else:
                raise DagsterInvalidDefinitionError(
                    "@composite_solid {name} returned problematic dict entry under "
                    "key {key} of type {type}. Dict values must be outputs of "
                    "invoked solids".format(name=solid_name, key=name, type=type(handle))
                )

            output_mapping_dict[name] = output_def_dict[name].mapping_from(
                source.solid_name, source.output_name
            )

        return output_mapping_dict

    # a bare dynamic output: unwrap it and process the result like any other return value
    if isinstance(output, InvokedSolidDynamicOutputWrapper):
        return composite_mapping_from_output(
            output.unwrap_for_composite_mapping(), output_defs, solid_name
        )

    # error
    if output is not None:
        raise DagsterInvalidDefinitionError(
            "@composite_solid {name} returned problematic value "
            "of type {type}. Expected return value from invoked solid or dict mapping "
            "output name to return values from invoked solids".format(
                name=solid_name, type=type(output)
            )
        )

    return None
Esempio n. 3
0
def external_asset_graph_from_defs(
    pipelines: Sequence[PipelineDefinition],
    source_assets_by_key: Mapping[AssetKey, SourceAsset]
) -> Sequence[ExternalAssetNode]:
    """Build the list of ``ExternalAssetNode`` objects for a set of pipelines.

    Asset keys are read from ``hardcoded_asset_key`` on the input and output definitions
    of every node definition in every pipeline. Three groups of nodes are emitted, in
    this order: keys that appear only as inputs and are not source assets, the source
    assets themselves, and keys produced by at least one node output.

    Raises:
        DagsterInvariantViolationError: If a source asset key is also produced by a node.
        DagsterInvalidDefinitionError: If an output's partitions definition is neither a
            time-window nor a static partitions definition.
    """
    producers_by_key: Dict[
        AssetKey, List[Tuple[OutputDefinition, NodeDefinition, PipelineDefinition]]
    ] = defaultdict(list)
    upstream_by_key: Dict[AssetKey, Dict[AssetKey, ExternalAssetDependency]] = defaultdict(dict)
    downstream_by_key: Dict[AssetKey, List[ExternalAssetDependedBy]] = defaultdict(list)
    referenced_upstream_keys: Set[AssetKey] = set()

    for pipeline in pipelines:
        for node_def in pipeline.all_node_defs:
            # Lookup tables used to label each dependency with the input/output it flows through.
            input_names = {
                in_def.hardcoded_asset_key: in_def.name
                for in_def in node_def.input_defs
                if in_def.hardcoded_asset_key is not None
            }
            output_names = {
                out_def.hardcoded_asset_key: out_def.name
                for out_def in node_def.output_defs
                if out_def.hardcoded_asset_key is not None
            }

            node_upstream_keys = set(
                key
                for key in (in_def.hardcoded_asset_key for in_def in node_def.input_defs)
                if key
            )
            referenced_upstream_keys.update(node_upstream_keys)

            for output_def in node_def.output_defs:
                produced_key = output_def.hardcoded_asset_key
                if not produced_key:
                    continue

                producers_by_key[produced_key].append((output_def, node_def, pipeline))

                # Without explicit deps in the metadata, the output is taken to depend on
                # every input asset of the node and on none of its sibling outputs.
                declared_deps = (output_def.metadata or {}).get(ASSET_DEPENDENCY_METADATA_KEY)
                upstream_keys = node_upstream_keys if declared_deps is None else declared_deps

                for upstream_key in upstream_keys:
                    via_input = input_names.get(upstream_key)
                    via_output = output_names.get(upstream_key)
                    upstream_by_key[produced_key][upstream_key] = ExternalAssetDependency(
                        upstream_asset_key=upstream_key,
                        input_name=via_input,
                        output_name=via_output,
                    )
                    downstream_by_key[upstream_key].append(
                        ExternalAssetDependedBy(
                            downstream_asset_key=produced_key,
                            input_name=via_input,
                            output_name=via_output,
                        )
                    )

    # Keys consumed somewhere but neither produced by a node nor declared as a source asset.
    not_produced = referenced_upstream_keys.difference(producers_by_key.keys())
    undefined_keys = not_produced.difference(source_assets_by_key.keys())

    asset_nodes = []
    for undefined_key in undefined_keys:
        asset_nodes.append(
            ExternalAssetNode(
                asset_key=undefined_key,
                dependencies=list(upstream_by_key[undefined_key].values()),
                depended_by=downstream_by_key[undefined_key],
                job_names=[],
            )
        )

    for source_asset in source_assets_by_key.values():
        source_key = source_asset.key
        if source_key in producers_by_key:
            raise DagsterInvariantViolationError(
                f"Asset with key {source_asset.key.to_string()} is defined both as a source asset"
                " and as a non-source asset"
            )

        asset_nodes.append(
            ExternalAssetNode(
                asset_key=source_key,
                dependencies=list(upstream_by_key[source_key].values()),
                depended_by=downstream_by_key[source_key],
                job_names=[],
                op_description=source_asset.description,
            )
        )

    for produced_key, producers in producers_by_key.items():
        # The first recorded producer supplies the op/output details for the node.
        first_output_def, first_node_def, _ = producers[0]
        producing_job_names = [job_def.name for _, _, job_def in producers]

        # temporary workaround to retrieve asset partition definition from job
        partitions_def_data = None
        partitions_def = (
            first_output_def._asset_partitions_def  # pylint: disable=protected-access
            if first_output_def
            else None
        )
        if partitions_def:
            if isinstance(partitions_def, TimeWindowPartitionsDefinition):
                partitions_def_data = external_time_window_partitions_definition_from_def(
                    partitions_def
                )
            elif isinstance(partitions_def, StaticPartitionsDefinition):
                partitions_def_data = external_static_partitions_definition_from_def(
                    partitions_def
                )
            else:
                raise DagsterInvalidDefinitionError(
                    "Only static partition and time window partitions are currently supported."
                )

        asset_nodes.append(
            ExternalAssetNode(
                asset_key=produced_key,
                dependencies=list(upstream_by_key[produced_key].values()),
                depended_by=downstream_by_key[produced_key],
                op_name=first_node_def.name,
                op_description=first_node_def.description,
                job_names=producing_job_names,
                partitions_def_data=partitions_def_data,
                output_name=first_output_def.name,
                output_description=first_output_def.description,
            )
        )

    return asset_nodes
Esempio n. 4
0
    def __init__(
        self,
        name: str,
        cron_schedule: str,
        pipeline_name: str,
        run_config: Optional[Any] = None,
        run_config_fn: Optional[Callable[..., Any]] = None,
        tags: Optional[Dict[str, str]] = None,
        tags_fn: Optional[Callable[..., Optional[Dict[str, str]]]] = None,
        solid_selection: Optional[List[Any]] = None,
        mode: Optional[str] = "default",
        should_execute: Optional[Callable[..., bool]] = None,
        environment_vars: Optional[Dict[str, str]] = None,
        execution_timezone: Optional[str] = None,
        execution_fn: Optional[Callable[[ScheduleExecutionContext], Any]] = None,
        description: Optional[str] = None,
    ):
        """Validate the schedule arguments and build the schedule's execution function.

        Either ``execution_fn`` is supplied on its own, or the execution function is
        assembled from ``should_execute``, ``run_config``/``run_config_fn`` and
        ``tags``/``tags_fn``.

        Args:
            name: Schedule name; validated with ``check_valid_name``.
            cron_schedule: Cron string; must be accepted by ``croniter.is_valid``.
            pipeline_name: Name of the pipeline the schedule targets.
            run_config: Static run config. Mutually exclusive with ``run_config_fn``.
            run_config_fn: Callable taking the context and returning the run config.
            tags: Static tags; checked with ``check_tags``. Mutually exclusive with
                ``tags_fn``.
            tags_fn: Callable taking the context and returning the tags.
            solid_selection: Optional list of strings.
            mode: Pipeline mode; ``DEFAULT_MODE_NAME`` is passed to the check helper as
                the default.
            should_execute: Callable taking the context; when it returns a falsy value
                the generated execution function yields a ``SkipReason`` instead of a
                ``RunRequest``. Defaults to a callable that always returns ``True``.
            environment_vars: Optional ``str`` to ``str`` dict.
            execution_timezone: Optional timezone name; must be loadable by
                ``pendulum.timezone``.
            execution_fn: Complete execution function. Cannot be combined with
                ``run_config``, ``run_config_fn``, ``tags``, ``tags_fn`` or
                ``should_execute``.
            description: Optional description string.

        Raises:
            DagsterInvalidDefinitionError: On an invalid cron string, a conflicting
                combination of arguments, or a timezone that cannot be loaded.
        """
        # Type-check first so a non-string argument fails with the usual check error
        # rather than with whatever croniter raises for unexpected input.
        self._cron_schedule = check.str_param(cron_schedule, "cron_schedule")
        if not croniter.is_valid(self._cron_schedule):
            raise DagsterInvalidDefinitionError(
                f"Found invalid cron schedule '{cron_schedule}' for schedule '{name}'."
            )

        self._name = check_valid_name(name)
        self._pipeline_name = check.str_param(pipeline_name, "pipeline_name")
        self._mode = cast(str, check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME))
        self._solid_selection = check.opt_nullable_list_param(
            solid_selection, "solid_selection", of_type=str
        )
        self._description = check.opt_str_param(description, "description")

        self._environment_vars = check.opt_dict_param(
            environment_vars, "environment_vars", key_type=str, value_type=str
        )
        self._execution_timezone = check.opt_str_param(execution_timezone, "execution_timezone")

        if execution_fn and (run_config_fn or tags_fn or should_execute or tags or run_config):
            raise DagsterInvalidDefinitionError(
                "Attempted to provide both execution_fn and individual run_config/tags arguments "
                "to ScheduleDefinition. Must provide only one of the two."
            )
        elif execution_fn:
            self._execution_fn = check.opt_callable_param(execution_fn, "execution_fn")
        else:
            if run_config_fn and run_config:
                raise DagsterInvalidDefinitionError(
                    "Attempted to provide both run_config_fn and run_config as arguments"
                    " to ScheduleDefinition. Must provide only one of the two."
                )
            run_config_fn = check.opt_callable_param(
                run_config_fn,
                "run_config_fn",
                default=lambda _context: check.opt_dict_param(run_config, "run_config"),
            )

            if tags_fn and tags:
                raise DagsterInvalidDefinitionError(
                    "Attempted to provide both tags_fn and tags as arguments"
                    " to ScheduleDefinition. Must provide only one of the two."
                )
            elif tags:
                check_tags(tags, "tags")
                tags_fn = lambda _context: tags
            else:
                tags_fn = check.opt_callable_param(
                    tags_fn, "tags_fn", default=lambda _context: {}
                )

            should_execute = check.opt_callable_param(
                should_execute, "should_execute", default=lambda _context: True
            )

            def _execution_fn(context):
                # Each user-supplied callable runs inside its own error boundary so that a
                # failure is attributed to the specific function that raised.
                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of should_execute for schedule {name}",
                ):
                    if not should_execute(context):
                        yield SkipReason(
                            "should_execute function for {schedule_name} returned false.".format(
                                schedule_name=name
                            )
                        )
                        return

                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of run_config_fn for schedule {name}",
                ):
                    evaluated_run_config = run_config_fn(context)

                with user_code_error_boundary(
                    ScheduleExecutionError,
                    lambda: f"Error occurred during the execution of tags_fn for schedule {name}",
                ):
                    evaluated_tags = tags_fn(context)

                yield RunRequest(
                    run_key=None,
                    run_config=evaluated_run_config,
                    tags=evaluated_tags,
                )

            self._execution_fn = _execution_fn

        if self._execution_timezone:
            try:
                # Verify that the timezone can be loaded
                pendulum.timezone(self._execution_timezone)
            except Exception as exc:
                # Chain explicitly so the underlying loading failure stays attached.
                raise DagsterInvalidDefinitionError(
                    "Invalid execution timezone {timezone} for {schedule_name}".format(
                        schedule_name=name, timezone=self._execution_timezone
                    )
                ) from exc
Esempio n. 5
0
    def __call__(self, *args, **kwargs):
        """Record an invocation of this node in the current composition context.

        Positional arguments are matched to input names by position, keyword arguments by
        name; every argument is handed to ``_process_argument_node`` together with the
        shared ``input_bindings`` dict. The invocation is then reported through
        ``observe_invocation`` on the current context.

        Returns:
            ``None`` when the node has no outputs, a single output handle when it has
            exactly one, otherwise a namedtuple of handles with one field per output
            name. Dynamic outputs are represented by
            ``InvokedSolidDynamicOutputWrapper``, all others by
            ``InvokedSolidOutputHandle``.

        Raises:
            DagsterInvalidDefinitionError: If more positional arguments are passed than
                inputs are defined, or if a positional argument cannot be resolved to an
                input name.
        """
        node_name = self.given_alias or self.node_def.name
        assert_in_composition(node_name)

        input_bindings = {}

        # positional arguments first
        for position, upstream in enumerate(args):
            declared_count = len(self.node_def.input_defs)
            if position >= declared_count:
                context = current_context()
                raise DagsterInvalidDefinitionError(
                    "In {source} {name}, received too many inputs for "
                    "invocation {node_name}. Only {def_num} defined, received {arg_num}".format(
                        source=context.source,
                        name=context.name,
                        node_name=node_name,
                        def_num=declared_count,
                        arg_num=len(args),
                    )
                )

            resolved_input = self.node_def.resolve_input_name_at_position(position)
            if resolved_input is None:
                context = current_context()
                raise DagsterInvalidDefinitionError(
                    "In {source} {name}, could not resolve input based on position at "
                    "index {idx} for invocation {node_name}. Use keyword args instead, "
                    "available inputs are: {inputs}".format(
                        idx=position,
                        source=context.source,
                        name=context.name,
                        node_name=node_name,
                        inputs=[inp.name for inp in self.node_def.input_defs],
                    )
                )

            self._process_argument_node(
                node_name,
                upstream,
                resolved_input,
                input_bindings,
                "(at position {idx})".format(idx=position),
            )

        # keyword arguments second
        for keyword, upstream in kwargs.items():
            self._process_argument_node(
                node_name, upstream, keyword, input_bindings, "(passed by keyword)"
            )

        # the node name is potentially reassigned for aliasing
        resolved_node_name = current_context().observe_invocation(
            self.given_alias,
            self.node_def,
            input_bindings,
            self.tags,
            self.hook_defs,
        )

        def _handle_for(output_def):
            # Dynamic outputs get the wrapper type, everything else a plain handle.
            handle_cls = (
                InvokedSolidDynamicOutputWrapper
                if output_def.is_dynamic
                else InvokedSolidOutputHandle
            )
            return handle_cls(resolved_node_name, output_def.name)

        output_count = len(self.node_def.output_defs)
        if output_count == 0:
            return None
        if output_count == 1:
            return _handle_for(self.node_def.output_defs[0])

        declared_outputs = list(self.node_def.output_defs)
        handles_by_name = {
            output_def.name: _handle_for(output_def) for output_def in declared_outputs
        }
        outputs_type = namedtuple(
            "_{node_def}_outputs".format(node_def=self.node_def.name),
            " ".join([output_def.name for output_def in declared_outputs]),
        )
        return outputs_type(**handles_by_name)
Esempio n. 6
0
def validate_solid_fn(decorator_name,
                      fn_name,
                      compute_fn,
                      input_defs,
                      expected_positionals=None,
                      exclude_nothing=True):
    """Check that a decorated compute function's signature matches its input definitions.

    Args:
        decorator_name (str): Name of the decorator, used in error messages.
        fn_name (str): Name of the decorated solid, used in error messages.
        compute_fn (Callable): The decorated function whose parameters are inspected.
        input_defs (List[InputDefinition]): Inputs declared for the solid.
        expected_positionals (Optional[List[str]]): Names of the leading positional
            parameters the function is required to have.
        exclude_nothing (bool): When true, inputs whose ``runtime_type.is_nothing`` is set
            are not expected as function parameters.

    Raises:
        DagsterInvalidDefinitionError: If a required positional parameter is missing, or
            if the remaining parameters are reported as invalid (a vararg, a parameter
            that is not an input, or an input with no parameter).
    """
    check.str_param(decorator_name, 'decorator_name')
    check.str_param(fn_name, 'fn_name')
    check.callable_param(compute_fn, 'compute_fn')
    check.list_param(input_defs, 'input_defs', of_type=InputDefinition)
    expected_positionals = check.opt_list_param(expected_positionals,
                                                'expected_positionals',
                                                of_type=str)
    if exclude_nothing:
        names = {inp.name for inp in input_defs
                 if not inp.runtime_type.is_nothing}
        # Kept separately so a parameter named after a Nothing input gets a targeted error.
        nothing_names = {inp.name for inp in input_defs
                         if inp.runtime_type.is_nothing}
    else:
        names = {inp.name for inp in input_defs}
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    fn_positionals, fn_non_positionals = split_function_parameters(
        compute_fn, expected_positionals)

    # Validate Positional Parameters
    missing_positional = validate_decorated_fn_positionals(
        fn_positionals, expected_positionals)
    if missing_positional:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have required positional "
            "parameter '{missing_param}'. Solid functions should only have keyword arguments "
            "that match input names and a first positional parameter named 'context'."
            .format(decorator_name=decorator_name,
                    solid_name=fn_name,
                    missing_param=missing_positional))

    # Validate non positional parameters
    invalid_function_info = validate_decorated_fn_non_positionals(
        names, fn_non_positionals)
    if not invalid_function_info:
        return

    error_type = invalid_function_info.error_type
    if error_type == InvalidDecoratedFunctionInfo.TYPES['vararg']:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
            "'{param}'. Solid functions should only have keyword arguments that match "
            "input names and a first positional parameter named 'context'."
            .format(
                decorator_name=decorator_name,
                solid_name=fn_name,
                param=invalid_function_info.param,
            ))
    elif error_type == InvalidDecoratedFunctionInfo.TYPES['missing_name']:
        if invalid_function_info.param in nothing_names:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "
                "one of the solid input_defs of type 'Nothing' which should not be included since "
                "no data will be passed for it. ".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                ))
        else:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "
                "one of the solid input_defs. Solid functions should only have keyword arguments "
                "that match input names and a first positional parameter named 'context'."
                .format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                ))
    elif error_type == InvalidDecoratedFunctionInfo.TYPES['extra']:
        # The template wraps this value in single quotes, so the separator has to close
        # and reopen the quote to render each name as 'a', 'b'.
        undeclared_inputs_printed = "', '".join(
            invalid_function_info.missing_names)
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
            "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
            "should only have keyword arguments that match input names and a first positional "
            "parameter named 'context'.".format(
                decorator_name=decorator_name,
                solid_name=fn_name,
                undeclared_inputs_printed=undeclared_inputs_printed,
            ))
Esempio n. 7
0
    def __init__(self, solid_names: List[str], handle_dict: InputToOutputHandleDict):
        """Store the dependency mapping and build per-solid lookup indexes from it.

        Args:
            solid_names: Names of the solids; stored as given.
            handle_dict: Mapping whose items are
                ``input_handle -> (dep_type, output_handle_or_list)``. For ``FAN_IN`` the
                second element is iterated as a list; for ``DIRECT`` and
                ``DYNAMIC_COLLECT`` it is treated as a single output handle.

        Raises:
            DagsterInvalidDefinitionError: If an item of a fan-in dependency is a dynamic
                output, or belongs to a solid already recorded in the dynamic fan-out
                index.
        """
        self._solid_names = solid_names
        self._handle_dict = handle_dict

        # Building up a couple indexes here so that one can look up all the upstream output handles
        # or downstream input handles in O(1). Without this, this can become O(N^2) where N is solid
        # count during the GraphQL query in particular

        # solid_name => input_handle => list[output_handle]
        self._solid_input_index: dict = defaultdict(dict)

        # solid_name => output_handle => list[input_handle]
        self._solid_output_index: dict = defaultdict(lambda: defaultdict(list))

        # solid_name => dynamic output_handle that this solid will dupe for
        self._dynamic_fan_out_index: dict = {}

        # solid_name => set of dynamic output_handle this collects over
        self._collect_index: Dict[str, set] = defaultdict(set)

        # NOTE(review): the fan-out lookups below only see entries recorded while handling
        # earlier items of this loop, so the result depends on the iteration order of
        # handle_dict — presumably upstream solids come first; confirm with the caller.
        for input_handle, (dep_type, output_handle_or_list) in self._handle_dict.items():
            if dep_type == DependencyType.FAN_IN:
                output_handle_list = []
                for handle in output_handle_or_list:
                    # Entries that are not SolidOutputHandle instances are left out of the
                    # indexes entirely.
                    if not isinstance(handle, SolidOutputHandle):
                        continue

                    if handle.is_dynamic:
                        raise DagsterInvalidDefinitionError(
                            "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "
                            f'Problematic dependency on dynamic output "{handle.describe()}".'
                        )
                    if self._dynamic_fan_out_index.get(handle.solid_name):
                        raise DagsterInvalidDefinitionError(
                            "Currently, items in a fan-in dependency cannot be downstream of dynamic outputs. "
                            f'Problematic dependency on output "{handle.describe()}", downstream of '
                            f'"{self._dynamic_fan_out_index[handle.solid_name].describe()}".'
                        )

                    output_handle_list.append(handle)
            elif dep_type == DependencyType.DIRECT:
                output_handle = cast(SolidOutputHandle, output_handle_or_list)

                # Both checks are independent `if`s: the helper may be called once for the
                # dynamic output itself and again for a fan-out recorded on the upstream solid.
                # The helper is defined outside this block; presumably it records the
                # fan-out for input_handle's solid — TODO confirm.
                if output_handle.is_dynamic:
                    self._validate_and_set_fan_out(input_handle, output_handle)

                if self._dynamic_fan_out_index.get(output_handle.solid_name):
                    self._validate_and_set_fan_out(
                        input_handle, self._dynamic_fan_out_index[output_handle.solid_name]
                    )

                output_handle_list = [output_handle]
            elif dep_type == DependencyType.DYNAMIC_COLLECT:
                output_handle = cast(SolidOutputHandle, output_handle_or_list)

                # A collect must be over a dynamic output, either directly or through an
                # upstream solid recorded in the fan-out index; anything else is a failure.
                if output_handle.is_dynamic:
                    self._validate_and_set_collect(input_handle, output_handle)

                elif self._dynamic_fan_out_index.get(output_handle.solid_name):
                    self._validate_and_set_collect(
                        input_handle,
                        self._dynamic_fan_out_index[output_handle.solid_name],
                    )
                else:
                    check.failed("Unexpected dynamic fan in dep created")

                output_handle_list = [output_handle]
            else:
                check.failed(f"Unexpected dep type {dep_type}")

            # Record the edge in both directions: upstream outputs per input, and
            # downstream inputs per output.
            self._solid_input_index[input_handle.solid.name][input_handle] = output_handle_list
            for output_handle in output_handle_list:
                self._solid_output_index[output_handle.solid.name][output_handle].append(
                    input_handle
                )
Esempio n. 8
0
def weekly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_week=0,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_selection=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
    end_date=None,
    execution_timezone=None,
):
    """Create a schedule that runs weekly.

    The decorated function is used to build the run config for each weekly partition: it is
    called with the partition's value (the date of the partition) as its only argument and
    should return the environment dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create. Defaults to the name of the
            decorated function.
        execution_day_of_week (int): The day of the week on which to run the schedule. Must be
            between 0 (Sunday) and 6 (Saturday); the value is used verbatim as the day-of-week
            field of the generated cron string.
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterCommandLineScheduler, and must be set when using that scheduler.

    Raises:
        DagsterInvalidDefinitionError: If ``execution_day_of_week`` is outside ``0..6``.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection, "solid_selection", of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars, "environment_vars", key_type=str, value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    check.int_param(execution_day_of_week, "execution_day_of_week")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")

    if start_date.hour != 0 or start_date.minute != 0 or start_date.second != 0:
        # Only a warning: a non-midnight start_date is tolerated but discouraged.
        warnings.warn(
            "`start_date` must be at the beginning of a day for a weekly schedule. "
            "Use `execution_time` to execute the schedule at a specific time of day. For example, "
            "to run the schedule at 3AM each Tuesday starting on 10/20/2020, your schedule "
            "definition would look like:"
            """
@weekly_schedule(
    start_date=datetime.datetime(2020, 10, 20),
    execution_day_of_week=2,
    execution_time=datetime.time(3, 0)
)
def my_schedule_definition(_):
    ...
"""
        )

    if execution_day_of_week < 0 or execution_day_of_week >= 7:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_week={}` is not valid for weekly schedule. Execution day must be "
            "between 0 [Sunday] and 6 [Saturday]".format(execution_day_of_week)
        )

    cron_schedule = "{minute} {hour} * * {day}".format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_week,
    )

    fmt = DEFAULT_DATE_FORMAT

    # `execution_day_of_week` follows the cron convention (0 = Sunday) while
    # `datetime.weekday()` is 0 = Monday, so shift the start date's weekday by one before
    # measuring how many days after the partition boundary the schedule fires.
    day_difference = (execution_day_of_week - (start_date.weekday() + 1)) % 7

    partition_fn = date_partition_range(
        start_date,
        end=end_date,
        delta_range="weeks",
        fmt=fmt,
        timezone=execution_timezone,
    )

    def inner(fn):
        check.callable_param(fn, "fn")

        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(partition.value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_default_partition_selector_fn(
                # Step back to the start of the previous week's partition.
                delta_fn=lambda d: pendulum.instance(d).subtract(weeks=1, days=day_difference),
                fmt=fmt,
            ),
            execution_timezone=execution_timezone,
        )

    return inner
Esempio n. 9
0
def _List(inner_type):
    """Wrap ``inner_type`` in a ``ListType``.

    ``inner_type`` must be a ``DagsterType`` instance; ``Nothing`` is rejected with a
    ``DagsterInvalidDefinitionError``.
    """
    check.inst_param(inner_type, 'inner_type', DagsterType)
    if inner_type is not Nothing:
        return ListType(inner_type)
    raise DagsterInvalidDefinitionError('Type Nothing can not be wrapped in List or Optional')
Esempio n. 10
0
    def __init__(
        self,
        config,
        default_value=FIELD_NO_DEFAULT_PROVIDED,
        is_required=None,
        description=None,
    ):
        """Build a field from a config specification.

        Args:
            config: Anything ``self._resolve_config_arg`` can turn into a ``ConfigType``.
            default_value: Explicit default; must not be callable and must validate against
                the resolved config type. ``FIELD_NO_DEFAULT_PROVIDED`` means "none given".
            is_required (Optional[bool]): When ``None`` it is derived: the field is optional
                if the type has an implicit default or an explicit default was given.
            description (Optional[str]): Human-readable description.

        Raises:
            DagsterInvalidDefinitionError: A python enum value was passed as the default of
                a config enum type.
            DagsterInvalidConfigError: The explicit default fails validation, or the implicit
                default cannot be resolved.
        """
        from .validate import validate_config
        from .post_process import resolve_defaults

        resolved_type = self._resolve_config_arg(config)
        self.config_type = check.inst(resolved_type, ConfigType)

        self.description = check.opt_str_param(description, "description")

        check.opt_bool_param(is_required, "is_required")

        if default_value != FIELD_NO_DEFAULT_PROVIDED:
            check.param_invariant(
                not callable(default_value),
                "default_value",
                "default_value cannot be a callable",
            )

        if is_required is True:
            check.param_invariant(
                default_value == FIELD_NO_DEFAULT_PROVIDED,
                "default_value",
                "required arguments should not specify default values",
            )

        # Must be stored before `self.default_provided` is consulted below.
        self._default_value = default_value

        # An explicitly supplied default has to be a valid value of the config type.
        if self.default_provided:
            enum_default_misuse = (
                self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(default_value)
            )
            if enum_default_misuse:
                message_template = (
                    "You have passed into a python enum value as the default value "
                    "into of a config enum type {name}. You must pass in the underlying "
                    "string represention as the default value. One of {value_set}."
                )
                raise DagsterInvalidDefinitionError(
                    message_template.format(
                        value_set=[ev.config_value for ev in self.config_type.enum_values],
                        name=self.config_type.given_name,
                    )
                )

            validation = validate_config(self.config_type, default_value)
            if not validation.success:
                raise DagsterInvalidConfigError(
                    "Invalid default_value for Field.",
                    validation.errors,
                    default_value,
                )

        if is_required is None:
            is_optional = has_implicit_default(self.config_type) or self.default_provided
            is_required = not is_optional

            # Implicitly optional without an explicit default: take the default from the
            # type itself by resolving its defaults.
            if is_optional and not self.default_provided:
                resolution = resolve_defaults(self.config_type, None)
                if not resolution.success:
                    raise DagsterInvalidConfigError(
                        "Unable to resolve implicit default_value for Field.",
                        resolution.errors,
                        None,
                    )
                self._default_value = resolution.value

        self._is_required = is_required
Esempio n. 11
0
def resolve_to_config_type(dagster_type):
    """Turn a user-supplied config specification into a ``ConfigType``.

    Accepts an existing ``ConfigType`` (returned as-is), a dict of fields, a one-element
    list (array shorthand), a supported python builtin, ``None`` or a ``BuiltinEnum`` member.
    Runtime (``DagsterType``) and ``typing`` types are rejected with
    ``DagsterInvalidDefinitionError``.

    Returns:
        The resolved config type, or ``False`` when the value is not recognised; callers are
        expected to report that case themselves.
    """
    from .field_utils import convert_fields_to_dict_type

    # Already a config type: nothing to resolve.
    if isinstance(dagster_type, ConfigType):
        return dagster_type

    if isinstance(dagster_type, dict):
        return convert_fields_to_dict_type(dagster_type)

    if isinstance(dagster_type, list):
        if len(dagster_type) != 1:
            raise DagsterInvalidDefinitionError("Array specifications must only be of length 1")

        member = dagster_type[0]
        inner_type = resolve_to_config_type(member)

        if not inner_type:
            raise DagsterInvalidDefinitionError(
                "Invalid member of array specification: {value} in list {the_list}".format(
                    value=repr(member), the_list=dagster_type
                )
            )
        return Array(inner_type)

    from dagster.core.types.dagster_type import DagsterType, List, ListType
    from dagster.core.types.python_set import Set, _TypedPythonSet
    from dagster.core.types.python_tuple import Tuple, _TypedPythonTuple

    if _is_config_type_class(dagster_type):
        check.param_invariant(
            False,
            "dagster_type",
            "Cannot pass a config type class to resolve_to_config_type. Got {dagster_type}".format(
                dagster_type=dagster_type
            ),
        )

    if isinstance(dagster_type, type) and issubclass(dagster_type, DagsterType):
        class_message = (
            "You have passed a DagsterType class {dagster_type} to the config system. "
            "The DagsterType and config schema systems are separate. "
            "Valid config values are:\n{desc}"
        )
        raise DagsterInvalidDefinitionError(
            class_message.format(dagster_type=repr(dagster_type), desc=VALID_CONFIG_DESC)
        )

    if is_typing_type(dagster_type):
        typing_message = (
            "You have passed in {dagster_type} to the config system. Types from "
            "the typing module in python are not allowed in the config system. "
            "You must use types that are imported from dagster or primitive types "
            "such as bool, int, etc."
        )
        raise DagsterInvalidDefinitionError(typing_message.format(dagster_type=dagster_type))

    # Runtime container types: (bare API object, parametrised instance class, message prefix).
    runtime_containers = (
        (List, ListType, "Cannot use List in the context of config. "),
        (Set, _TypedPythonSet, "Cannot use Set in the context of a config field. "),
        (Tuple, _TypedPythonTuple, "Cannot use Tuple in the context of a config field. "),
    )
    for bare_api, typed_cls, prefix in runtime_containers:
        if dagster_type is bare_api or isinstance(dagster_type, typed_cls):
            raise DagsterInvalidDefinitionError(prefix + helpful_list_error_string())

    if isinstance(dagster_type, DagsterType):
        instance_message = (
            "You have passed an instance of DagsterType {type_name} to the config "
            "system (Repr of type: {dagster_type}). "
            "The DagsterType and config schema systems are separate. "
            "Valid config values are:\n{desc}"
        )
        raise DagsterInvalidDefinitionError(
            instance_message.format(
                type_name=dagster_type.display_name,
                dagster_type=repr(dagster_type),
                desc=VALID_CONFIG_DESC,
            )
        )

    # Anything that reaches this point is one of:
    #  1) a python builtin that maps onto a config type,
    #  2) None or a BuiltinEnum member, or
    #  3) something invalid, for which False is returned so the caller can report a
    #     reasonable error.

    from dagster.primitive_mapping import (
        remap_python_builtin_for_config,
        is_supported_config_python_builtin,
    )

    if is_supported_config_python_builtin(dagster_type):
        return remap_python_builtin_for_config(dagster_type)

    if dagster_type is None:
        return ConfigAnyInstance

    if BuiltinEnum.contains(dagster_type):
        return ConfigType.from_builtin_enum(dagster_type)

    # Unrecognised: signal with False and let the callsite, which has more context,
    # raise the error.
    return False
Esempio n. 12
0
def validate_solid_fn(
    decorator_name: str,
    fn_name: str,
    compute_fn: Callable[..., Any],
    input_defs: List[InputDefinition],
    expected_positionals: Optional[List[str]] = None,
    exclude_nothing: Optional[bool] = True,
) -> List[str]:
    """Check that a decorated function's signature matches its input definitions.

    Args:
        decorator_name: Name of the decorator, used only in error messages.
        fn_name: Name of the solid / decorated function, used only in error messages.
        compute_fn: The decorated function whose parameters are inspected.
        input_defs: Input definitions the function's parameters must line up with.
        expected_positionals: Names of the leading positional parameters the function is
            required to declare.
        exclude_nothing: When truthy, inputs whose dagster type kind is ``NOTHING`` are not
            expected as parameters (and get a dedicated error if they do appear).

    Returns:
        The result of ``positional_arg_name_list`` for the function's input parameters.

    Raises:
        DagsterInvalidDefinitionError: On a missing required positional, a vararg parameter,
            a parameter that is not an input, or an input that has no parameter.
    """
    check.str_param(decorator_name, "decorator_name")
    check.str_param(fn_name, "fn_name")
    check.callable_param(compute_fn, "compute_fn")
    check.list_param(input_defs, "input_defs", of_type=InputDefinition)
    expected_positionals = check.opt_list_param(
        expected_positionals, "expected_positionals", of_type=str
    )
    if exclude_nothing:
        names = {
            inp.name for inp in input_defs if inp.dagster_type.kind != DagsterTypeKind.NOTHING
        }
        nothing_names = {
            inp.name for inp in input_defs if inp.dagster_type.kind == DagsterTypeKind.NOTHING
        }
    else:
        names = {inp.name for inp in input_defs}
        nothing_names = set()

    # Currently being super strict about naming. Might be a good idea to relax. Starting strict.
    fn_positionals, input_args = split_function_parameters(compute_fn, expected_positionals)

    # Validate Positional Parameters
    missing_positional = validate_decorated_fn_positionals(fn_positionals, expected_positionals)
    if missing_positional:
        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{solid_name}' decorated function does not have required positional "
            "parameter '{missing_param}'. Solid functions should only have keyword arguments "
            "that match input names and a first positional parameter named 'context'.".format(
                decorator_name=decorator_name,
                solid_name=fn_name,
                missing_param=missing_positional,
            )
        )

    # Validate non positional parameters
    invalid_function_info = validate_decorated_fn_input_args(names, input_args)
    if invalid_function_info:
        error_type = invalid_function_info.error_type
        if error_type == InvalidDecoratedFunctionInfo.TYPES["vararg"]:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function has positional vararg parameter "
                "'{param}'. Solid functions should only have keyword arguments that match "
                "input names and a first positional parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    param=invalid_function_info.param,
                )
            )
        elif error_type == InvalidDecoratedFunctionInfo.TYPES["missing_name"]:
            if invalid_function_info.param in nothing_names:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is "
                    "one of the solid input_defs of type 'Nothing' which should not be included since "
                    "no data will be passed for it. ".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
            else:
                raise DagsterInvalidDefinitionError(
                    "{decorator_name} '{solid_name}' decorated function has parameter '{param}' that is not "
                    "one of the solid input_defs. Solid functions should only have keyword arguments "
                    "that match input names and a first positional parameter named 'context'.".format(
                        decorator_name=decorator_name,
                        solid_name=fn_name,
                        param=invalid_function_info.param,
                    )
                )
        elif error_type == InvalidDecoratedFunctionInfo.TYPES["extra"]:
            # The template wraps this value in single quotes, so the separator must close
            # and reopen a quote to render 'a', 'b' rather than the unbalanced 'a, 'b'.
            undeclared_inputs_printed = "', '".join(invalid_function_info.missing_names)
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{solid_name}' decorated function does not have parameter(s) "
                "'{undeclared_inputs_printed}', which are in solid's input_defs. Solid functions "
                "should only have keyword arguments that match input names and a first positional "
                "parameter named 'context'.".format(
                    decorator_name=decorator_name,
                    solid_name=fn_name,
                    undeclared_inputs_printed=undeclared_inputs_printed,
                )
            )

    return positional_arg_name_list(input_args)
Esempio n. 13
0
    def _process_argument_node(self, solid_name, output_node, input_name,
                               input_mappings, input_bindings, arg_desc):
        """Record one argument of a solid invocation as a binding or a mapping.

        Args:
            solid_name: Name of the solid being invoked (used in error messages).
            output_node: The value passed for the input.
            input_name: Name of the input the value was passed for.
            input_mappings (dict): Updated in place when ``output_node`` is an
                ``InputMappingNode``.
            input_bindings (dict): Updated in place when ``output_node`` is an
                ``InvokedSolidOutputHandle`` or a list of them.
            arg_desc: Text describing how the argument was passed (used in error messages).

        Raises:
            DagsterInvalidDefinitionError: For any value that is not an output handle, an
                input mapping node, or a list made up solely of output handles.
        """
        if isinstance(output_node, InvokedSolidOutputHandle):
            input_bindings[input_name] = output_node
        elif isinstance(output_node, InputMappingNode):
            input_mappings[input_name] = output_node
        elif isinstance(output_node, list):
            if all(isinstance(item, InvokedSolidOutputHandle) for item in output_node):
                input_bindings[input_name] = output_node

            else:
                raise DagsterInvalidDefinitionError(
                    'In {source} {name}, received a list containing invalid types for input '
                    '"{input_name}" {arg_desc} in solid invocation {solid_name}. '
                    'Lists can only contain the output from previous solid invocations.'
                    .format(
                        source=current_context().source,
                        name=current_context().name,
                        arg_desc=arg_desc,
                        input_name=input_name,
                        solid_name=solid_name,
                    ))

        elif isinstance(output_node, tuple) and all(
                isinstance(item, InvokedSolidOutputHandle) for item in output_node):
            # A namedtuple exposes `_fields`; a plain tuple of handles does not, so fall
            # back to the handles' own output names instead of failing with AttributeError
            # while building the error message.
            options = getattr(output_node, '_fields', None)
            if options is None:
                options = tuple(handle.output_name for handle in output_node)
            raise DagsterInvalidDefinitionError(
                'In {source} {name}, received a tuple of multiple outputs for '
                'input "{input_name}" {arg_desc} in solid invocation {solid_name}. '
                'Must pass individual output, available from tuple: {options}'.
                format(
                    source=current_context().source,
                    name=current_context().name,
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                    options=options,
                ))
        elif isinstance(output_node, (CallableSolidNode, ISolidDefinition)):
            raise DagsterInvalidDefinitionError(
                'In {source} {name}, received an un-invoked solid for input '
                '"{input_name}" {arg_desc} in solid invocation "{solid_name}". '
                'Did you forget parentheses?'.format(
                    source=current_context().source,
                    name=current_context().name,
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                ))
        else:
            raise DagsterInvalidDefinitionError(
                'In {source} {name}, received invalid type {type} for input '
                '"{input_name}" {arg_desc} in solid invocation "{solid_name}". '
                'Must pass the output from previous solid invocations or inputs to the '
                'composition function as inputs when invoking solids during composition.'
                .format(
                    source=current_context().source,
                    name=current_context().name,
                    type=type(output_node),
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                ))
Esempio n. 14
0
    def from_files(name,
                   environment_files=None,
                   config_files=None,
                   solid_selection=None,
                   mode=None,
                   tags=None):
        '''Static constructor for presets from YAML files.

        Args:
            name (str): The name of this preset. Must be unique in the presets defined on a given
                pipeline.
            environment_files (Optional[List[str]]): Legacy name for ``config_files``; the two
                are reconciled through ``canonicalize_backcompat_args``.
            config_files (Optional[List[str]]): List of paths or glob patterns for yaml files
                to load and parse as the environment config for this preset.
            solid_selection (Optional[List[str]]): A list of solid subselection (including single
                solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``
            mode (Optional[str]): The mode to apply when executing this preset. (default:
                'default')
            tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.

        Returns:
            PresetDefinition: A PresetDefinition constructed from the provided YAML files.

        Raises:
            DagsterInvalidDefinitionError: When a path or glob pattern matches no files.
            DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse
                error.
        '''
        check.str_param(name, 'name')
        config_files = canonicalize_backcompat_args(config_files,
                                                    'config_files',
                                                    environment_files,
                                                    'environment_files',
                                                    '0.9.0')
        config_files = check.opt_list_param(config_files, 'config_files')
        solid_selection = check.opt_nullable_list_param(solid_selection,
                                                        'solid_selection',
                                                        of_type=str)
        mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)

        # Use the canonicalized `config_files` from here on; reading `environment_files`
        # directly would silently drop files passed under the new argument name.
        filenames = []
        for file_glob in config_files or []:
            globbed_files = glob(file_glob)
            if not globbed_files:
                raise DagsterInvalidDefinitionError(
                    'File or glob pattern "{file_glob}" for "config_files" in preset '
                    '"{name}" produced no results.'.format(
                        name=name, file_glob=file_glob))

            filenames += [
                os.path.realpath(globbed_file)
                for globbed_file in globbed_files
            ]

        try:
            merged = merge_yamls(filenames)
        except yaml.YAMLError as err:
            six.raise_from(
                DagsterInvariantViolationError(
                    'Encountered error attempting to parse yaml. Parsing files {file_set} '
                    'loaded by file/patterns {files} on preset "{name}".'.
                    format(file_set=filenames,
                           files=config_files,
                           name=name)),
                err,
            )

        return PresetDefinition(name, merged, solid_selection, mode, tags)
Esempio n. 15
0
    def __call__(self, *args, **kwargs):
        """Invoke the solid inside a composition and register the invocation.

        Positional arguments are matched to the solid's input definitions by index and keyword
        arguments by name; each one is passed to ``_process_argument_node``. The invocation is
        then recorded on the current composition context.

        Returns:
            ``None`` for a solid without outputs, a single ``InvokedSolidOutputHandle`` for one
            output, otherwise a namedtuple of handles keyed by output name.

        Raises:
            DagsterInvalidDefinitionError: On more positional arguments than inputs, or when
                the same solid name was already invoked in this context.
        """
        assert_in_composition(self.solid_name)

        input_bindings, input_mappings = {}, {}

        # Positional arguments first.
        for position, node in enumerate(args):
            if position >= len(self.solid_def.input_defs):
                too_many_template = (
                    'In {source} {name} received too many inputs for solid '
                    'invocation {solid_name}. Only {def_num} defined, received {arg_num}'
                )
                raise DagsterInvalidDefinitionError(
                    too_many_template.format(
                        source=current_context().source,
                        name=current_context().name,
                        solid_name=self.solid_name,
                        def_num=len(self.solid_def.input_defs),
                        arg_num=len(args),
                    )
                )

            positional_input_name = self.solid_def.input_defs[position].name
            position_desc = '(at position {idx})'.format(idx=position)

            self._process_argument_node(
                node,
                positional_input_name,
                input_mappings,
                input_bindings,
                position_desc,
            )

        # Then keyword arguments.
        for keyword, node in kwargs.items():
            self._process_argument_node(
                node, keyword, input_mappings, input_bindings, '(passed by keyword)'
            )

        if current_context().has_seen_invocation(self.solid_name):
            duplicate_template = (
                '{source} {name} invoked the same solid ({solid_name}) twice without aliasing.'
            )
            raise DagsterInvalidDefinitionError(
                duplicate_template.format(
                    source=current_context().source,
                    name=current_context().name,
                    solid_name=self.solid_name,
                )
            )

        invocation = InvokedSolidNode(
            self.solid_name, self.solid_def, input_bindings, input_mappings
        )
        current_context().observe_invocation(invocation)

        output_count = len(self.solid_def.output_defs)
        if output_count == 0:
            return None

        if output_count == 1:
            only_output = self.solid_def.output_defs[0].name
            return InvokedSolidOutputHandle(self.solid_name, only_output)

        # Several outputs: expose them as a namedtuple whose fields are the output names.
        output_names = [definition.name for definition in self.solid_def.output_defs]
        tuple_type_name = '_{solid_def}_outputs'.format(solid_def=self.solid_def.name)
        outputs_type = namedtuple(tuple_type_name, output_names)
        handles_by_name = {
            output_name: InvokedSolidOutputHandle(self.solid_name, output_name)
            for output_name in output_names
        }
        return outputs_type(**handles_by_name)
Esempio n. 16
0
def resolve_dagster_type(dagster_type):
    """Map a user-supplied type specification onto a runtime dagster type.

    ``typing`` types are first converted with ``transform_typing_type``. ``DagsterType``
    instances are returned unchanged; supported python builtins, ``None``, registered python
    types, the dagster ``Dict``/``Tuple``/``Set``/``List`` API objects and ``BuiltinEnum``
    members are translated to their runtime equivalents.

    Raises:
        DagsterInvalidDefinitionError: If the value is unhashable or is not recognised as a
            dagster type.
    """
    # Imported here because of circular dependencies.
    from .python_dict import PythonDict, Dict
    from .python_set import PythonSet, DagsterSetApi
    from .python_tuple import PythonTuple, DagsterTupleApi
    from .transform_typing import transform_typing_type
    from dagster.config.config_type import ConfigType
    from dagster.primitive_mapping import (
        remap_python_builtin_for_runtime,
        is_supported_runtime_python_builtin,
    )
    from dagster.utils.typing_api import is_typing_type

    passed_a_class = isinstance(dagster_type, type)

    check.invariant(
        not passed_a_class or not issubclass(dagster_type, ConfigType),
        'Cannot resolve a config type to a runtime type',
    )

    check.invariant(
        not passed_a_class or not issubclass(dagster_type, DagsterType),
        'Do not pass runtime type classes. Got {}'.format(dagster_type),
    )

    # Types from python's typing library are translated before anything else.
    if is_typing_type(dagster_type):
        dagster_type = transform_typing_type(dagster_type)

    if isinstance(dagster_type, DagsterType):
        return dagster_type

    # Unhashable values show up when, for instance, someone passes an instance of a dict
    # where they meant to pass dict or Dict.
    try:
        hash(dagster_type)
    except TypeError:
        unhashable_message = DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(
            additional_msg=(
                ", which isn't hashable. Did you pass an instance of a type instead of "
                "the type?"
            ),
            dagster_type=str(dagster_type),
        )
        raise DagsterInvalidDefinitionError(unhashable_message)

    if is_supported_runtime_python_builtin(dagster_type):
        return remap_python_builtin_for_runtime(dagster_type)

    if dagster_type is None:
        return Any

    if dagster_type in _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY:
        return _PYTHON_TYPE_TO_DAGSTER_TYPE_MAPPING_REGISTRY[dagster_type]

    # Bare dagster container API objects map to their python-backed runtime types.
    if dagster_type is Dict:
        return PythonDict
    elif isinstance(dagster_type, DagsterTupleApi):
        return PythonTuple
    elif isinstance(dagster_type, DagsterSetApi):
        return PythonSet
    elif isinstance(dagster_type, DagsterListApi):
        return List(Any)
    elif BuiltinEnum.contains(dagster_type):
        return DagsterType.from_builtin_enum(dagster_type)

    # Nothing matched: pick the message by whether a class or some other object was passed.
    if isinstance(dagster_type, type):
        failure_message = '{dagster_type} is not a valid dagster type.'.format(
            dagster_type=dagster_type
        )
    else:
        failure_message = DAGSTER_INVALID_TYPE_ERROR_MESSAGE.format(
            dagster_type=str(dagster_type), additional_msg='.'
        )
    raise DagsterInvalidDefinitionError(failure_message)
Esempio n. 17
0
def composite_mapping_from_output(output, output_defs, solid_name):
    """Translate the return value of a @composite_solid body into output mappings.

    Args:
        output: The value returned by the composition function. The handled shapes are a
            single InvokedSolidOutputHandle, a tuple made up entirely of
            InvokedSolidOutputHandle, a dict of output definition name to
            InvokedSolidOutputHandle, or None.
        output_defs (List[OutputDefinition]): The output definitions of the composite solid.
        solid_name (str): Name of the composite solid; only used in error messages.

    Returns:
        A dict keyed by output definition name whose values are produced by
        ``OutputDefinition.mapping_from``, or None when ``output`` is None.

    Raises:
        DagsterInvalidDefinitionError: If a single handle is returned while the number of
            output definitions is not exactly one, if a returned name does not match any
            output definition, if a dict value is not an InvokedSolidOutputHandle, or if
            ``output`` has any other non-None shape.
    """
    check.list_param(output_defs, 'output_defs', OutputDefinition)
    check.str_param(solid_name, 'solid_name')

    # single output: only unambiguous when exactly one output is defined
    if isinstance(output, InvokedSolidOutputHandle):
        if len(output_defs) != 1:
            raise DagsterInvalidDefinitionError(
                'Returned a single output ({solid_name}.{output_name}) in '
                '@composite_solid {name} but {num} outputs are defined. '
                'Return a dict to map defined outputs.'.format(
                    solid_name=output.solid_name,
                    output_name=output.output_name,
                    name=solid_name,
                    num=len(output_defs),
                ))

        defn = output_defs[0]
        return {
            defn.name: defn.mapping_from(output.solid_name, output.output_name)
        }

    output_mapping_dict = {}
    output_def_dict = {
        output_def.name: output_def
        for output_def in output_defs
    }

    # tuple returned directly: each handle is matched to the definition sharing its output name
    if isinstance(output, tuple) and all(
            isinstance(item, InvokedSolidOutputHandle) for item in output):
        for handle in output:
            if handle.output_name not in output_def_dict:
                raise DagsterInvalidDefinitionError(
                    'Output name mismatch returning output tuple in @composite_solid {name}. '
                    'No matching OutputDefinition named {output_name} for {solid_name}.{output_name}. '
                    'Return a dict to map to the desired OutputDefinition'.
                    format(
                        name=solid_name,
                        output_name=handle.output_name,
                        solid_name=handle.solid_name,
                    ))
            output_mapping_dict[handle.output_name] = output_def_dict[
                handle.output_name].mapping_from(handle.solid_name,
                                                 handle.output_name)

        return output_mapping_dict

    # mapping dict: keys name the output definitions explicitly
    if isinstance(output, dict):
        for name, handle in output.items():
            if name not in output_def_dict:
                raise DagsterInvalidDefinitionError(
                    '@composite_solid {name} referenced key {key} which does not match any '
                    'OutputDefinitions. Valid options are: {options}'.format(
                        name=solid_name,
                        key=name,
                        options=list(output_def_dict.keys())))
            if not isinstance(handle, InvokedSolidOutputHandle):
                raise DagsterInvalidDefinitionError(
                    '@composite_solid {name} returned problematic dict entry under '
                    'key {key} of type {type}. Dict values must be outputs of '
                    'invoked solids'.format(name=solid_name,
                                            key=name,
                                            type=type(handle)))

            output_mapping_dict[name] = output_def_dict[name].mapping_from(
                handle.solid_name, handle.output_name)

        return output_mapping_dict

    # error: anything else that is not None (including a tuple with non-handle members)
    if output is not None:
        raise DagsterInvalidDefinitionError(
            '@composite_solid {name} returned problematic value '
            'of type {type}. Expected return value from invoked solid or dict mapping '
            'output name to return values from invoked solids'.format(
                name=solid_name, type=type(output)))

    # nothing was returned, so there is nothing to map
    return None
Esempio n. 18
0
def resolve_checked_solid_fn_inputs(
    decorator_name: str,
    fn_name: str,
    compute_fn: DecoratedSolidFunction,
    explicit_input_defs: List[InputDefinition],
    exclude_nothing: bool,
) -> List[InputDefinition]:
    """
    Validate provided input definitions and infer the remaining from the type signature of the compute_fn.
    Returns the resolved set of InputDefinitions.

    Args:
        decorator_name (str): Name of the decorator that is wrapping the solid function.
        fn_name (str): Name of the decorated function.
        compute_fn (DecoratedSolidFunction): The decorated function, wrapped in the
            DecoratedSolidFunction wrapper.
        explicit_input_defs (List[InputDefinition]): The input definitions that were explicitly
            provided in the decorator.
        exclude_nothing (bool): True if Nothing type inputs should be excluded from compute_fn
            arguments.

    Returns:
        List[InputDefinition]: The explicit definitions (each combined with inferred
        information when a parameter of the same name exists), followed by definitions
        created purely from inference for parameters without an explicit entry.

    Raises:
        DagsterInvalidDefinitionError: If the function has a ``*args`` parameter, names a
            Nothing-typed input as a parameter while ``exclude_nothing`` is set, or lacks a
            parameter for an explicit input and has no ``**kwargs`` to absorb it.
    """

    if exclude_nothing:
        explicit_names = {
            inp.name
            for inp in explicit_input_defs
            if inp.dagster_type.kind != DagsterTypeKind.NOTHING
        }
        nothing_names = {
            inp.name
            for inp in explicit_input_defs
            if inp.dagster_type.kind == DagsterTypeKind.NOTHING
        }
    else:
        explicit_names = {inp.name for inp in explicit_input_defs}
        nothing_names = set()

    params = get_function_params(compute_fn.decorated_fn)

    # the leading context parameter, when present, is not an input
    input_args = params[1:] if compute_fn.has_context_arg() else params

    # Validate input arguments
    used_inputs = set()
    inputs_to_infer = set()
    has_kwargs = False

    for param in cast(List[funcsigs.Parameter], input_args):
        if param.kind == funcsigs.Parameter.VAR_KEYWORD:
            has_kwargs = True
        elif param.kind == funcsigs.Parameter.VAR_POSITIONAL:
            raise DagsterInvalidDefinitionError(
                f"{decorator_name} '{fn_name}' decorated function has positional vararg parameter "
                f"'{param}'. {decorator_name} decorated functions should only have keyword "
                "arguments that match input names and, if system information is required, a first "
                "positional parameter named 'context'.")
        elif param.name in explicit_names:
            used_inputs.add(param.name)
        elif param.name in nothing_names:
            raise DagsterInvalidDefinitionError(
                f"{decorator_name} '{fn_name}' decorated function has parameter '{param.name}' that is "
                "one of the input_defs of type 'Nothing' which should not be included since "
                "no data will be passed for it. ")
        else:
            inputs_to_infer.add(param.name)

    # explicit inputs with no matching parameter are only acceptable when **kwargs can take them
    undeclared_inputs = explicit_names - used_inputs
    if not has_kwargs and undeclared_inputs:
        # Join with "', '" so every name is individually quoted once the surrounding quotes
        # in the message are applied; sort so the message does not depend on set ordering.
        undeclared_inputs_printed = "', '".join(sorted(undeclared_inputs))
        raise DagsterInvalidDefinitionError(
            f"{decorator_name} '{fn_name}' decorated function does not have parameter(s) "
            f"'{undeclared_inputs_printed}', which are in provided input_defs. {decorator_name} "
            "decorated functions should only have keyword arguments that match input names and, if "
            "system information is required, a first positional parameter named 'context'."
        )

    inferred_props = {
        inferred.name: inferred
        for inferred in infer_input_props(compute_fn.decorated_fn,
                                          compute_fn.has_context_arg())
    }
    input_defs = []
    for input_def in explicit_input_defs:
        if input_def.name in inferred_props:
            # combine any information missing on the explicit def that can be inferred
            input_defs.append(
                input_def.combine_with_inferred(
                    inferred_props[input_def.name]))
        else:
            # pass through those that don't have any inference info, such as Nothing type inputs
            input_defs.append(input_def)

    # build defs from the inferred props for those without explicit entries
    input_defs.extend(
        InputDefinition.create_from_inferred(inferred)
        for inferred in inferred_props.values()
        if inferred.name in inputs_to_infer)

    return input_defs
Esempio n. 19
0
    def __call__(self, fn):
        """Run ``fn`` inside a composition context and build a CompositeSolidDefinition.

        Input and output definitions fall back to inference from ``fn`` when they were not
        given to the decorator. The recorded input mappings are emitted in input-definition
        order, and every input definition must have been mapped at least once. An unmapped
        output is an error only when outputs were declared explicitly (via ``output_defs``
        or a return annotation on ``fn``).

        Raises:
            DagsterInvalidDefinitionError: On an unmapped input, or an unmapped explicitly
                declared output.
        """
        check.callable_param(fn, 'fn')

        if not self.name:
            self.name = fn.__name__

        if self.input_defs is not None:
            input_defs = self.input_defs
        else:
            input_defs = infer_input_definitions_for_composite_solid(
                self.name, fn)

        explicit_outputs = self.output_defs is not None
        if explicit_outputs:
            output_defs = self.output_defs
        else:
            explicit_outputs = has_explicit_return_type(fn)
            output_defs = infer_output_definitions(
                '@composite_solid', self.name, fn)

        validate_solid_fn(
            '@composite_solid',
            self.name,
            fn,
            input_defs,
            exclude_nothing=False,
        )

        # every declared input is handed to the body as a placeholder node
        kwargs = {}
        for input_def in input_defs:
            kwargs[input_def.name] = InputMappingNode(input_def)

        mapping = None
        enter_composition(self.name, '@composite_solid')
        try:
            mapping = composite_mapping_from_output(
                fn(**kwargs), output_defs, self.name)
        finally:
            # always leave the composition; mapping stays None if the body raised
            context = exit_composition(mapping)

        check.invariant(
            context.name == self.name,
            'Composition context stack desync: received context for '
            '"{context.name}" expected "{self.name}"'.format(context=context,
                                                             self=self),
        )

        # line up mappings in definition order
        input_mappings = []
        for defn in input_defs:
            matching = [
                candidate for candidate in context.input_mappings
                if candidate.definition.name == defn.name
            ]
            if not matching:
                raise DagsterInvalidDefinitionError(
                    "@composite_solid '{solid_name}' has unmapped input '{input_name}'. "
                    "Remove it or pass it to the appropriate solid invocation."
                    .format(solid_name=self.name, input_name=defn.name))
            input_mappings.extend(matching)

        output_mappings = []
        for defn in output_defs:
            output_mapping = context.output_mapping_dict.get(defn.name)
            if output_mapping is not None:
                output_mappings.append(output_mapping)
                continue

            # if we inferred output_defs we will be flexible and either take a mapping or not
            if explicit_outputs:
                raise DagsterInvalidDefinitionError(
                    "@composite_solid '{solid_name}' has unmapped output '{output_name}'. "
                    "Remove it or return a value from the appropriate solid invocation."
                    .format(solid_name=self.name, output_name=defn.name))

        config_mapping = _get_validated_config_mapping(
            self.name, self.config, self.config_fn)

        return CompositeSolidDefinition(
            name=self.name,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            dependencies=context.dependencies,
            solid_defs=context.solid_defs,
            description=self.description,
            config_mapping=config_mapping,
        )
Esempio n. 20
0
    def __call__(self, *args, **kwargs):
        """Record an invocation of this solid within the current composition.

        Positional arguments are resolved to input names by position, keyword arguments by
        name; each is handed to ``_process_argument_node``. The invocation is then reported
        to the current composition context.

        Returns:
            None when the solid defines no outputs, a single InvokedSolidOutputHandle when
            it defines exactly one, otherwise a namedtuple of handles keyed by output name.

        Raises:
            DagsterInvalidDefinitionError: If more positional arguments are passed than
                inputs are defined, or a position cannot be resolved to an input name.
        """
        solid_name = self.given_alias or self.solid_def.name
        assert_in_composition(solid_name)

        input_bindings = {}
        input_mappings = {}

        # handle *args
        for position, positional_node in enumerate(args):
            if position >= len(self.solid_def.input_defs):
                raise DagsterInvalidDefinitionError(
                    'In {source} {name} received too many inputs for solid '
                    'invocation {solid_name}. Only {def_num} defined, received {arg_num}'
                    .format(
                        source=current_context().source,
                        name=current_context().name,
                        solid_name=solid_name,
                        def_num=len(self.solid_def.input_defs),
                        arg_num=len(args),
                    ))

            resolved_name = self.solid_def.resolve_input_name_at_position(
                position)
            if resolved_name is None:
                raise DagsterInvalidDefinitionError(
                    'In {source} {name} could not resolve input based on position at '
                    'index {idx} for solid invocation {solid_name}. Use keyword args instead, '
                    'available inputs are: {inputs}'.format(
                        idx=position,
                        source=current_context().source,
                        name=current_context().name,
                        solid_name=solid_name,
                        inputs=[
                            inp.name for inp in self.solid_def.input_defs
                        ],
                    ))

            self._process_argument_node(
                solid_name,
                positional_node,
                resolved_name,
                input_mappings,
                input_bindings,
                '(at position {idx})'.format(idx=position),
            )

        # then **kwargs
        for keyword_name, keyword_node in kwargs.items():
            self._process_argument_node(
                solid_name,
                keyword_node,
                keyword_name,
                input_mappings,
                input_bindings,
                '(passed by keyword)',
            )

        # the context hands back the name under which the invocation was recorded
        solid_name = current_context().observe_invocation(
            self.given_alias, self.solid_def, input_bindings, input_mappings)

        output_defs = self.solid_def.output_defs
        if len(output_defs) == 0:
            return None

        if len(output_defs) == 1:
            return InvokedSolidOutputHandle(solid_name, output_defs[0].name)

        # several outputs: expose them as attributes of a namedtuple
        output_names = [output_def.name for output_def in output_defs]
        outputs_type = namedtuple(
            '_{solid_def}_outputs'.format(solid_def=self.solid_def.name),
            output_names)
        handles_by_name = {
            output_name: InvokedSolidOutputHandle(solid_name, output_name)
            for output_name in output_names
        }
        return outputs_type(**handles_by_name)
Esempio n. 21
0
def _validate_dependencies(dependencies, solid_dict, alias_to_name):
    for from_solid, dep_by_input in dependencies.items():
        for from_input, dep_def in dep_by_input.items():
            for dep in dep_def.get_solid_dependencies():

                if from_solid == dep.solid:
                    raise DagsterInvalidDefinitionError(
                        (
                            "Invalid dependencies: circular reference detected in solid "
                            '"{from_solid}" input "{from_input}"'
                        ).format(from_solid=from_solid, from_input=from_input)
                    )

                if not from_solid in solid_dict:
                    aliased_solid = alias_to_name.get(from_solid)
                    if aliased_solid == from_solid:
                        raise DagsterInvalidDefinitionError(
                            'Invalid dependencies: solid "{solid}" in dependency dictionary not '
                            "found in solid list".format(solid=from_solid)
                        )
                    else:
                        raise DagsterInvalidDefinitionError(
                            (
                                'Invalid dependencies: solid "{aliased_solid}" (aliased by '
                                '"{from_solid}" in dependency dictionary) not found in solid list'
                            ).format(aliased_solid=aliased_solid, from_solid=from_solid)
                        )
                if not solid_dict[from_solid].definition.has_input(from_input):
                    input_list = solid_dict[from_solid].definition.input_dict.keys()
                    raise DagsterInvalidDefinitionError(
                        'Invalid dependencies: solid "{from_solid}" does not have input '
                        '"{from_input}". '.format(from_solid=from_solid, from_input=from_input)
                        + "Available inputs: {input_list}".format(input_list=input_list)
                    )

                if not dep.solid in solid_dict:
                    raise DagsterInvalidDefinitionError(
                        'Invalid dependencies: solid "{dep.solid}" not found in solid list. '
                        'Listed as dependency for solid "{from_solid}" input "{from_input}" '.format(
                            dep=dep, from_solid=from_solid, from_input=from_input
                        )
                    )

                if not solid_dict[dep.solid].definition.has_output(dep.output):
                    raise DagsterInvalidDefinitionError(
                        'Invalid dependencies: solid "{dep.solid}" does not have output '
                        '"{dep.output}". Listed as dependency for solid "{from_solid} input '
                        '"{from_input}"'.format(
                            dep=dep, from_solid=from_solid, from_input=from_input
                        )
                    )

                input_def = solid_dict[from_solid].definition.input_def_named(from_input)
                output_def = solid_dict[dep.solid].definition.output_def_named(dep.output)

                if dep_def.is_multi() and not input_def.dagster_type.supports_fan_in:
                    raise DagsterInvalidDefinitionError(
                        f'Invalid dependencies: for solid "{dep.solid}" input "{input_def.name}", the '
                        f'DagsterType "{input_def.dagster_type.display_name}" does not support fanning in '
                        "(MultiDependencyDefinition). Use the List type, since fanning in will result in a list."
                    )

                _validate_input_output_pair(input_def, output_def, from_solid, dep)
Esempio n. 22
0
    def __init__(
        self,
        solid_defs: List[NodeDefinition],
        name: str,
        description: Optional[str] = None,
        dependencies: Optional[
            Dict[Union[str, SolidInvocation], Dict[str, IDependencyDefinition]]
        ] = None,
        mode_defs: Optional[List[ModeDefinition]] = None,
        preset_defs: Optional[List[PresetDefinition]] = None,
        tags: Dict[str, Any] = None,
        hook_defs: Optional[AbstractSet[HookDefinition]] = None,
        input_mappings: Optional[List[InputMapping]] = None,
        output_mappings: Optional[List[OutputMapping]] = None,
        config_mapping: Optional[ConfigMapping] = None,
        positional_inputs: List[str] = None,
        solid_retry_policy: Optional[RetryPolicy] = None,
        _parent_pipeline_def: Optional[
            "PipelineDefinition"
        ] = None,  # https://github.com/dagster-io/dagster/issues/2115
    ):
        """Initialize the pipeline: graph structure, modes, presets and resource checks.

        After delegating the graph-level arguments to the parent initializer, this
        validates that mode names and preset names are unique, that every preset refers
        to a defined mode, and computes the resource requirements for each mode. A single
        default ``ModeDefinition()`` is used when no modes are supplied.

        Raises:
            DagsterInvalidDefinitionError: On duplicate mode names, duplicate preset names,
                or a preset that references an undefined mode.
        """
        # The list-valued arguments are checked for truthiness rather than `is not None`
        # because they get changed to [] higher in the stack for the decorator case.
        experimental_args = (
            ("input_mappings", bool(input_mappings)),
            ("output_mappings", bool(output_mappings)),
            ("config_mapping", config_mapping is not None),
            ("positional_inputs", bool(positional_inputs)),
        )
        for arg_name, was_supplied in experimental_args:
            if was_supplied:
                experimental_arg_warning(arg_name, "PipelineDefinition")

        super(PipelineDefinition, self).__init__(
            name=name,
            description=description,
            dependencies=dependencies,
            node_defs=solid_defs,
            tags=check.opt_dict_param(tags, "tags", key_type=str),
            positional_inputs=positional_inputs,
            input_mappings=input_mappings,
            output_mappings=output_mappings,
            config_mapping=config_mapping,
        )

        self._current_level_node_defs = solid_defs
        self._tags = validate_tags(tags)

        # fall back to a single default mode when none were given
        mode_definitions = check.opt_list_param(
            mode_defs, "mode_defs", of_type=ModeDefinition
        ) or [ModeDefinition()]
        self._mode_definitions = mode_definitions

        known_mode_names = set()
        for mode_def in mode_definitions:
            if mode_def.name in known_mode_names:
                raise DagsterInvalidDefinitionError(
                    (
                        'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                        "Modes must have unique names."
                    ).format(mode_name=mode_def.name, pipeline_name=self._name)
                )
            known_mode_names.add(mode_def.name)

        self._hook_defs = check.opt_set_param(hook_defs, "hook_defs", of_type=HookDefinition)
        self._solid_retry_policy = check.opt_inst_param(
            solid_retry_policy, "solid_retry_policy", RetryPolicy
        )

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs", PresetDefinition)
        self._preset_dict: Dict[str, PresetDefinition] = {}
        for preset_def in self._preset_defs:
            if preset_def.name in self._preset_dict:
                raise DagsterInvalidDefinitionError(
                    (
                        'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                        "PresetDefinitions must have unique names."
                    ).format(name=preset_def.name, pipeline_name=self._name)
                )
            if preset_def.mode not in known_mode_names:
                raise DagsterInvalidDefinitionError(
                    (
                        'PresetDefinition "{name}" in "{pipeline_name}" '
                        'references mode "{mode}" which is not defined.'
                    ).format(
                        name=preset_def.name, pipeline_name=self._name, mode=preset_def.mode
                    )
                )
            self._preset_dict[preset_def.name] = preset_def

        # resource requirements are computed (and validated) once per mode
        requirements_by_mode = {}
        for mode_def in self._mode_definitions:
            requirements_by_mode[mode_def.name] = _checked_resource_reqs_for_mode(
                mode_def,
                self._current_level_node_defs,
                self._dagster_type_dict,
                self._solid_dict,
                self._hook_defs,
                self._dependency_structure,
            )
        self._resource_requirements = requirements_by_mode

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition
        )
        self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
        self._cached_external_pipeline = None
Esempio n. 23
0
    def __new__(cls, name, source, invocations, output_mapping_dict,
                pending_invocations):
        """Assemble the finished composition from the invocations recorded in it.

        Args:
            name: Name of the composition.
            source: Description of what is being composed; passed on to the
                not-invoked warning.
            invocations: Mapping whose values expose ``node_def``, ``node_name``,
                ``input_bindings``, ``tags`` and ``hook_defs``.
            output_mapping_dict: Output mappings, forwarded unchanged.
            pending_invocations: Mapping whose values are each reported through
                ``_not_invoked_warning``.

        The new instance is constructed from, in order: the name, the list of distinct
        node definitions, the dependency dict keyed by SolidInvocation, the collected
        input mappings, and ``output_mapping_dict``.

        Raises:
            DagsterInvalidDefinitionError: If two different node definitions share a name.
        """

        dep_dict = {}
        node_def_dict = {}
        input_mappings = []

        for solid in pending_invocations.values():
            _not_invoked_warning(solid, source, name)

        for invocation in invocations.values():
            def_name = invocation.node_def.name
            # the same definition may be invoked repeatedly; a different one reusing the
            # name is a conflict
            if (def_name in node_def_dict
                    and node_def_dict[def_name] is not invocation.node_def):
                raise DagsterInvalidDefinitionError(
                    'Detected conflicting solid definitions with the same name "{name}"'
                    .format(name=def_name))
            node_def_dict[def_name] = invocation.node_def

            deps = {}
            for input_name, node in invocation.input_bindings.items():
                if isinstance(node, InvokedSolidOutputHandle):
                    deps[input_name] = DependencyDefinition(
                        node.solid_name, node.output_name)
                elif isinstance(node, InputMappingNode):
                    input_mappings.append(
                        node.input_def.mapping_to(invocation.node_name,
                                                  input_name))
                elif isinstance(node, list):
                    entries = []
                    for idx, fanned_in_node in enumerate(node):
                        if isinstance(fanned_in_node,
                                      InvokedSolidOutputHandle):
                            entries.append(
                                DependencyDefinition(
                                    fanned_in_node.solid_name,
                                    fanned_in_node.output_name))
                        elif isinstance(fanned_in_node, InputMappingNode):
                            entries.append(MappedInputPlaceholder)
                            input_mappings.append(
                                fanned_in_node.input_def.mapping_to(
                                    invocation.node_name, input_name, idx))
                        else:
                            # check.invariant with a string as its condition is always
                            # truthy and never fails, which silently dropped the entry;
                            # fail explicitly instead.
                            check.failed(
                                "Unexpected fanned in node received")

                    deps[input_name] = MultiDependencyDefinition(entries)
                elif isinstance(node, DynamicFanIn):
                    deps[input_name] = DynamicCollectDependencyDefinition(
                        node.solid_name, node.output_name)
                else:
                    check.failed(
                        "Unexpected input binding - got {node}".format(
                            node=node))

            dep_dict[SolidInvocation(
                invocation.node_def.name,
                invocation.node_name,
                tags=invocation.tags,
                hook_defs=invocation.hook_defs,
            )] = deps

        # super() takes (class, subclass-or-instance); the reversed order only worked
        # while cls was exactly CompleteCompositionContext.
        return super(CompleteCompositionContext, cls).__new__(
            cls, name, list(node_def_dict.values()), dep_dict, input_mappings,
            output_mapping_dict)
Esempio n. 24
0
def _checked_resource_reqs_for_mode(
    mode_def: ModeDefinition,
    node_defs: List[NodeDefinition],
    dagster_type_dict: Dict[str, DagsterType],
    solid_dict: Dict[str, Solid],
    pipeline_hook_defs: AbstractSet[HookDefinition],
    dependency_structure: DependencyStructure,
) -> Set[str]:
    """
    Collect every resource key the pipeline needs in this mode, verifying along the way that
    the mode provides each one.

    Collection and validation share a single traversal so that each exception can say exactly
    which solid, output, type, storage, hook or resource introduced the unmet requirement.

    Raises:
        DagsterInvalidDefinitionError: As soon as a required key is found that is not among
            the mode's resource definitions.
    """
    resource_reqs: Set[str] = set()
    mode_resources = set(mode_def.resource_defs)

    hook_error_template = (
        'Resource "{resource}" is required by hook "{hook_name}", but is not '
        'provided by mode "{mode_name}".'
    )

    def _collect_hook_reqs(hook_def):
        # shared by solid-level and pipeline-level hooks, which report identically
        for hook_resource in hook_def.required_resource_keys:
            resource_reqs.add(hook_resource)
            if hook_resource in mode_resources:
                continue
            raise DagsterInvalidDefinitionError(
                hook_error_template.format(
                    resource=hook_resource,
                    hook_name=hook_def.name,
                    mode_name=mode_def.name,
                )
            )

    # resources and IO managers required by the solid definitions themselves
    for node_def in node_defs:
        for solid_def in node_def.iterate_solid_defs():
            for required_resource in solid_def.required_resource_keys:
                resource_reqs.add(required_resource)
                if required_resource in mode_resources:
                    continue
                raise DagsterInvalidDefinitionError(
                    (
                        f'Resource "{required_resource}" is required by solid def '
                        f'{solid_def.name}, but is not provided by mode "{mode_def.name}".'
                    )
                )

            for output_def in solid_def.output_defs:
                resource_reqs.add(output_def.io_manager_key)
                if output_def.io_manager_key in mode_resources:
                    continue
                raise DagsterInvalidDefinitionError(
                    f'IO manager "{output_def.io_manager_key}" is required by output '
                    f'"{output_def.name}" of solid def {solid_def.name}, but is not '
                    f'provided by mode "{mode_def.name}".'
                )

    type_reqs = _checked_type_resource_reqs_for_mode(
        mode_def,
        dagster_type_dict,
    )
    resource_reqs.update(type_reqs)

    # Validate unsatisfied inputs can be materialized from config
    input_reqs = _checked_input_resource_reqs_for_mode(
        dependency_structure, solid_dict, mode_def
    )
    resource_reqs.update(input_reqs)

    for intermediate_storage in mode_def.intermediate_storage_defs or []:
        for storage_resource in intermediate_storage.required_resource_keys:
            resource_reqs.add(storage_resource)
            if storage_resource in mode_resources:
                continue
            raise DagsterInvalidDefinitionError(
                (
                    "Resource '{resource}' is required by intermediate storage "
                    "'{storage_name}', but is not provided by mode '{mode_name}'."
                ).format(
                    resource=storage_resource,
                    storage_name=intermediate_storage.name,
                    mode_name=mode_def.name,
                )
            )

    # hooks attached to individual solids first, then pipeline-wide hooks
    for solid in solid_dict.values():
        for solid_hook_def in solid.hook_defs:
            _collect_hook_reqs(solid_hook_def)

    for pipeline_hook_def in pipeline_hook_defs:
        _collect_hook_reqs(pipeline_hook_def)

    # resources may themselves depend on other resources
    for resource_key, resource in mode_def.resource_defs.items():
        for required_resource in resource.required_resource_keys:
            resource_reqs.add(required_resource)
            if required_resource in mode_resources:
                continue
            raise DagsterInvalidDefinitionError(
                f'Resource "{required_resource}" is required by resource at key "{resource_key}", '
                f'but is not provided by mode "{mode_def.name}"'
            )

    return resource_reqs
Esempio n. 25
0
    def _process_argument_node(self, solid_name, output_node, input_name,
                               input_bindings, arg_desc):
        """Validate one invocation argument and record it in ``input_bindings``.

        Args:
            solid_name: Name of the solid being invoked; used in error messages.
            output_node: The argument value. Accepted are an InvokedSolidOutputHandle,
                InputMappingNode or DynamicFanIn, or a list whose elements are all
                InvokedSolidOutputHandle or InputMappingNode.
            input_name: Name of the input the argument is bound to.
            input_bindings (dict): Mutated in place; receives ``output_node`` (or a new
                list of the fanned-in nodes) under ``input_name``.
            arg_desc (str): How the argument was passed; interpolated into error messages.

        Raises:
            DagsterInvalidDefinitionError: For every other argument shape, with a message
                tailored to tuples of outputs, raw dynamic outputs and un-invoked solids.
        """

        if isinstance(
                output_node,
            (InvokedSolidOutputHandle, InputMappingNode, DynamicFanIn)):
            input_bindings[input_name] = output_node

        elif isinstance(output_node, list):
            input_bindings[input_name] = []
            for idx, fanned_in_node in enumerate(output_node):
                if isinstance(fanned_in_node,
                              (InvokedSolidOutputHandle, InputMappingNode)):
                    input_bindings[input_name].append(fanned_in_node)
                else:
                    raise DagsterInvalidDefinitionError(
                        "In {source} {name}, received a list containing an invalid type "
                        'at index {idx} for input "{input_name}" {arg_desc} in '
                        "solid invocation {solid_name}. Lists can only contain the "
                        "output from previous solid invocations or input mappings, "
                        "received {type}".format(
                            source=current_context().source,
                            name=current_context().name,
                            arg_desc=arg_desc,
                            input_name=input_name,
                            solid_name=solid_name,
                            idx=idx,
                            # report the offending element, not the enclosing list
                            type=type(fanned_in_node),
                        ))

        elif isinstance(output_node, tuple) and all(
                isinstance(item, InvokedSolidOutputHandle)
                for item in output_node):
            # A namedtuple carries its output names in _fields; a plain tuple of handles
            # has no such attribute, so fall back to the handles' own output names
            # instead of failing with AttributeError.
            options = getattr(output_node, "_fields", None)
            if options is None:
                options = tuple(item.output_name for item in output_node)
            raise DagsterInvalidDefinitionError(
                "In {source} {name}, received a tuple of multiple outputs for "
                'input "{input_name}" {arg_desc} in solid invocation {solid_name}. '
                "Must pass individual output, available from tuple: {options}".
                format(
                    source=current_context().source,
                    name=current_context().name,
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                    options=options,
                ))
        elif isinstance(output_node, InvokedSolidDynamicOutputWrapper):
            raise DagsterInvalidDefinitionError(
                f"In {current_context().source} {current_context().name}, received the dynamic output "
                f"{output_node.output_name} from solid {output_node.solid_name} directly. Dynamic "
                "output must be unpacked by invoking map or collect.")

        elif isinstance(output_node, (PendingNodeInvocation, NodeDefinition)):
            raise DagsterInvalidDefinitionError(
                "In {source} {name}, received an un-invoked solid for input "
                '"{input_name}" {arg_desc} in solid invocation "{solid_name}". '
                "Did you forget parentheses?".format(
                    source=current_context().source,
                    name=current_context().name,
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                ))
        else:
            raise DagsterInvalidDefinitionError(
                "In {source} {name}, received invalid type {type} for input "
                '"{input_name}" {arg_desc} in solid invocation "{solid_name}". '
                "Must pass the output from previous solid invocations or inputs to the "
                "composition function as inputs when invoking solids during composition."
                .format(
                    source=current_context().source,
                    name=current_context().name,
                    type=type(output_node),
                    arg_desc=arg_desc,
                    input_name=input_name,
                    solid_name=solid_name,
                ))
Esempio n. 26
0
def _checked_type_resource_reqs_for_mode(
    mode_def: ModeDefinition,
    dagster_type_dict: Dict[str, DagsterType],
) -> Set[str]:
    """
    Calculate all the resource requirements related to DagsterTypes for this mode and ensure the
    mode provides those resources.
    """

    resource_reqs = set()
    mode_resources = set(mode_def.resource_defs.keys())
    for dagster_type in dagster_type_dict.values():
        for required_resource in dagster_type.required_resource_keys:
            resource_reqs.add(required_resource)
            if required_resource not in mode_resources:
                raise DagsterInvalidDefinitionError(
                    (
                        'Resource "{resource}" is required by type "{type_name}", but is not '
                        'provided by mode "{mode_name}".'
                    ).format(
                        resource=required_resource,
                        type_name=dagster_type.display_name,
                        mode_name=mode_def.name,
                    )
                )
        if dagster_type.loader:
            for required_resource in dagster_type.loader.required_resource_keys():
                resource_reqs.add(required_resource)
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by the loader on type '
                            '"{type_name}", but is not provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            type_name=dagster_type.display_name,
                            mode_name=mode_def.name,
                        )
                    )
        if dagster_type.materializer:
            for required_resource in dagster_type.materializer.required_resource_keys():
                resource_reqs.add(required_resource)
                if required_resource not in mode_resources:
                    raise DagsterInvalidDefinitionError(
                        (
                            'Resource "{resource}" is required by the materializer on type '
                            '"{type_name}", but is not provided by mode "{mode_name}".'
                        ).format(
                            resource=required_resource,
                            type_name=dagster_type.display_name,
                            mode_name=mode_def.name,
                        )
                    )

        for plugin in dagster_type.auto_plugins:
            used_by_storage = set(
                [
                    intermediate_storage_def.name
                    for intermediate_storage_def in mode_def.intermediate_storage_defs
                    if plugin.compatible_with_storage_def(intermediate_storage_def)
                ]
            )

            if used_by_storage:
                for required_resource in plugin.required_resource_keys():
                    resource_reqs.add(required_resource)
                    if required_resource not in mode_resources:
                        raise DagsterInvalidDefinitionError(
                            (
                                'Resource "{resource}" is required by the plugin "{plugin_name}"'
                                ' on type "{type_name}" (used with storages {storages}), '
                                'but is not provided by mode "{mode_name}".'
                            ).format(
                                resource=required_resource,
                                type_name=dagster_type.display_name,
                                plugin_name=plugin.__name__,
                                mode_name=mode_def.name,
                                storages=used_by_storage,
                            )
                        )
    return resource_reqs
Esempio n. 27
0
def do_composition(
    decorator_name,
    graph_name,
    fn,
    provided_input_defs,
    provided_output_defs,
    config_schema,
    config_fn,
    ignore_output_from_composition_fn,
):
    """
    Run a composition function and collect the dependency graph it describes.

    This is the shared implementation used by both @pipeline and @composite_solid; the
    composition function is our DSL for constructing a dependency graph.

    Args:
        decorator_name (str): Name of the calling decorator. e.g. "@pipeline",
            "@composite_solid", "@graph"
        graph_name (str): User-defined name of the definition being constructed
        fn (Callable): The composition function to be called.
        provided_input_defs(List[InputDefinition]): Input definitions explicitly provided to
            the decorator by the user. When None they are inferred from ``fn``.
        provided_output_defs(List[OutputDefinition]): Output definitions explicitly provided
            to the decorator by the user. When None they are inferred from ``fn``.
        config_schema(Any): Config schema provided to decorator by user.
        config_fn(Callable): Config fn provided to decorator by user.
        ignore_output_from_composition_fn(Bool): Because of backwards compatibility
            issues, pipelines ignore the return value out of the mapping if
            the user has not explicitly provided the output definitions.
            This should be removed in 0.11.0.

    Returns:
        tuple: ``(input_mappings, output_mappings, dependencies, solid_defs, config_mapping,
        positional_inputs)``.

    Raises:
        DagsterInvalidDefinitionError: If a declared input is never mapped, or if an output is
            unmapped while outputs are explicit and the return value is not being ignored.
    """

    if provided_input_defs is not None:
        actual_input_defs = provided_input_defs
    else:
        actual_input_defs = infer_input_definitions_for_graph(decorator_name, graph_name, fn)

    if provided_output_defs is not None:
        actual_output_defs = provided_output_defs
        outputs_are_explicit = True
    else:
        actual_output_defs = infer_output_definitions(decorator_name, graph_name, fn)
        outputs_are_explicit = has_explicit_return_type(fn)

    positional_inputs = validate_solid_fn(
        decorator_name, graph_name, fn, actual_input_defs, exclude_nothing=False
    )

    # Each declared input is handed to the composition function as a placeholder node.
    placeholder_inputs = {}
    for input_def in actual_input_defs:
        placeholder_inputs[input_def.name] = InputMappingNode(input_def)

    returned_mapping = None
    enter_composition(graph_name, decorator_name)
    try:
        result = fn(**placeholder_inputs)
        if ignore_output_from_composition_fn:
            # The warning is issued from this frame on purpose: stacklevel counts from here.
            if result is not None:
                warnings.warn(
                    "You have returned a value out of a @pipeline-decorated function. "
                    "This currently has no effect on behavior, but will after 0.11.0 is "
                    "released. In order to preserve existing behavior to do not return "
                    "anything out of this function. Pipelines (and its successor, graphs) "
                    "will have meaningful outputs just like composite solids do today, "
                    "and the return value will be meaningful.",
                    stacklevel=3,
                )
            result = None

        returned_mapping = composite_mapping_from_output(result, actual_output_defs, graph_name)
    finally:
        # Always pop the composition context, even when the composition function raised.
        context = exit_composition(returned_mapping)

    check.invariant(
        context.name == graph_name,
        "Composition context stack desync: received context for "
        '"{context.name}" expected "{graph_name}"'.format(
            context=context, graph_name=graph_name),
    )

    # line up mappings in definition order
    input_mappings = []
    for input_def in actual_input_defs:
        matched = [
            candidate for candidate in context.input_mappings
            if candidate.definition.name == input_def.name
        ]
        if not matched:
            raise DagsterInvalidDefinitionError(
                "{decorator_name} '{graph_name}' has unmapped input '{input_name}'. "
                "Remove it or pass it to the appropriate solid invocation.".format(
                    decorator_name=decorator_name,
                    graph_name=graph_name,
                    input_name=input_def.name,
                )
            )
        input_mappings.extend(matched)

    output_mappings = []
    for output_def in actual_output_defs:
        found = context.output_mapping_dict.get(output_def.name)
        if found is not None:
            output_mappings.append(found)
            continue

        # A missing mapping is tolerated when the output defs were inferred, or when the
        # composition function's return value is being ignored altogether.
        if not outputs_are_explicit or ignore_output_from_composition_fn:
            continue

        raise DagsterInvalidDefinitionError(
            "{decorator_name} '{graph_name}' has unmapped output '{output_name}'. "
            "Remove it or return a value from the appropriate solid invocation.".format(
                decorator_name=decorator_name,
                graph_name=graph_name,
                output_name=output_def.name,
            )
        )

    config_mapping = _get_validated_config_mapping(graph_name, config_schema, config_fn)

    return (
        input_mappings,
        output_mappings,
        context.dependencies,
        context.solid_defs,
        config_mapping,
        positional_inputs,
    )
Esempio n. 28
0
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
    ):
        """Validate the constructor arguments and build the internal state.

        Args:
            solid_defs: Solid definitions at the top level of this definition.
            name: Optional name; defaults to ``'<<unnamed>>'``.
            description: Optional description string.
            dependencies: Dependency dictionary, validated by ``validate_dependency_dict``.
            mode_defs: Optional list of ModeDefinition; a single default ModeDefinition is
                used when none are given.
            preset_defs: Optional list of PresetDefinition.

        Raises:
            DagsterInvalidDefinitionError: If two modes share a name, two presets share a
                name, or a preset references a mode that was not supplied.
        """
        self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self._description = check.opt_str_param(description, 'description')

        # Fall back to one default mode when the caller supplied none.
        self._mode_definitions = check.opt_list_param(
            mode_defs, 'mode_defs', of_type=ModeDefinition) or [ModeDefinition()]

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            'solid_defs',
            of_type=ISolidDefinition)

        # Mode names must be unique; the collected names are reused to validate presets below.
        mode_names = set()
        for mode_definition in self._mode_definitions:
            if mode_definition.name in mode_names:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_definition.name,
                        pipeline_name=self._name))
            mode_names.add(mode_definition.name)

        self._dependencies = validate_dependency_dict(dependencies)

        self._dependency_structure, self._solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._runtime_type_dict = construct_runtime_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs',
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset_def in self._preset_defs:
            preset_name = preset_def.name
            if preset_name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset_name, pipeline_name=self._name))
            if preset_def.mode not in mode_names:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset_name,
                         pipeline_name=self._name,
                         mode=preset_def.mode))
            self._preset_dict[preset_name] = preset_def

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        # Index every solid definition yielded by iterate_solid_defs() by name.
        self._all_solid_defs = {}
        for top_level_def in self._current_level_solid_defs:
            self._all_solid_defs.update(
                (nested_def.name, nested_def)
                for nested_def in top_level_def.iterate_solid_defs())
Esempio n. 29
0
def monthly_schedule(
    pipeline_name: str,
    start_date: datetime.datetime,
    name: Optional[str] = None,
    execution_day_of_month: int = 1,
    execution_time: datetime.time = datetime.time(0, 0),
    tags_fn_for_date: Optional[Callable[[datetime.datetime],
                                        Optional[Dict[str, str]]]] = None,
    solid_selection: Optional[List[str]] = None,
    mode: Optional[str] = "default",
    should_execute: Optional[Callable[["ScheduleExecutionContext"],
                                      bool]] = None,
    environment_vars: Optional[Dict[str, str]] = None,
    end_date: Optional[datetime.datetime] = None,
    execution_timezone: Optional[str] = None,
    partition_months_offset: Optional[int] = 1,
    description: Optional[str] = None,
) -> Callable[[Callable[..., Dict[str, Any]]], ScheduleDefinition]:
    """Create a schedule that runs monthly.

    The decorated function will be called as the ``run_config_fn`` of the underlying
    :py:class:`~dagster.ScheduleDefinition` and should take a
    :py:class:`~dagster.ScheduleExecutionContext` as its only argument, returning the environment
    dict for the scheduled execution.

    Args:
        pipeline_name (str): The name of the pipeline to execute when the schedule runs.
        start_date (datetime.datetime): The date from which to run the schedule.
        name (Optional[str]): The name of the schedule to create.
        execution_day_of_month (int): The day of the month on which to run the schedule (must be
            between 1 and 31).
        execution_time (datetime.time): The time at which to execute the schedule.
        tags_fn_for_date (Optional[Callable[[datetime.datetime], Optional[Dict[str, str]]]]): A
            function that generates tags to attach to the schedules runs. Takes the date of the
            schedule run and returns a dictionary of tags (string key-value pairs).
        solid_selection (Optional[List[str]]): A list of solid subselection (including single
            solid names) to execute when the schedule runs. e.g. ``['*some_solid+', 'other_solid']``
        mode (Optional[str]): The pipeline mode in which to execute this schedule.
            (Default: 'default')
        should_execute (Optional[Callable[ScheduleExecutionContext, bool]]): A function that runs at
            schedule execution time to determine whether a schedule should execute or skip. Takes a
            :py:class:`~dagster.ScheduleExecutionContext` and returns a boolean (``True`` if the
            schedule should execute). Defaults to a function that always returns ``True``.
        environment_vars (Optional[Dict[str, str]]): Any environment variables to set when executing
            the schedule.
        end_date (Optional[datetime.datetime]): The last time to run the schedule to, defaults to
            current time.
        execution_timezone (Optional[str]): Timezone in which the schedule should run. Only works
            with DagsterDaemonScheduler, and must be set when using that scheduler.
        partition_months_offset (Optional[int]): How many months back to go when choosing the partition
            for a given schedule execution. For example, when partition_months_offset=1, the schedule
            that executes during month N will fill in the partition for month N-1.
            ``None`` is treated as the default. (Default: 1)
        description (Optional[str]): A human-readable description of the schedule.

    Raises:
        DagsterInvalidDefinitionError: If ``execution_day_of_month`` is not between 1 and 31.
    """
    check.opt_str_param(name, "name")
    check.inst_param(start_date, "start_date", datetime.datetime)
    check.opt_inst_param(end_date, "end_date", datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, "tags_fn_for_date")
    check.opt_nullable_list_param(solid_selection,
                                  "solid_selection",
                                  of_type=str)
    mode = check.opt_str_param(mode, "mode", DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, "should_execute")
    check.opt_dict_param(environment_vars,
                         "environment_vars",
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, "pipeline_name")
    # Report the parameter under its real name so a type error points at the right argument.
    check.int_param(execution_day_of_month, "execution_day_of_month")
    check.inst_param(execution_time, "execution_time", datetime.time)
    check.opt_str_param(execution_timezone, "execution_timezone")
    check.opt_int_param(partition_months_offset, "partition_months_offset")
    check.opt_str_param(description, "description")

    if partition_months_offset is None:
        # None passes validation (the parameter is Optional) but cannot be used in the date
        # arithmetic below, so fall back to the documented default.
        partition_months_offset = 1

    if (start_date.day != 1 or start_date.hour != 0 or start_date.minute != 0
            or start_date.second != 0):
        warnings.warn(
            "`start_date` must be at the beginning of the first day of the month for a monthly "
            "schedule. Use `execution_day_of_month` and `execution_time` to execute the schedule "
            "at a specific time within the month. For example, to run the schedule at 3AM on the "
            "23rd of each month starting in October, your schedule definition would look like:"
            """
@monthly_schedule(
    start_date=datetime.datetime(2020, 10, 1),
    execution_day_of_month=23,
    execution_time=datetime.time(3, 0)
)
def my_schedule_definition(_):
    ...
""")

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be "
            "between 1 and 31".format(execution_day_of_month))

    cron_schedule = "{minute} {hour} {day} * *".format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    fmt = DEFAULT_MONTHLY_FORMAT

    # Map an execution time to its partition: zero the hour and minute, then step back by the
    # month offset and by the days elapsed since the first of the month.
    execution_time_to_partition_fn = (
        lambda d: pendulum.instance(d).replace(hour=0, minute=0).subtract(
            months=partition_months_offset, days=execution_day_of_month - 1))

    partition_fn = schedule_partition_range(
        start_date,
        end=end_date,
        cron_schedule=cron_schedule,
        fmt=fmt,
        timezone=execution_timezone,
        execution_time_to_partition_fn=execution_time_to_partition_fn,
        inclusive=(partition_months_offset == 0),
    )

    def inner(fn: Callable[..., Dict[str, Any]]) -> ScheduleDefinition:
        check.callable_param(fn, "fn")

        # The schedule takes the decorated function's name unless one was given explicitly.
        schedule_name = name or fn.__name__

        tags_fn_for_partition_value: Callable[["Partition"], Optional[Dict[
            str, str]]] = lambda partition: {}
        if tags_fn_for_date:
            tags_fn = cast(
                Callable[[datetime.datetime], Optional[Dict[str, str]]],
                tags_fn_for_date)
            tags_fn_for_partition_value = lambda partition: tags_fn(partition.
                                                                    value)

        partition_set = PartitionSetDefinition(
            name="{}_partitions".format(schedule_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            run_config_fn_for_partition=lambda partition: fn(partition.value),
            solid_selection=solid_selection,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
            partition_selector=create_offset_partition_selector(
                execution_time_to_partition_fn=execution_time_to_partition_fn),
            execution_timezone=execution_timezone,
            description=description,
        )

    return inner
Esempio n. 30
0
def monthly_schedule(
    pipeline_name,
    start_date,
    name=None,
    execution_day_of_month=1,
    execution_time=datetime.time(0, 0),
    tags_fn_for_date=None,
    solid_subset=None,
    mode="default",
    should_execute=None,
    environment_vars=None,
):
    """Create a decorator that turns a function into a monthly schedule.

    The decorated function is called with the value of each partition and its return value is
    used as the environment dict for that partition. Partitions are produced by
    ``date_partition_range`` in one-month steps from ``start_date``, formatted as ``%Y-%m``.

    Args:
        pipeline_name (str): Name of the pipeline the schedule targets. The partition set is
            named ``'<pipeline_name>_monthly'``.
        start_date (datetime.datetime): Start of the partition range.
        name (Optional[str]): Schedule name; defaults to the decorated function's ``__name__``.
        execution_day_of_month (int): Day of the month used in the cron schedule (must be
            between 1 and 31).
        execution_time (datetime.time): Supplies the hour and minute of the cron schedule.
        tags_fn_for_date (Optional[Callable]): Called with a partition's value to produce the
            tags for that partition; when omitted, every partition gets an empty tag dict.
        solid_subset (Optional[List[str]]): Passed through to the partition set.
        mode (Optional[str]): Pipeline mode for the partition set; ``None`` falls back to
            ``DEFAULT_MODE_NAME``.
        should_execute (Optional[Callable]): Passed through to the schedule definition.
        environment_vars (Optional[Dict[str, str]]): Passed through to the schedule definition.

    Returns:
        Callable: A decorator returning the result of
        ``PartitionSetDefinition.create_schedule_definition``.

    Raises:
        DagsterInvalidDefinitionError: If ``execution_day_of_month`` is not between 1 and 31.
    """
    check.opt_str_param(name, 'name')
    check.inst_param(start_date, 'start_date', datetime.datetime)
    check.opt_callable_param(tags_fn_for_date, 'tags_fn_for_date')
    check.opt_nullable_list_param(solid_subset, 'solid_subset', of_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_callable_param(should_execute, 'should_execute')
    check.opt_dict_param(environment_vars,
                         'environment_vars',
                         key_type=str,
                         value_type=str)
    check.str_param(pipeline_name, 'pipeline_name')
    # Report the parameter under its real name so a type error points at the right argument.
    check.int_param(execution_day_of_month, 'execution_day_of_month')
    check.inst_param(execution_time, 'execution_time', datetime.time)

    if execution_day_of_month <= 0 or execution_day_of_month > 31:
        raise DagsterInvalidDefinitionError(
            "`execution_day_of_month={}` is not valid for monthly schedule. Execution day must be between 1 and 31"
            .format(execution_day_of_month))

    cron_schedule = '{minute} {hour} {day} * *'.format(
        minute=execution_time.minute,
        hour=execution_time.hour,
        day=execution_day_of_month)

    partition_fn = date_partition_range(start_date,
                                        delta=relativedelta(months=1),
                                        fmt="%Y-%m")

    def inner(fn):
        check.callable_param(fn, 'fn')

        # The schedule takes the decorated function's name unless one was given explicitly.
        schedule_name = name or fn.__name__

        tags_fn_for_partition_value = lambda partition: {}
        if tags_fn_for_date:
            tags_fn_for_partition_value = lambda partition: tags_fn_for_date(
                partition.value)

        partition_set = PartitionSetDefinition(
            name='{}_monthly'.format(pipeline_name),
            pipeline_name=pipeline_name,
            partition_fn=partition_fn,
            environment_dict_fn_for_partition=lambda partition: fn(partition.
                                                                   value),
            solid_subset=solid_subset,
            tags_fn_for_partition=tags_fn_for_partition_value,
            mode=mode,
        )

        return partition_set.create_schedule_definition(
            schedule_name,
            cron_schedule,
            should_execute=should_execute,
            environment_vars=environment_vars,
        )

    return inner