Example #1
0
 def t_pipe():
     """Create the "outputs" folder target of the current task and compute a value in it."""
     task = current_task()
     outputs_dir = task.get_target("outputs", config=TargetConfig(folder=True))
     outputs_dir.mkdir()
     # the value is calculated against the "f1" entry inside the outputs folder
     f1_path = os.path.join(str(outputs_dir), "f1")
     return calc_value(output_path=f1_path)
Example #2
0
    def __init__(self, task_run):
        """
        Prepare the log file targets for a single task run attempt.

        :param task_run: the task run whose attempt number names the log files.
        """
        super(TaskRunLogManager, self).__init__(task_run)

        attempt = task_run.attempt_number

        # local log of this attempt: "<attempt>.log" under the local task run root
        self.local_log_file = self.task_run.local_task_run_root.partition(
            name="%s.log" % attempt
        )  # type: FileTarget

        # dedicated spark log exists only when DBND__LOG_SPARK is enabled
        self.local_spark_log_file = (
            self.task_run.local_task_run_root.partition(
                name="%s-spark.log" % attempt
            )
            if environ_enabled("DBND__LOG_SPARK")
            else None
        )

        # remote copy of the log is defined for non-local environments only
        if isinstance(self.task.task_env, LocalEnvConfig):
            self.remote_log_file = None
        else:
            self.remote_log_file = self.task_run.attempt_folder.partition(
                name=str(attempt),
                config=TargetConfig().as_file().txt,
                extension=".log",
            )  # type: FileTarget

        # file handler for task log
        # if set -> we are in the context of capturing
        self._log_task_run_into_file_active = False
 def my_second_target(self, pandas_data_frame):
     """
     Write ``pandas_data_frame`` into the "other_file.parquet" target and return that target.

     The target is created on a ``PseudoLocalFileSystem`` with a config that
     requires local access.
     """
     target_kwargs = dict(
         fs=PseudoLocalFileSystem(),
         config=TargetConfig().with_require_local_access(),
     )
     parquet_target = self.target("other_file.parquet", **target_kwargs)
     parquet_target.write_df(pandas_data_frame)
     return parquet_target
Example #4
0
    def __init__(self, targets, properties=None, source=None):
        """
        Wrap several targets into one.

        The multi-target inherits the config of its sub-targets when all
        sub-targets that expose a ``config`` attribute share exactly one
        distinct config; otherwise a default ``TargetConfig`` is used.

        :param targets: the wrapped targets (may be empty or ``None``).
        :param properties: forwarded to the base class.
        :param source: forwarded to the base class.
        """
        super(MultiTarget, self).__init__(properties=properties, source=source)
        self._targets = targets

        # Only sub-targets that actually have a config take part in the decision.
        distinct_configs = (
            {t.config for t in targets if hasattr(t, "config")} if targets else set()
        )
        if len(distinct_configs) == 1:
            # Take the config from the set itself rather than from targets[0]:
            # the first target is not guaranteed to have a ``config`` attribute.
            (self.config,) = distinct_configs
        else:
            self.config = TargetConfig()
Example #5
0
 def get_multi_target_from_config_ids(config_ids):
     """
     Combine the per-config folder targets ``configID=<id>`` under ``base_target``.

     :param config_ids: iterable of config ids, one folder target is built per id.
     :return: the result of ``data_combine`` over all built targets.
     """
     # multi-target config is based on the first target in the list
     # we use csv format, as spark doesn't have explicit tsv support
     shared_config = TargetConfig(folder=True, format=FileFormat.csv, flag=None)
     partitions = [
         target(base_target, f"configID={config_id}", config=shared_config)
         for config_id in config_ids
     ]
     return data_combine(partitions)
Example #6
0
    def target(self, name, config=None, output_ext=None, output_mode=None):
        """
        Build a target named ``name`` under the output path of ``self.task``.

        :param name: name passed to ``calculate_path``.
        :param config: target config; a default ``TargetConfig`` is used when falsy.
        :param output_ext: extension passed to ``calculate_path``.
        :param output_mode: selects the task output path format.
        :return: the target created for the calculated path.
        """
        owner = self.task
        if not config:
            config = TargetConfig()
        pattern = owner._get_task_output_path_format(output_mode)

        resolved_path = calculate_path(
            task=owner,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=pattern,
        )
        return target(resolved_path, config=config)
Example #7
0
File: task.py Project: lbtanh/dbnd
    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        """
        Build a target under this task's output path.

        :param name: target name; when falsy a random ``tmp/dbnd-tmp-<9 digits>`` name is generated.
        :param config: target config; a default ``TargetConfig`` is used when falsy.
        :param output_ext: extension passed to ``calculate_path``.
        :param output_mode: selects the task output path format.
        :return: the target created for the calculated path.
        """
        if not name:
            name = "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        if not config:
            config = TargetConfig()
        pattern = self._get_task_output_path_format(output_mode)

        resolved_path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=pattern,
        )
        return target(resolved_path, config=config)
Example #8
0
    def __init__(self, obj=_NOTHING, path=None, value_type=None, **kwargs):
        # type: (Any, str, ValueType, **Any) -> None
        """
        Create a target that keeps its value in memory.

        :param obj: the wrapped object; may be omitted only when ``path`` is given.
        :param path: explicit target path; when falsy a ``memory://`` path is
            derived from the value type and the object's signature.
        :param value_type: value type of ``obj``; detected from ``obj`` when falsy.
        :raises Exception: when neither ``obj`` nor ``path`` is provided.
        """
        from targets.values import get_value_type_of_obj

        super(InMemoryTarget, self).__init__(**kwargs)
        self._obj = obj
        if obj is _NOTHING and not path:
            raise Exception("InMemoryTarget requires object or path")

        if not value_type:
            value_type = get_value_type_of_obj(self._obj)
        self.value_type = value_type

        if path:
            self.path = path
        else:
            self.path = "memory://%s:%s" % (
                self.value_type,
                self.value_type.to_signature(self._obj),
            )
        self.config = TargetConfig()
Example #9
0
 def _target_config(self):
     """Return the parameter's target config, or a default ``TargetConfig`` when it is falsy."""
     configured = self.parameter.target_config
     return configured if configured else TargetConfig()
Example #10
0
class ParameterDefinition(object):  # generics are broken: typing.Generic[T]
    """
    Parameter whose value is a ``str``, and a base class for other parameter types.

    Parameters are objects set on the Task class level to make it possible to parameterize tasks.
    For instance::

        class MyTask(dbnd.Task):
            foo = databand.parameter[str]

        class RequiringTask(dbnd.Task):
            def requires(self):
                return MyTask(foo="hello")

            def run(self):
                print(self.requires().foo)  # prints "hello"

    This makes it possible to instantiate multiple tasks, eg ``MyTask(foo='bar')`` and
    ``MyTask(foo='baz')``. The task will then have the ``foo`` attribute set appropriately.

    When a task is instantiated, it will first use any argument as the value of the parameter, eg.
    if you instantiate ``a = TaskA(x=44)`` then ``a.x == 44``. When the value is not provided, the
    value  will be resolved in this order of falling priority:

        * Any value provided on the command line:

          - To the root task (eg. ``--param xyz``)

          - Then to the class, using the qualified task name syntax (eg. ``--TaskA-param xyz``).

        * With ``[TASK_NAME]>PARAM_NAME: <serialized value>`` syntax.

        * Any default value set using the ``default`` flag.

    """

    default_description = ""
    default_input_description = "data input"
    default_output_description = "data output"

    _total_counter = (
        0  # non-atomically increasing counter used for ordering parameters.
    )

    default = attr.ib(default=NOTHING)

    name = attr.ib(default=None)

    # value type and sub type
    value_type = attr.ib(default=None)  # type: ValueType
    value_type_defined = attr.ib(default=None)  # type: ValueType
    sub_type = attr.ib(default=None)

    description = attr.ib(default=NOTHING)  # type: str
    config_path = attr.ib(default=None)  # type: Optional[ConfigPath]
    disable_jinja_templating = attr.ib(default=False)  # type: bool
    require_local_access = attr.ib(default=False)  # type: bool
    env_interpolation = attr.ib(default=True)
    # parameter kind
    significant = attr.ib(default=True)  # type: bool
    scope = attr.ib(default=ParameterScope.task)
    from_task_env_config = attr.ib(default=False)
    system = attr.ib(default=False)
    kind = attr.ib(default=_ParameterKind.task_input)

    # output configuration
    output_name = attr.ib(default=None)  # type: str
    output_ext = attr.ib(default=None)  # type: str
    output_mode = attr.ib(default=OutputMode.regular)
    # used for output factories only
    output_factory = attr.ib(default=None)

    target_config = attr.ib(default=TargetConfig())
    load_options = attr.ib(factory=dict)  # type: Dict[FileFormat, Dict[str, Any]]
    save_options = attr.ib(factory=dict)  # type: Dict[FileFormat, Dict[str, Any]]

    validator = attr.ib(default=None)
    choices = attr.ib(default=None)

    load_on_build = attr.ib(default=NOTHING)  # type: bool
    empty_default = attr.ib(default=NOTHING)

    # value preview and meta settings
    log_preview = attr.ib(default=None)  # type: Optional[bool]
    log_preview_size = attr.ib(default=None)  # type: Optional[int]
    log_schema = attr.ib(default=None)  # type: Optional[bool]
    log_size = attr.ib(default=None)  # type: Optional[bool]
    log_stats = attr.ib(default=None)  # type: Optional[bool]
    log_histograms = attr.ib(default=None)  # type: Optional[bool]

    log_meta = attr.ib(
        default=True
    )  # type: bool  # log all (can disable whole value log)

    # ParameterDefinition ownership
    task_definition = attr.ib(default=None)  # type: TaskDefinition
    parameter_origin = attr.ib(default=None)
    parameter_id = attr.ib(default=1)

    value_meta_conf = attr.ib(default=None)  # type: ValueMetaConf
    hidden = attr.ib(default=False)  # type: bool

    @property
    def group(self):
        """Parameter group: ``system`` for system parameters, ``user`` otherwise."""
        if self.system:
            return ParameterGroup.system
        return ParameterGroup.user

    @property
    def task_definition_uid(self):
        """UID of the owning task definition, or ``None`` when the parameter has no owner."""
        owner = self.task_definition
        return owner.task_definition_uid if owner else None

    def evolve_with_owner(self, task_definition, name):
        """
        Return a copy of this parameter bound to ``task_definition`` under ``name``.

        A warning is logged when the parameter already has an owner and is
        re-bound under a different name. ``parameter_origin`` is kept when it is
        already set; otherwise the new owner becomes the origin.

        :param task_definition: the task definition that owns the copy.
        :param name: the name of the parameter at the new owner.
        :return: a new parameter definition created with ``attr.evolve``.
        """
        if self.task_definition and self.name != name:
            # old name first, new name second, to match the "from ... to ..." wording
            logger.warning(
                "Name of parameter has been changed from '%s' to '%s' at %s",
                self.name,
                name,
                task_definition,
            )
        parameter_origin = self.parameter_origin or task_definition
        return attr.evolve(
            self,
            task_definition=task_definition,
            name=name,
            parameter_origin=parameter_origin,
        )

    def parse_from_str(self, x):  # type: (str) -> T
        """
        Parse a single value from its input form.

        Delegates to :py:meth:`calc_init_value`.

        :param str x: the value to parse.
        :return: the parsed value.
        """
        parsed = self.calc_init_value(x)
        return parsed

    def calc_init_value(self, value):
        """
        Convert a raw value into the value stored for this parameter at build time.

        * ``None`` is returned unchanged.
        * a ``Path`` is wrapped into a target using this parameter's target config.
        * a ``Target`` is returned unchanged (deferred result, loaded later).
        * anything else is parsed by the value type; strings are first expanded
          with environment variables when ``env_interpolation`` is on.

        :param value: the raw value.
        :return: the value to keep for this parameter.
        """
        if value is None:
            # TODO: may be we still can "normalize" the value
            return None
        if isinstance(value, Path):
            return target(str(value), config=self.target_config)
        if isinstance(value, Target):
            # deferred result - it is loaded later
            return value

        # the value is processed regardless of "require_parse"
        should_expand = self.env_interpolation and isinstance(
            value, six.string_types
        )
        if should_expand:
            try:
                value = expand_env_var(value)
            except Exception as ex:
                # best effort: keep the unexpanded value
                logger.warning(
                    "failed to expand variable '%s' : %s", safe_string(value), str(ex)
                )

        # an output with a value is a Target or a str to be converted into a target,
        # so outputs are never loaded here
        should_load = self.load_on_build and not self.is_output()

        return self.value_type.parse_value(
            value, load_value=should_load, target_config=self.target_config
        )

    def calc_runtime_value(self, value, task):
        """
        Convert a stored parameter value into the value a task sees at runtime.

        :param value: the stored value (may be a target or a structure of targets).
        :param task: the task the value is calculated for; used for value
            logging and error reporting.
        :return: the runtime value.
        :raises: a friendly error when reading a target or a structure fails.
        """
        if value is None:
            return value

        value_type = self.value_type
        if isinstance(value_type, _TargetValueType):
            # "target" types are converted into the user friendly format
            # for inputs and outputs alike, so param = output[Path] gets a Path
            return traverse(value, value_type.target_to_value)

        if self.is_output():
            # outputs are not loaded on read
            return value

        if isinstance(value, Target):
            try:
                loaded = self.load_from_target(value)
                if self.is_input():
                    self._log_parameter_value(loaded, value, task)
                return loaded
            except Exception as ex:
                raise friendly_error.failed_to_read_target_as_task_input(
                    ex=ex, task=task, parameter=self, target=value
                )

        is_typed_structure = (
            isinstance(value_type, _StructureValueType) and value_type.sub_value_type
        )
        if not is_typed_structure:
            return value

        def load_with_preview(val):
            runtime_val = value_type.sub_value_type.load_runtime(val)
            if self.is_input() and isinstance(val, Target):
                # Optimisation opportunity: log all targets in a single call
                self._log_parameter_value(runtime_val, val, task)
            return runtime_val

        try:
            return traverse(value, convert_f=load_with_preview)
        except Exception as ex:
            raise friendly_error.failed_to_read_task_input(
                ex=ex, task=task, parameter=self, target=value
            )

    def to_str(self, x):  # type: (T) -> str
        """
        Opposite of :py:meth:`parse_from_str`.

        Serializes the value ``x`` into a string. Targets are serialized with
        ``str()``, everything else by the value type.

        :param x: the value to serialize.
        """
        if not isinstance(x, Target):
            return self.value_type.to_str(x)  # default impl
        return str(x)

    def to_repr(self, x):  # type: (T) -> str
        """Return the representation of ``x`` as produced by the value type."""
        value_type = self.value_type
        return value_type.to_repr(x)

    def signature(self, x):
        """
        Return the signature string of the value ``x``.

        ``None`` and targets are represented by ``str(x)``; any other value is
        handled by the value type.
        """
        if x is None or isinstance(x, Target):
            return str(x)

        # what is left is one of:
        # 1. a value of value_type
        # 2. target with value type TargetValueType
        # 3. list/dict of targets with value type TargetValueType
        return self.value_type.to_signature(x)

    def load_from_target(
        self, target, **kwargs
    ):  # type: (ParameterDefinition, FileTarget, **Any)-> T
        """
        Load, validate and return the value stored in ``target``.

        In-memory targets are loaded directly. For other targets the
        ``load_options`` registered for the target's format are merged into
        ``kwargs`` before the value type loads the target.

        :param target: the target to read.
        :param kwargs: extra options passed to the value type loader.
        :return: the loaded value.
        """
        from targets import InMemoryTarget

        if isinstance(target, InMemoryTarget):
            loaded = target.load()
        else:
            file_format = target.config.format if target.config else None
            if file_format and file_format in self.load_options:
                kwargs.update(**self.load_options[file_format])
            loaded = self.value_type.load_from_target(target, **kwargs)

        self.validate(loaded)

        # remember which target the value came from
        self._store_value_origin_target(loaded, target)

        return loaded

    def dump_to_target(
        self, target, value, **kwargs
    ):  # type: (DataTarget, T, **Any)-> None
        """
        Save ``value`` into ``target`` using the value type.

        The ``save_options`` registered for the target's format are merged into
        ``kwargs`` before saving.

        :param target: the target to write to.
        :param value: the value to save.
        :param kwargs: extra options passed to the value type saver.
        """
        # A target may have no ``config`` attribute at all, or have it set to None;
        # both cases mean "no format specific options".
        config = getattr(target, "config", None)
        if config is not None:
            f = config.format
            if f and f in self.save_options:
                kwargs.update(**self.save_options[f])

        self.value_type.save_to_target(target, value, **kwargs)  # default impl

        # we need updated target
        self._store_value_origin_target(value, target)

    def _log_parameter_value(self, runtime_value, value, task):
        """
        Report a successful read of this parameter to the tracker of the current task run.

        Nothing is logged when there is no databand run or the task has no
        current task run.
        """
        if not (try_get_databand_run() and task.current_task_run):
            return

        tracker = task.current_task_run.tracker
        tracker.log_parameter_data(
            parameter=self,
            target=value,
            value=runtime_value,
            operation_type=DbndTargetOperationType.read,
            operation_status=DbndTargetOperationStatus.OK,
        )

    def _store_value_origin_target(self, value, target):
        """Register ``target`` as the origin of ``value`` in the current databand run, if there is one."""
        dbnd_run = try_get_databand_run()
        if dbnd_run:
            dbnd_run.target_origin.add(target, value, self.value_type)

    def normalize(self, x):  # type: (T) -> T
        """
        Normalize a parsed parameter value.

        The value may come from parsing, from the default value or from the
        arguments passed into the task's constructor.

        What happens is implementation defined, but it can be used to
        validate/clamp values. For example, to accept only even integers and
        "correct" odd values to the nearest integer, normalize can be
        implemented as ``x // 2 * 2``.
        """
        if not isinstance(self.value_type, _TargetValueType):
            return self.value_type.normalize(x)

        # can not move to value_type, we need target_config
        from dbnd._core.utils.task_utils import to_targets

        return to_targets(x, from_string_kwargs=dict(config=self.target_config))

    def validate(self, x):
        """Run the configured validator (if any) against the value ``x``."""
        validator = self.validator
        if not validator:
            return
        validator.validate(self, x)

    def as_str_input(self, value):
        """
        Serialize ``value`` into its string input form.

        ``None`` becomes ``"@none"``. A target given for a parameter that is
        loaded on build is prefixed with ``"@target:"``.
        """
        if value is None:
            return "@none"

        serialized = self.to_str(value)
        if isinstance(value, Target) and self.load_on_build:
            # this is non-data parameter, it's int/str/bool
            # something should be loaded, however, it's still a Target
            return "@target:%s" % serialized
        return serialized

    def next_in_enumeration(self, value):
        """
        Return the value that follows ``value`` in the enumeration of the value type.

        Used by the :py:mod:`databand.execution_summary` module for pretty printing:
        tasks like ``MyTask(num=1), MyTask(num=2), MyTask(num=3)`` can be shown as
        ``MyTask(num=1..3)`` when the parameter type has an enumerable ordering.

        :param value: The value
        :return: The next value, like "value + 1". Or ``None`` if there's no enumerable ordering.
        """
        value_type = self.value_type
        return value_type.next_in_enumeration(value)  # default impl

    def _get_help_message(self, sections=None):
        """
        Build a help text listing the ways the parameter value can be set.

        :param sections: config section names to mention; defaults to the task family.
        :return: the help message.
        """
        sections = sections or [self.task_family]

        # "%s" formatting keeps the message buildable when a section is not a string:
        # task_family is None for a parameter without a task definition, and a
        # TypeError raised here would hide the error this message is attached to.
        sections_str = " | ".join("%s" % section for section in sections)

        define_via = [
            "project.cfg : [%s]%s=VALUE" % (sections_str, self.name),
            "cli:   --set %s.%s=VALUE" % (self.task_family, self.name),
            "constructor: %s(%s=VALUE, ...)" % (self.task_family, self.name),
        ]
        methods = "\n".join("\t* %s" % line for line in define_via)

        return "You can change '{task_family}.{name}' value using one of the following methods: \n {methods}".format(
            task_family=(self.task_family), name=self.name, methods=methods
        )

    def parameter_exception(self, reason, ex):
        """
        Log ``ex`` and raise a ``DatabandBuildError`` describing the failed action.

        :param reason: the action that failed, inserted into "Failed to <reason> ...".
        :param ex: the original exception, attached as a nested exception.
        :raises DatabandBuildError: always.
        """
        message = "Failed to {reason} for parameter '{name}' at {task_family}()".format(
            reason=reason, name=self.name, task_family=self.task_family
        )
        log_exception(message, ex, logger)
        raise DatabandBuildError(
            message, nested_exceptions=[ex], help_msg=self._get_help_message()
        )

    @property
    def task_family(self):
        """Task family of the owning task definition, or ``None`` without an owner."""
        owner = self.task_definition
        return owner.task_family if owner else None

    @property
    def task_config_section(self):
        """Config section of the owning task definition, or ``None`` without an owner."""
        owner = self.task_definition
        return owner.task_config_section if owner else None

    def __repr__(self):
        """
        Render as ``<owner>.<name>(<kind>[<value type>]<origin>)``.

        The origin part (`` at <family>``) is shown only when the parameter has
        an owner and its origin task family differs from the owner's.
        """
        owner = self.task_definition
        owned_by = ""
        origin_suffix = ""  # show it only if different

        if owner:
            owned_by = owner.task_family
            origin = self.parameter_origin
            origin_family = origin.task_family if origin else ""
            if origin_family and origin_family != owned_by:
                origin_suffix = " at %s" % origin_family

        if self.kind == _ParameterKind.task_output:
            kind_label = "output"
        else:
            kind_label = "parameter"

        return "{owned_by}.{name}({parameter_kind}[{value_type}]{parameter_origin})".format(
            owned_by=owned_by,
            parameter_origin=origin_suffix,
            value_type=self.value_type_str,
            parameter_kind=kind_label,
            name=self.name or "_unknown_",
        )

    @property
    def value_type_str(self):
        """
        Short textual form of the value type.

        ``"unknown"`` when there is no value type. Otherwise the type string,
        prefixed with ``!`` for inline value types and then with ``*`` when the
        value type differs from the defined one.
        """
        value_type = self.value_type
        if value_type is None:
            return "unknown"

        label = value_type.type_str
        if isinstance(value_type, InlineValueType):
            label = "!" + label
        if self.value_type_defined != value_type:
            label = "*" + label
        return label

    def _target_source(self, task):
        """Describe this parameter of ``task`` as a ``TargetSource``."""
        own_name = self.name
        return TargetSource(
            task_id=task.task_id, parameter_name=own_name, name=own_name
        )

    def build_target(self, task):  # type: (ParameterDefinition, Task) -> DataTarget
        """
        Build the output target of this parameter for ``task``.

        When the parameter's target config has no format, format and compression
        are taken from the task output settings for the parameter's value type.
        The extension is ``output_ext`` when set, otherwise it is derived from
        the resulting config.

        :param task: the task the target is built for.
        :return: the target returned by ``task.get_target``.
        """
        config = self.target_config
        if not config.format:
            defaults = task.settings.output.get_value_target_config(self.value_type)
            # for now we take only format and compression from config
            config = config.with_format(defaults.format).with_compression(
                defaults.compression
            )

        extension = self.output_ext
        if extension is None:
            extension = config.get_ext()

        return task.get_target(
            name=self.output_name or self.name,
            output_ext=extension,
            config=config,
            output_mode=self.output_mode,
        )

    def build_output(self, task):
        """
        Build the output of this parameter for ``task``.

        Order of precedence:

        1. ``output_factory`` when defined (failures are logged and re-raised);
        2. an in-memory target for non-system parameters other than
           ``task_band`` when the task keeps outputs in memory;
        3. a regular target built by :py:meth:`build_target`.

        :param task: the task the output is built for.
        :return: the built output.
        """
        factory = self.output_factory
        if factory is not None:
            try:
                return factory(task, self)
            except Exception:
                logger.exception(
                    "Failed to created task output %s for %s : "
                    " output_factory expected signature is '(Task, Parameter) -> Target(any structure) '",
                    self,
                    task,
                )
                raise

        keep_in_memory = (
            not self.system
            and self.name != "task_band"
            and task.task_in_memory_outputs
        )
        if keep_in_memory:
            memory_path = "memory://{value_type}:{task}.{p_name}".format(
                value_type=self.value_type, task=task.task_id, p_name=self.name
            )
            return InMemoryTarget(path=memory_path)

        # build_target is a separate function,
        # so it can be called from an output_factory implementation
        try:
            return self.build_target(task)
        except Exception as e:
            raise friendly_error.task_build.failed_to_build_output_target(
                self.name, task, e
            )

    def is_input(self):
        """Tell whether this parameter is a task input."""
        kind = self.kind
        return kind == _ParameterKind.task_input

    def is_output(self):
        """Tell whether this parameter is a task output."""
        kind = self.kind
        return kind == _ParameterKind.task_output

    def __hash__(self):
        """Hash by parameter name combined (xor) with the owning task definition."""
        name_hash = hash(self.name)
        owner_hash = hash(self.task_definition)
        return name_hash ^ owner_hash

    def modify(self, **kwargs):
        """
        Return a copy of the parameter with the given attributes replaced.

        Without any changes the parameter itself is returned.
        """
        return attr.evolve(self, **kwargs) if kwargs else self

    def get_env_key(self, section):
        """Return the environment key for this parameter in ``section`` (both upper-cased)."""
        section_key = section.upper()
        name_key = self.name.upper()
        return PARAM_ENV_TEMPLATE.format(S=section_key, K=name_key)

    def get_value_meta(self, value, meta_conf):
        """
        Return the value meta of ``value`` as calculated by the value type.

        ``meta_conf`` should not be used directly, it is expected to be already
        merged with the main config.
        """
        value_type = self.value_type
        return value_type.get_value_meta(value, meta_conf=meta_conf)

    def update_value_meta_conf_from_runtime_value(self, value, tracking_config):
        # type: (Any, TrackingConfig) -> ParameterDefinition
        """
        Return a copy of the parameter whose ``value_meta_conf`` is resolved for ``value``.

        The value type is detected from the runtime value, with a plain
        ``ValueType()`` passed as the second argument of the detection.
        """
        runtime_value_type = get_value_type_of_obj(value, ValueType())
        resolved_conf = tracking_config.get_value_meta_conf(
            self.value_meta_conf, runtime_value_type
        )
        return self.modify(value_meta_conf=resolved_conf)
Example #11
0

# 1. create file extension: register "z" and keep the returned object
z_file_ext = register_file_extension("z")


class JoblibSizedMessageMarshaller(Marshaller):
    """Marshaller that reads and writes values with joblib, using the file name of the opened target."""

    def target_to_value(self, target, **kwargs):
        """Load and return the value stored in ``target``."""
        with target.open() as handle:
            return joblib.load(handle.name)

    def value_to_target(self, value, target, **kwargs):
        """Dump ``value`` into ``target``."""
        with target.open("w") as handle:
            joblib.dump(value, handle.name)


# 2. register type to extension mapping:
#    (SizedMessage, z_file_ext) is associated with a JoblibSizedMessageMarshaller instance
register_marshaller(SizedMessage, z_file_ext, JoblibSizedMessageMarshaller())


@task(result=output.target_config(TargetConfig(format=z_file_ext)))
def dump_as_joblib():
    # type: ()-> SizedMessage
    """Return a sample ``SizedMessage``; the task result is configured with the ``z_file_ext`` format."""
    message = SizedMessage("example message \n", 10)
    return message


@task(result=output.txt[int])
def load_as_joblib(sized_message: SizedMessage):
    """Return ``sized_message.msg`` multiplied by ``sized_message.size``."""
    msg, size = sized_message.msg, sized_message.size
    return msg * size