def t_pipe():
    """Create the "outputs" folder target of the current task and compute a value into it.

    The folder target is requested from the current task, created via
    ``mkdir()``, and ``calc_value`` is called with the path of the ``f1``
    entry inside that folder.

    :return: whatever ``calc_value`` returns.
    """
    task = current_task()
    outputs_dir = task.get_target("outputs", config=TargetConfig(folder=True))
    outputs_dir.mkdir()

    f1_path = os.path.join(str(outputs_dir), "f1")
    return calc_value(output_path=f1_path)
def __init__(self, task_run):
    """Set up the log file targets for a single task run attempt.

    :param task_run: the task run whose attempt number names the log files.
    """
    super(TaskRunLogManager, self).__init__(task_run)

    attempt_number = task_run.attempt_number
    local_root = self.task_run.local_task_run_root

    # local log of this attempt: "<attempt>.log" under the local task run root
    self.local_log_file = local_root.partition(
        name="%s.log" % attempt_number
    )  # type: FileTarget

    # a separate spark log exists only when DBND__LOG_SPARK is enabled
    self.local_spark_log_file = (
        local_root.partition(name="%s-spark.log" % attempt_number)
        if environ_enabled("DBND__LOG_SPARK")
        else None
    )  # type: FileTarget

    # a remote log file is defined only for non-local task environments
    if isinstance(self.task.task_env, LocalEnvConfig):
        self.remote_log_file = None
    else:
        self.remote_log_file = self.task_run.attempt_folder.partition(
            name=str(attempt_number),
            config=TargetConfig().as_file().txt,
            extension=".log",
        )  # type: FileTarget

    # file handler for task log
    # if set -> we are in the context of capturing
    self._log_task_run_into_file_active = False
def my_second_target(self, pandas_data_frame):
    """Write ``pandas_data_frame`` into the "other_file.parquet" target and return that target.

    The target is created through ``self.target`` with a
    ``PseudoLocalFileSystem`` and a config that requires local access.

    :param pandas_data_frame: the data frame passed to ``write_df``.
    :return: the target the data frame was written to.
    """
    target_options = dict(
        fs=PseudoLocalFileSystem(),
        config=TargetConfig().with_require_local_access(),
    )
    parquet_target = self.target("other_file.parquet", **target_options)
    parquet_target.write_df(pandas_data_frame)
    return parquet_target
def __init__(self, targets, properties=None, source=None):
    """Wrap a collection of targets and derive a shared config for them.

    :param targets: the wrapped targets; items without a ``config``
        attribute are ignored when the shared config is derived.
    :param properties: forwarded to the parent constructor.
    :param source: forwarded to the parent constructor.

    ``self.config`` is the config shared by every item that has one; when
    there is no such item, or the items disagree, a default
    ``TargetConfig()`` is used.
    """
    super(MultiTarget, self).__init__(properties=properties, source=source)
    self._targets = targets

    # Keep the configs in target order, so the shared one is taken from the
    # first item that really has a ``config`` attribute. Reading it from
    # ``targets[0]`` fails when the first item has no config.
    configs = [t.config for t in (targets or ()) if hasattr(t, "config")]
    if configs and len(set(configs)) == 1:
        self.config = configs[0]
    else:
        self.config = TargetConfig()
def get_multi_target_from_config_ids(config_ids):
    """Combine the ``configID=<id>`` targets under ``base_target`` into one result.

    :param config_ids: iterable of config ids; one target is built per id.
    :return: the result of ``data_combine`` over the per-id targets.
    """
    # multi-target config is based on the first target in the list
    # we use csv format, as spark doesn't have explicit tsv support
    folder_csv_config = TargetConfig(folder=True, format=FileFormat.csv, flag=None)

    per_id_targets = [
        target(base_target, f"configID={config_id}", config=folder_csv_config)
        for config_id in config_ids
    ]
    return data_combine(per_id_targets)
def target(self, name, config=None, output_ext=None, output_mode=None):
    """Build a target for ``name`` at a path calculated for ``self.task``.

    :param name: name passed to ``calculate_path``.
    :param config: target config; a default ``TargetConfig()`` is used when falsy.
    :param output_ext: output extension passed to ``calculate_path``.
    :param output_mode: selects the task's output path format.
    :return: the target created for the calculated path.
    """
    owner = self.task
    if not config:
        config = TargetConfig()

    pattern = owner._get_task_output_path_format(output_mode)
    output_path = calculate_path(
        task=owner,
        name=name,
        output_ext=output_ext,
        is_dir=config.folder,
        path_pattern=pattern,
    )
    # ``target`` here is the module-level factory, not this method
    return target(output_path, config=config)
def get_target(self, name, config=None, output_ext=None, output_mode=None):
    """Build a target for ``name`` at a path calculated for this task.

    :param name: target name; when falsy, a random ``tmp/dbnd-tmp-<9 digits>``
        name is generated.
    :param config: target config; a default ``TargetConfig()`` is used when falsy.
    :param output_ext: output extension passed to ``calculate_path``.
    :param output_mode: selects the task's output path format.
    :return: the target created for the calculated path.
    """
    if not name:
        name = "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
    if not config:
        config = TargetConfig()

    pattern = self._get_task_output_path_format(output_mode)
    output_path = calculate_path(
        task=self,
        name=name,
        output_ext=output_ext,
        is_dir=config.folder,
        path_pattern=pattern,
    )
    return target(output_path, config=config)
def __init__(self, obj=_NOTHING, path=None, value_type=None, **kwargs):
    # type: (Any, str, ValueType, **Any) -> None
    """Create an in-memory target holding ``obj`` and/or identified by ``path``.

    :param obj: the wrapped object; ``_NOTHING`` means "not provided".
    :param path: explicit path; when falsy, a ``memory://`` path is built
        from the value type and the object's signature.
    :param value_type: value type of ``obj``; detected from ``obj`` when falsy.
    :param kwargs: forwarded to the parent constructor.
    :raises Exception: when neither an object nor a path is given.
    """
    from targets.values import get_value_type_of_obj

    super(InMemoryTarget, self).__init__(**kwargs)
    self._obj = obj

    if not path and self._obj is _NOTHING:
        raise Exception("InMemoryTarget requires object or path")

    if value_type:
        self.value_type = value_type
    else:
        self.value_type = get_value_type_of_obj(self._obj)

    if path:
        self.path = path
    else:
        obj_signature = self.value_type.to_signature(self._obj)
        self.path = "memory://%s:%s" % (self.value_type, obj_signature)

    self.config = TargetConfig()
def _target_config(self):
    """Return the parameter's target config, or a default ``TargetConfig()`` when it is falsy."""
    configured = self.parameter.target_config
    if configured:
        return configured
    return TargetConfig()
class ParameterDefinition(object):  # generics are broken: typing.Generic[T]
    """
    Parameter whose value is a ``str``, and a base class for other parameter types.

    Parameters are objects set on the Task class level to make it possible to parameterize tasks.
    For instance::

        class MyTask(dbnd.Task):
            foo = databand.parameter[str]

        class RequiringTask(dbnd.Task):
            def requires(self):
                return MyTask(foo="hello")

            def run(self):
                print(self.requires().foo)  # prints "hello"

    This makes it possible to instantiate multiple tasks, eg ``MyTask(foo='bar')`` and
    ``MyTask(foo='baz')``. The task will then have the ``foo`` attribute set
    appropriately.

    When a task is instantiated, it will first use any argument as the value of the
    parameter, eg. if you instantiate ``a = TaskA(x=44)`` then ``a.x == 44``.
    When the value is not provided, the value will be resolved in this order of
    falling priority:

        * Any value provided on the command line:

          - To the root task (eg. ``--param xyz``)

          - Then to the class, using the qualified task name syntax (eg. ``--TaskA-param xyz``).

        * With ``[TASK_NAME]>PARAM_NAME: <serialized value>`` syntax.

        * Any default value set using the ``default`` flag.

    The fields below are declared with ``attr.ib``; their declaration order is
    part of the interface and must not be changed.
    """

    default_description = ""
    default_input_description = "data input"
    default_output_description = "data output"

    # non-atomically increasing counter used for ordering parameters.
    _total_counter = 0

    default = attr.ib(default=NOTHING)

    name = attr.ib(default=None)

    # value type and sub type
    value_type = attr.ib(default=None)  # type: ValueType
    value_type_defined = attr.ib(default=None)  # type: ValueType
    sub_type = attr.ib(default=None)

    description = attr.ib(default=NOTHING)  # type: str
    config_path = attr.ib(default=None)  # type: Optional[ConfigPath]
    disable_jinja_templating = attr.ib(default=False)  # type: bool
    require_local_access = attr.ib(default=False)  # type: bool
    env_interpolation = attr.ib(default=True)

    # parameter kind
    significant = attr.ib(default=True)  # type: bool
    scope = attr.ib(default=ParameterScope.task)
    from_task_env_config = attr.ib(default=False)
    system = attr.ib(default=False)
    kind = attr.ib(default=_ParameterKind.task_input)

    # output configuration
    output_name = attr.ib(default=None)  # type: str
    output_ext = attr.ib(default=None)  # type: str
    output_mode = attr.ib(default=OutputMode.regular)
    # used for output factories only
    output_factory = attr.ib(default=None)

    target_config = attr.ib(default=TargetConfig())
    load_options = attr.ib(factory=dict)  # type: Dict[FileFormat, Dict[str, Any]]
    save_options = attr.ib(factory=dict)  # type: Dict[FileFormat, Dict[str, Any]]

    validator = attr.ib(default=None)
    choices = attr.ib(default=None)

    load_on_build = attr.ib(default=NOTHING)  # type: bool
    empty_default = attr.ib(default=NOTHING)

    # value preview and meta settings
    log_preview = attr.ib(default=None)  # type: Optional[bool]
    log_preview_size = attr.ib(default=None)  # type: Optional[int]
    log_schema = attr.ib(default=None)  # type: Optional[bool]
    log_size = attr.ib(default=None)  # type: Optional[bool]
    log_stats = attr.ib(default=None)  # type: Optional[bool]
    log_histograms = attr.ib(default=None)  # type: Optional[bool]

    # log all (can disable whole value log)
    log_meta = attr.ib(default=True)  # type: bool

    # ParameterDefinition ownership
    task_definition = attr.ib(default=None)  # type: TaskDefinition
    parameter_origin = attr.ib(default=None)
    parameter_id = attr.ib(default=1)

    value_meta_conf = attr.ib(default=None)  # type: ValueMetaConf
    hidden = attr.ib(default=False)  # type: bool

    @property
    def group(self):
        """``ParameterGroup.system`` for system parameters, ``ParameterGroup.user`` otherwise."""
        return ParameterGroup.system if self.system else ParameterGroup.user

    @property
    def task_definition_uid(self):
        """UID of the owning task definition, or ``None`` when there is no owner."""
        if not self.task_definition:
            return None
        return self.task_definition.task_definition_uid

    def evolve_with_owner(self, task_definition, name):
        """
        Return a copy of this parameter owned by ``task_definition`` under ``name``.

        A warning is logged when the parameter already has an owner and is being
        re-registered under a different name. ``parameter_origin`` keeps its
        current value when set, otherwise it becomes ``task_definition``.

        :param task_definition: the new owner of the parameter.
        :param name: the name of the parameter at the new owner.
        :return: the evolved copy.
        """
        if self.task_definition and self.name != name:
            # old name first, new name second -- matches the "from ... to ..." wording
            logger.warning(
                "Name of parameter has been changed from '%s' to '%s' at %s",
                self.name,
                name,
                task_definition,
            )
        parameter_origin = self.parameter_origin or task_definition
        return attr.evolve(
            self,
            task_definition=task_definition,
            name=name,
            parameter_origin=parameter_origin,
        )

    def parse_from_str(self, x):  # type: (str) -> T
        """
        Parse an individual value from the input.

        :param str x: the value to parse.
        :return: the parsed value.
        """
        return self.calc_init_value(x)

    def calc_init_value(self, value):
        """
        Convert a raw value into the value stored on the task at build time.

        ``None`` and ``Target`` values are returned as is, a ``Path`` becomes a
        target built with this parameter's ``target_config``, and everything
        else is parsed by the value type (strings get environment variables
        expanded first when ``env_interpolation`` is on).

        :param value: the raw value.
        :return: the initial value of the parameter.
        """
        if value is None:
            # it's None
            # TODO: may be we still can "normalize" the value
            return value
        if isinstance(value, Path):
            return target(str(value), config=self.target_config)
        if isinstance(value, Target):
            # it's deferred result - > we load it lately
            return value

        # we process value regardless parse!
        # cf_value.require_parse:

        if self.env_interpolation and isinstance(value, six.string_types):
            try:
                value = expand_env_var(value)
            except Exception as ex:
                # best effort: keep the original string when expansion fails
                logger.warning(
                    "failed to expand variable '%s' : %s", safe_string(value), str(ex)
                )

        # in case we are output and have value:
        # it's Target or it's str to be converted as target
        load_value = self.load_on_build and not self.is_output()

        return self.value_type.parse_value(
            value, load_value=load_value, target_config=self.target_config
        )

    def calc_runtime_value(self, value, task):
        """
        Convert the stored parameter value into the value the task sees at run time.

        :param value: the stored value (may be a ``Target`` or a structure of targets).
        :param task: the task the value is calculated for.
        :return: the runtime value.
        :raises: the ``friendly_error`` wrappers when loading a target fails.
        """
        if value is None:
            return value

        if isinstance(self.value_type, _TargetValueType):
            # if it "target" type, let read it into "user friendly" format
            # regardless it's input or output,
            # so if function has  param = output[Path] - it will get Path
            return traverse(value, self.value_type.target_to_value)

        # usually we should not load "outputs" on read
        if self.is_output():
            # actually we should not load it, so just return
            return value

        if isinstance(value, Target):
            try:
                runtime_value = self.load_from_target(value)
                if self.is_input():
                    self._log_parameter_value(runtime_value, value, task)
                return runtime_value
            except Exception as ex:
                raise friendly_error.failed_to_read_target_as_task_input(
                    ex=ex, task=task, parameter=self, target=value
                )

        if (
            isinstance(self.value_type, _StructureValueType)
            and self.value_type.sub_value_type
        ):
            try:

                def load_with_preview(val):
                    runtime_val = self.value_type.sub_value_type.load_runtime(val)
                    if self.is_input() and isinstance(val, Target):
                        # Optimisation opportunity: log all targets in a single call
                        self._log_parameter_value(runtime_val, val, task)

                    return runtime_val

                return traverse(value, convert_f=load_with_preview)
            except Exception as ex:
                raise friendly_error.failed_to_read_task_input(
                    ex=ex, task=task, parameter=self, target=value
                )

        return value

    def to_str(self, x):  # type: (T) -> str
        """
        Opposite of :py:meth:`parse_from_str`.

        Converts the value ``x`` to a string.

        :param x: the value to serialize.
        """
        if isinstance(x, Target):
            return str(x)

        return self.value_type.to_str(x)  # default impl

    def to_repr(self, x):  # type: (T) -> str
        """Return the value type's ``repr``-style string for ``x``."""
        return self.value_type.to_repr(x)

    def signature(self, x):
        """Return the signature string of ``x``; ``None`` and targets use ``str(x)``."""
        if x is None:
            return str(x)
        if isinstance(x, Target):
            return str(x)

        # we can have
        # 1. a value of value_type
        # 2. target with value type TargetValueType
        # 3. list/dict of targets with value type TargetValueType
        return self.value_type.to_signature(x)

    def load_from_target(
        self, target, **kwargs
    ):  # type: (ParameterDefinition, FileTarget, **Any)-> T
        """
        Load, validate and return the value stored in ``target``.

        For non in-memory targets, ``load_options`` registered for the target's
        format are merged into ``kwargs`` before loading. The loaded value is
        passed to :py:meth:`validate` and registered with its origin target.
        """
        from targets import InMemoryTarget

        if isinstance(target, InMemoryTarget):
            value = target.load()
        else:
            if target.config:
                f = target.config.format
                if f and f in self.load_options:
                    kwargs.update(**self.load_options[f])
            value = self.value_type.load_from_target(target, **kwargs)

        self.validate(value)

        self._store_value_origin_target(value, target)
        return value

    def dump_to_target(
        self, target, value, **kwargs
    ):  # type: (DataTarget, T, **Any)-> None
        """
        Save ``value`` into ``target`` using the value type.

        ``save_options`` registered for the target's format are merged into
        ``kwargs`` before saving; the value is then registered with its origin
        target.
        """
        # a target may have no ``config`` attribute at all, or have it set to None
        config = getattr(target, "config", None)
        if config is not None:
            f = config.format
            if f and f in self.save_options:
                kwargs.update(**self.save_options[f])

        self.value_type.save_to_target(target, value, **kwargs)  # default impl

        # we need updated target
        self._store_value_origin_target(value, target)

    def _log_parameter_value(self, runtime_value, value, task):
        """Report a successful read of ``value`` to the tracker of the current task run, if any."""
        if try_get_databand_run() and task.current_task_run:
            task.current_task_run.tracker.log_parameter_data(
                parameter=self,
                target=value,
                value=runtime_value,
                operation_type=DbndTargetOperationType.read,
                operation_status=DbndTargetOperationStatus.OK,
            )

    def _store_value_origin_target(self, value, target):
        """Register ``target`` as the origin of ``value`` in the current databand run, if any."""
        dbnd_run = try_get_databand_run()
        if not dbnd_run:
            return

        dbnd_run.target_origin.add(target, value, self.value_type)

    def normalize(self, x):  # type: (T) -> T
        """
        Given a parsed parameter value, normalizes it.

        The value can either be the result of parse(), the default value or
        arguments passed into the task's constructor by instantiation.

        This is very implementation defined, but can be used to validate/clamp
        valid values. For example, if you wanted to only accept even integers,
        and "correct" odd values to the nearest integer, you can implement
        normalize as ``x // 2 * 2``.
        """
        if isinstance(self.value_type, _TargetValueType):
            # can not move to value_type, we need target_config
            from dbnd._core.utils.task_utils import to_targets

            return to_targets(x, from_string_kwargs=dict(config=self.target_config))
        return self.value_type.normalize(x)

    def validate(self, x):
        """Run the configured validator (if any) against ``x``."""
        if self.validator:
            self.validator.validate(self, x)

    def as_str_input(self, value):
        """
        Serialize ``value`` into the string form used as an input.

        ``None`` becomes ``"@none"``; a ``Target`` of a parameter that is loaded
        on build is prefixed with ``"@target:"``.
        """
        if value is None:
            return "@none"

        switch_value = self.to_str(value)
        if isinstance(value, Target) and self.load_on_build:
            # this is non-data parameter, it's int/str/bool
            # we are in the scenario, when something should be loaded, however, it's still Target
            switch_value = "@target:%s" % switch_value
        return switch_value

    def next_in_enumeration(self, value):
        """
        This method is used by the :py:mod:`databand.execution_summary` module for pretty printing purposes.

        If your Parameter type has an enumerable ordering of values. You can
        choose to override this method. Enable it to pretty print tasks like
        ``MyTask(num=1), MyTask(num=2), MyTask(num=3)`` to ``MyTask(num=1..3)``.

        :param value: The value
        :return: The next value, like "value + 1". Or ``None`` if there's no enumerable ordering.
        """
        return self.value_type.next_in_enumeration(value)  # default impl

    def _get_help_message(self, sections=None):
        """
        Build the help text listing the ways this parameter's value can be set.

        :param sections: config section names; defaults to the task family.
        :return: the help message.
        """
        sections = sections or [(self.task_family)]
        # sections may hold None (a parameter without an owner has no task
        # family); stringify so that building the help text never fails
        sections_str = " | ".join(str(section) for section in sections)

        define_via = [
            "project.cfg : [%s]%s=VALUE" % (sections_str, self.name),
            "cli: --set %s.%s=VALUE" % (self.task_family, self.name),
            "constructor: %s(%s=VALUE, ...)" % (self.task_family, self.name),
        ]
        define_via = "\n".join(["\t* %s" % l for l in define_via])

        return "You can change '{task_family}.{name}' value using one of the following methods: \n {methods}".format(
            task_family=(self.task_family), name=self.name, methods=define_via
        )

    def parameter_exception(self, reason, ex):
        """
        Log ``ex`` and raise a ``DatabandBuildError`` describing the failure.

        :param reason: what failed, inserted into "Failed to <reason> for parameter ...".
        :param ex: the original exception, attached as a nested exception.
        :raises DatabandBuildError: always.
        """
        err_msg = "Failed to {reason} for parameter '{name}' at {task_family}()".format(
            reason=reason, name=self.name, task_family=self.task_family
        )
        log_exception(err_msg, ex, logger)
        raise DatabandBuildError(
            err_msg, nested_exceptions=[ex], help_msg=self._get_help_message()
        )

    @property
    def task_family(self):
        """Task family of the owning task definition, or ``None`` when there is no owner."""
        if self.task_definition:
            return self.task_definition.task_family
        return None

    @property
    def task_config_section(self):
        """Config section of the owning task definition, or ``None`` when there is no owner."""
        if self.task_definition:
            return self.task_definition.task_config_section
        return None

    def __repr__(self):
        owned_by = ""
        parameter_origin = ""  # show it only if different

        if self.task_definition:
            owned_by = self.task_definition.task_family
            origin_cls_str = (
                self.parameter_origin.task_family if self.parameter_origin else ""
            )
            if origin_cls_str and origin_cls_str != owned_by:
                parameter_origin = " at %s" % origin_cls_str

        parameter_kind = (
            "output" if self.kind == _ParameterKind.task_output else "parameter"
        )
        return "{owned_by}.{name}({parameter_kind}[{value_type}]{parameter_origin})".format(
            owned_by=owned_by,
            parameter_origin=parameter_origin,
            value_type=self.value_type_str,
            parameter_kind=parameter_kind,
            name=self.name or "_unknown_",
        )

    @property
    def value_type_str(self):
        """
        Short type label used in ``repr``.

        ``"unknown"`` without a value type; otherwise the value type's
        ``type_str``, prefixed with ``"!"`` for inline value types and with
        ``"*"`` when the value type differs from the defined one.
        """
        if self.value_type is None:
            return "unknown"
        type_handler = self.value_type.type_str
        if isinstance(self.value_type, InlineValueType):
            type_handler = "!" + type_handler
        if self.value_type_defined != self.value_type:
            type_handler = "*" + type_handler
        return type_handler

    def _target_source(self, task):
        """Return a ``TargetSource`` pointing at this parameter of ``task``."""
        return TargetSource(
            task_id=task.task_id, parameter_name=self.name, name=self.name
        )

    def build_target(self, task):  # type: (ParameterDefinition, Task) -> DataTarget
        """
        Build the output target of this parameter for ``task``.

        When the parameter's target config has no format, format and
        compression are taken from the task's output settings for this value
        type. The extension defaults to the one implied by the target config.
        """
        target_config = self.target_config
        if not target_config.format:
            default_config = task.settings.output.get_value_target_config(
                self.value_type
            )
            # for now we take only format and compression from config
            target_config = target_config.with_format(
                default_config.format
            ).with_compression(default_config.compression)
        output_ext = self.output_ext
        if output_ext is None:
            output_ext = target_config.get_ext()
        return task.get_target(
            name=self.output_name or self.name,
            output_ext=output_ext,
            config=target_config,
            output_mode=self.output_mode,
        )

    def build_output(self, task):
        """
        Build the output object of this parameter for ``task``.

        Uses ``output_factory`` when defined, an ``InMemoryTarget`` for
        non-system parameters of tasks with in-memory outputs (except
        ``task_band``), and :py:meth:`build_target` otherwise.
        """
        if self.output_factory is not None:
            try:
                return self.output_factory(task, self)
            except Exception:
                logger.exception(
                    "Failed to created task output %s for %s : "
                    " output_factory expected signature is '(Task, Parameter) -> Target(any structure) '",
                    self,
                    task,
                )
                raise

        if (
            not self.system
            and self.name not in ("task_band",)
            and task.task_in_memory_outputs
        ):
            return InMemoryTarget(
                path="memory://{value_type}:{task}.{p_name}".format(
                    value_type=self.value_type, task=task.task_id, p_name=self.name
                )
            )

        # we separate into two functions ,
        # as we want to be able to call build_target from output_factory implementation
        try:
            return self.build_target(task)
        except Exception as e:
            raise friendly_error.task_build.failed_to_build_output_target(
                self.name, task, e
            )

    def is_input(self):
        """``True`` when the parameter kind is ``task_input``."""
        return self.kind == _ParameterKind.task_input

    def is_output(self):
        """``True`` when the parameter kind is ``task_output``."""
        return self.kind == _ParameterKind.task_output

    def __hash__(self):
        return hash(self.name) ^ hash(self.task_definition)

    def modify(self, **kwargs):
        """Return a copy with ``kwargs`` applied, or ``self`` when there is nothing to change."""
        if not kwargs:
            return self
        return attr.evolve(self, **kwargs)

    def get_env_key(self, section):
        """Return the environment variable key of this parameter within ``section``."""
        return PARAM_ENV_TEMPLATE.format(S=section.upper(), K=self.name.upper())

    def get_value_meta(self, value, meta_conf):
        """Return the value type's meta information for ``value``."""
        # do not use meta_conf directly, you should get it merged with main config
        return self.value_type.get_value_meta(value, meta_conf=meta_conf)

    def update_value_meta_conf_from_runtime_value(self, value, tracking_config):
        # type: (Any, TrackingConfig) -> ParameterDefinition
        """
        Return a copy whose ``value_meta_conf`` is resolved by ``tracking_config``.

        The value type is detected from the runtime ``value``, with a plain
        ``ValueType()`` passed as the second argument of the detection.
        """
        return self.modify(
            value_meta_conf=tracking_config.get_value_meta_conf(
                self.value_meta_conf, get_value_type_of_obj(value, ValueType())
            )
        )
# 1. create file extension
z_file_ext = register_file_extension("z")


class JoblibSizedMessageMarshaller(Marshaller):
    """Marshaller that stores values with joblib, addressing the file by name."""

    def target_to_value(self, target, **kwargs):
        """Load and return the joblib dump stored in ``target``."""
        with target.open() as handle:
            # joblib is given the file name of the opened handle, not the handle itself
            return joblib.load(handle.name)

    def value_to_target(self, value, target, **kwargs):
        """Dump ``value`` with joblib into the file behind ``target``."""
        with target.open("w") as handle:
            joblib.dump(value, handle.name)


# 2. register type to extension mapping
register_marshaller(SizedMessage, z_file_ext, JoblibSizedMessageMarshaller())


# task whose result is written with the "z" file format registered above
@task(result=output.target_config(TargetConfig(format=z_file_ext)))
def dump_as_joblib():
    # type: ()-> SizedMessage
    message = SizedMessage("example message \n", 10)
    return message


@task(result=output.txt[int])
def load_as_joblib(sized_message: SizedMessage):
    """Return the message's ``msg`` multiplied by its ``size``."""
    msg, size = sized_message.msg, sized_message.size
    return msg * size