def task_to_targets(self, task, targets):
    # type: (Task, Dict[str, TargetInfo]) -> List[TargetInfo]
    """
    Collect TargetInfo records for every input/output target of `task`.

    :param task: the task whose relations (inputs and outputs) are traversed
    :param targets: all known targets for current run, so we have uniq list of
        targets (by path); NOTE: new targets are added to this dict in place,
        so repeated calls across tasks share one registry
    :return: list of TargetInfo objects referenced by this task (may contain
        entries created by earlier calls for the same path)
    """
    run = self.run
    task_targets = []

    def process_target(target, name):
        # type: (Target, str) -> None
        # `name` is the parameter name this target was bound to
        target_path = str(target)
        dbnd_target = targets.get(target_path)
        if not dbnd_target:
            # we see this target for the first time
            target_task_run_uid = (
                None  # let assume that Target is now owned by any task
            )
            # let try to find it's owner, so we create target that relates to some Task
            # if `task` is pipeline, the target owner is going to be different task
            if target.task:
                target_task_run = run.get_task_run(target.task.task_id)
                if target_task_run:
                    target_task_run_uid = target_task_run.task_run_uid

            dbnd_target = targets[target_path] = TargetInfo(
                path=target_path,
                created_date=utcnow(),
                task_run_uid=target_task_run_uid,
                parameter_name=name,
            )
            logger.debug(
                "New Target: %s -> %s -> %s",
                target.task,
                target_task_run_uid,
                target_path,
            )
        task_targets.append(dbnd_target)

    rels = task.ctrl.relations
    # walk outputs first, then inputs; each io_params is a mapping of
    # parameter name -> (possibly nested) target structure
    for io_params in chain(rels.task_outputs.values(), rels.task_inputs.values()):
        for name, t in io_params.items():
            traverse(t, convert_f=partial(process_target, name=name))
    return task_targets
def calc_runtime_value(self, value, task):
    """
    Convert a stored parameter `value` into the form the user code receives
    at runtime, loading data from targets where required.

    Outputs are returned as-is (never loaded); inputs backed by a Target are
    loaded and the loaded value is logged for previewing.

    :param value: raw parameter value (may be None, a Target, or a structure)
    :param task: owning task, used for error reporting and value logging
    :raises: a `friendly_error` wrapper when loading an input target fails
    """
    if value is None:
        return value

    if isinstance(self.value_type, _TargetValueType):
        # if it "target" type, let read it into "user friendly" format
        # regardless it's input or output,
        # so if function has param = output[Path] - it will get Path
        return traverse(value, self.value_type.target_to_value)

    # usually we should not load "outputs" on read
    if self.is_output():
        # actually we should not load it, so just return
        return value

    if isinstance(value, Target):
        try:
            runtime_value = self.load_from_target(value)
            if self.is_input():
                self._log_parameter_value(runtime_value, value, task)
            return runtime_value
        except Exception as ex:
            # re-raise as a user-friendly error that names the task/parameter
            raise friendly_error.failed_to_read_target_as_task_input(
                ex=ex, task=task, parameter=self, target=value
            )

    if (
        isinstance(self.value_type, _StructureValueType)
        and self.value_type.sub_value_type
    ):
        # structured value (list/dict of targets): load each element
        try:

            def load_with_preview(val):
                runtime_val = self.value_type.sub_value_type.load_runtime(val)
                if self.is_input() and isinstance(val, Target):
                    # Optimisation opportunity: log all targets in a single call
                    self._log_parameter_value(runtime_val, val, task)
                return runtime_val

            return traverse(value, convert_f=load_with_preview)
        except Exception as ex:
            raise friendly_error.failed_to_read_task_input(
                ex=ex, task=task, parameter=self, target=value
            )

    return value
def initialize_relations(self):
    """
    Run the task's band, resolve its inputs, then (re)compute the task id
    signature and finally its outputs — the order of these steps is
    significant: the signature depends on the calculated inputs.
    """
    # STEP 0 - run band function
    self.initialize_band()
    # STEP 1 - calculate all inputs and _required
    self.initialize_required()
    # STEP 2 ( now we have all inputs, we can calculate real signature)
    # support for two phase build
    # will be called from MetaClass
    params = self.params.get_params_serialized(significant_only=True, input_only=True)
    # stringify the (possibly nested) inputs so they can participate
    # in the signature; empty/None entries are dropped
    task_inputs_as_str = traverse(self.task_inputs, convert_f=str, filter_none=True, filter_empty=True)
    params.append(("_task_inputs", task_inputs_as_str))
    # we do it again, now we have all inputs calculated
    self.task_meta.initialize_task_id(params)

    # for airflow operator task handling:
    # NOTE(review): presumably overrides the computed id with the airflow
    # operator's id when running under airflow — confirm with callers
    airflow_task_id_p = self.params.get_param("airflow_task_id")
    if airflow_task_id_p:
        self.task_meta.task_id = self.task.airflow_task_id

    # STEP 3 - now let update outputs
    self.initialize_outputs()
    self.task_meta.initialize_task_output_id(self._get_outputs_to_sign())
def to_str(self, x):
    """
    Serialize a set-like value to a canonical JSON string.

    Elements are first stringified with the sub value type (when defined),
    then sorted by their canonical JSON representation so equal sets always
    serialize identically regardless of iteration order.
    """
    if self.sub_value_type:
        x = traverse(x, self.sub_value_type.to_str)
    # we sort the set before we serialize!
    # pass the canonical dumper directly as the key: the original wrapped it
    # in a lambda that also shadowed the parameter name `x`
    x = sorted(x, key=json_utils.dumps_canonical)
    return json_utils.dumps_canonical(x)
def initialize_relations(self):
    """
    Run the band, resolve inputs (tolerating failures for dynamic tasks),
    then compute the task id signature from significant inputs and finally
    initialize outputs. Step order is significant.
    """
    # STEP 0 - run band function
    self.initialize_band()
    # STEP 1 - calculate all inputs and _required
    try:
        self.task_inputs = self.initialize_required()
    except Exception:
        # for dynamic tasks a relationship failure is survivable: we log,
        # fall back to empty inputs and continue; otherwise re-raise
        logger.warning("Failed to calculate relationships for %s" % self.task_id, exc_info=True)
        self.task_inputs = {}
        if not self.task.task_is_dynamic:
            raise
    # STEP 2 ( now we have all inputs, we can calculate real signature)
    # support for two phase build
    # will be called from MetaClass
    params = self.params.get_params_serialized(significant_only=True, input_only=True)
    if "user" in self.task_inputs:
        # TODO : why do we need to convert all "user side" inputs?
        # what if the input is insignificant?
        # keep the system "band" inputs in the signature alongside user inputs
        system_input = self.task_inputs.get("system")
        if system_input and "band" in system_input:
            band_input = system_input["band"]
            task_inputs_user_only = {
                "user": self.task_inputs.get("user"),
                "system": {
                    "band": band_input
                },
            }
        else:
            task_inputs_user_only = {"user": self.task_inputs.get("user")}
        task_inputs_as_str = traverse(
            task_inputs_user_only,
            convert_f=str,
            filter_none=True,
            filter_empty=True,
        )
        # traverse may collapse an all-empty structure to None
        if task_inputs_as_str is None:
            task_inputs_as_str = ""
        params.append(("_task_inputs", task_inputs_as_str))
    # we do it again, now we have all inputs calculated
    self.task_meta.initialize_task_id(params)

    # for airflow operator task handling:
    airflow_task_id_p = self.params.get_param("airflow_task_id")
    if airflow_task_id_p:
        self.task_meta.task_id = self.task.airflow_task_id

    # STEP 3 - now let update outputs
    self.initialize_outputs()
    self.task_meta.initialize_task_output_id(self._get_outputs_to_sign())
def f_io(self, structure):
    """Render a task IO structure as pretty-printed JSON, truncating very long entries."""

    def _shorten(text):
        # short (or empty/None) values pass through untouched;
        # anything longer than 600 chars is truncated with a file count
        if not text or len(text) <= 600:
            return text
        return "%s... (%s files)" % (text[:400], len(text.split(",")))

    as_str = traverse_to_str(structure)
    as_str = traverse(as_str, _shorten)
    return json_utils.dumps(as_str, indent=2)
def f_io(self, structure):
    """
    Render a task IO structure as pretty-printed JSON, truncating very long
    entries; dict structures get an extra regex cleanup pass.
    """
    structure_str = traverse_to_str(structure)
    structure_str = traverse(
        structure_str,
        lambda x: x
        if not x or len(x) <= 600
        else ("%s... (%s files)" % (x[:400], len(x.split(",")))),
    )
    dumped = json_utils.dumps(structure_str, indent=2)
    if isinstance(structure_str, dict):
        # FIX: use a raw string for the group reference — "\g" is an invalid
        # escape sequence and emits a SyntaxWarning on modern Python
        # (the resulting string value is unchanged)
        dumped = self._hjson_optimizer.sub(r"\g<1>", dumped)
    return dumped
def initialize_required(self):
    """
    Build the task's input requirements: all input-parameter targets,
    plus (for pipelines) their declared outputs, child pipelines and any
    explicit `_requires()` dependencies. Returns the structure converted
    to targets via `to_targets`.
    """
    # regular requirements -- just all inputs
    inputs = {"user": {}, "system": {}}
    # we take all parameters that are inputs (not outputs)
    # however Primitive parameters are inputs only if they are Target (deferred)
    # if isinstance(p, _TargetParameter) or isinstance(value, Target)
    for p, value in self.params.get_param_values(input_only=True):
        if value is None:
            continue
        # keep only the Target parts of (possibly nested) values
        value = traverse(value, convert_f=_find_target, filter_none=True, filter_empty=True)
        if not value:
            continue
        inputs[_section(p)][p.name] = value

    def _extend_system_section(key, extra):
        # add `extra` under inputs["system"][key], skipping empty values
        if not extra:
            return
        inputs["system"][key] = extra

    # imported locally — presumably to avoid a circular import at module load
    from dbnd import PipelineTask

    if isinstance(self.task, PipelineTask):
        # a pipeline "requires" all of its declared outputs to be produced
        task_output_values = {}
        for p, value in self.params.get_param_values(output_only=True, user_only=True):
            if p.name == "task_band" or isinstance(p, FuncResultParameter):
                continue
            if is_not_defined(value):
                raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                    task=self.task, param=p)
            task_output_values[p.name] = value
        _extend_system_section("band", task_output_values)

    # find all child pipelines and make them upstreams to the task
    _extend_system_section(
        "pipelines", {p.task_id: p for p in self._get_all_child_pipelines()})
    # now may be user still use function _requires - so let add that to dependencies
    _extend_system_section("required", self.task._requires())

    return to_targets(inputs)
def initialize_required(self):
    """
    Build the task's input requirements: all input-parameter targets,
    plus (for pipelines) their declared band outputs, child pipelines and
    any explicit `_requires()` dependencies. Returns the structure
    converted to targets via `to_targets`.
    """
    # regular requirements -- just all inputs
    inputs = {"user": {}, "system": {}}
    # we take all parameters that are inputs (not outputs)
    # however Primitive parameters are inputs only if they are Target (deferred)
    # if isinstance(p, _TargetParameter) or isinstance(value, Target)
    for p, value in self.params.get_params_with_value(
            ParameterFilters.INPUTS):
        if value is None:
            continue
        # keep only the Target parts of (possibly nested) values
        value = traverse(value, convert_f=_find_target, filter_none=True, filter_empty=True)
        if not value:
            continue
        inputs[_section(p)][p.name] = value

    def _extend_system_section(key, extra):
        # add `extra` under inputs["system"][key], skipping empty values
        if not extra:
            return
        inputs["system"][key] = extra

    # imported locally — presumably to avoid a circular import at module load
    from dbnd import PipelineTask

    if isinstance(self.task, PipelineTask):
        task_output_values = {}
        for p, value in self.params.get_params_with_value(
                ParameterFilters.USER_OUTPUTS):
            if p.name == "task_band" or isinstance(p, FuncResultParameter):
                continue
            # band outputs are going to be required as inputs!
            # @pipeline can run only when all of it's "outputs" are ready
            task_output_values[p.name] = value
        _extend_system_section("band", task_output_values)

    # find all child pipelines and make them upstreams to the task
    _extend_system_section(
        "pipelines", {p.task_id: p for p in self._get_all_child_pipelines()})
    # now may be user still use function _requires - so let add that to dependencies
    _extend_system_section("required", self.task._requires())

    return to_targets(inputs)
def calc_runtime_value(self, value, task):
    """
    Convert a stored parameter `value` into the form the user code receives
    at runtime, loading data from targets where required.

    Outputs are returned as-is (never loaded); input Targets are loaded;
    structured values are loaded element-wise via the sub value type.

    :param value: raw parameter value (may be None, a Target, or a structure)
    :param task: owning task, used for error reporting
    :raises: a `friendly_error` wrapper when loading an input target fails
    """
    if value is None:
        return value

    if isinstance(self.value_type, _TargetValueType):
        # if it "target" type, let read it into "user friendly" format
        # regardless it's input or output
        return traverse(value, self.value_type.target_to_value)

    # usually we should not load "outputs" on read
    if self.is_output():
        # actually we should not load it, so just return
        return value

    if isinstance(value, Target):
        try:
            return self.load_from_target(value)
        except Exception as ex:
            # re-raise as a user-friendly error naming the task/parameter
            raise friendly_error.failed_to_read_target_as_task_input(
                ex=ex, task=task, parameter=self, target=value
            )

    if (
        isinstance(self.value_type, _StructureValueType)
        and self.value_type.sub_value_type
    ):
        try:
            return traverse(
                value, convert_f=self.value_type.sub_value_type.load_runtime
            )
        except Exception as ex:
            raise friendly_error.failed_to_read_task_input(
                ex=ex, task=task, parameter=self, target=value
            )

    return value
def parse_value(self, value, load_value=None, target_config=None):
    """
    Parse a structured value.

    A raw string is delegated to the base-class parser (which parses the
    whole structure); an already-built structure has every element parsed
    by the sub value type, when one is defined.
    """
    if value is None:
        return value
    if isinstance(value, six.string_types):
        # a string represents the whole structure - base class handles it
        return super(_StructureValueType, self).parse_value(
            value=value, load_value=load_value, target_config=target_config
        )
    if self.sub_value_type:
        return traverse(struct=value, convert_f=self.sub_value_type.parse_value)
    return value
def parse_value(self, value, load_value=None, target_config=None, sub_value=False):
    """
    Parse a structured value.

    Only a top-level string is treated as the whole structure and handed to
    the base parser; elements are parsed recursively with ``sub_value=True``
    so nested strings are not re-parsed as full structures.
    """
    if value is None:
        return value

    if not sub_value and isinstance(value, six.string_types):
        # top-level string == the whole serialized structure
        return super(_StructureValueType, self).parse_value(
            value=value, load_value=load_value, target_config=target_config
        )

    if not self.sub_value_type:
        return value

    element_parser = partial(self.sub_value_type.parse_value, sub_value=True)
    return traverse(struct=value, convert_f=element_parser)
def parse_from_str(self, x):
    """
    Parses an immutable and ordered ``dict`` from a JSON string using standard
    JSON library.

    Parse an individual value from the input.

    :param x: string to parse; empty/blank input yields the empty default
    :raises DatabandConfigError: if `x` is not a string, or the parsed value
        does not match this parameter's type
    """
    if not x:
        return self._generate_empty_default()

    # this is string and we need to parse it
    if not isinstance(x, six.string_types):
        # FIX: the message previously used "%x" (hex-integer conversion),
        # which made the raise itself crash with TypeError for any non-int
        # value instead of reporting it; "%s" shows the offending value
        raise DatabandConfigError(
            "Can't parse '%s' into parameter. Value should be string" % (x,)
        )

    x = x.strip()
    if not x:
        return self._generate_empty_default()

    if x[0] in _PARSABLE_PARAM_PREFIX:
        # looks like JSON/structured syntax - parse with the json loader
        value = json_utils.loads(x)
    else:
        value = self._parse_from_str_simple(x)

    if not self.is_type_of(value):
        raise DatabandConfigError(
            "Can't parse '%s' into %s" % (value, self.type)
        )

    if self.sub_value_type:
        value = traverse(value, self.sub_value_type.parse_value)
    return value
def test_flattern_file_target(self):
    """Traversing a nested dict with filter_none keeps the nested target in place."""
    leaf = target("/tmp")
    struct = {"a": {"b": leaf}}

    result = traverse(struct, convert_f=_find_target, filter_none=True)

    assert result
    assert result["a"]["b"] == leaf
def to_str(self, x):
    """Serialize the structure to JSON, first stringifying each element
    with the sub value type when one is defined."""
    if not self.sub_value_type:
        return json_utils.dumps_safe(x)
    element_strs = traverse(x, self.sub_value_type.to_str)
    return json_utils.dumps_safe(element_strs)
def to_str_lines(self, x):
    """Return the structure with every element rendered as a string via the
    sub value type; without a sub value type, `x` is returned unchanged."""
    if not self.sub_value_type:
        return x
    return traverse(x, self.sub_value_type.to_str)
def traverse_and_set_target(target, target_source):
    """Attach `target_source` to every target found in the (possibly nested)
    `target` structure, returning the traversed result."""

    def _assign_source(t):
        return __set_target(target=t, target_source=target_source)

    return traverse(target, convert_f=_assign_source)
def parse_from_str_lines(self, lines):
    """Parse a sequence of text lines element-wise with the sub value type;
    if no sub value type is defined, return the lines unchanged."""
    if not self.sub_value_type:
        return lines
    return traverse(lines, self.sub_value_type.parse_from_str)
def initialize_relations(self):
    """
    Run the band, resolve inputs (tolerating failures for dynamic tasks),
    compute the task signature/id from significant inputs, then initialize
    outputs and their signature. Step order is significant: the task id
    depends on the calculated inputs.
    """
    # STEP 0 - run band function
    self.initialize_band()
    # STEP 1 - calculate all inputs and _required
    try:
        self.task_inputs = self.initialize_required()
    except Exception:
        # for dynamic tasks a relationship failure is survivable: we log,
        # fall back to empty inputs and continue; otherwise re-raise
        logger.warning("Failed to calculate relationships for %s" % self.task_id, exc_info=True)
        self.task_inputs = {}
        if not self.task.task_is_dynamic:
            raise
    # STEP 2 ( now we have all inputs, we can calculate real signature)
    # support for two phase build
    # will be called from MetaClass
    params = self.params.get_params_serialized(
        ParameterFilters.SIGNIFICANT_INPUTS)
    if "user" in self.task_inputs:
        # TODO : why do we need to convert all "user side" inputs?
        # what if the input is insignificant?
        # keep the system "band" inputs in the signature alongside user inputs
        system_input = self.task_inputs.get("system")
        if system_input and "band" in system_input:
            band_input = system_input["band"]
            task_inputs_user_only = {
                "user": self.task_inputs.get("user"),
                "system": {
                    "band": band_input
                },
            }
        else:
            task_inputs_user_only = {"user": self.task_inputs.get("user")}
        task_inputs_as_str = traverse(
            task_inputs_user_only,
            convert_f=str,
            filter_none=True,
            filter_empty=True,
        )
        # traverse may collapse an all-empty structure to None
        if task_inputs_as_str is None:
            task_inputs_as_str = ""
        params.append(("_task_inputs", task_inputs_as_str))
    # IMPORTANT PART: we initialize task_id here again
    # after all values are calculated (all task_inputs are assigned)
    # we do it again, now we have all inputs calculated
    task = self.task
    task.task_signature_obj = build_signature(
        name=task.task_name,
        params=params,
        extra=task.task_definition.task_signature_extra,
    )
    task.task_id = "{}__{}".format(task.task_name, task.task_signature_obj.signature)

    # for airflow operator task handling:
    airflow_task_id_p = self.params.get_param("airflow_task_id")
    if airflow_task_id_p:
        self.task.task_id = self.task.airflow_task_id

    # STEP 3 - now let update outputs
    self.initialize_outputs()

    outputs_sig = self._get_outputs_to_sign()
    if outputs_sig:
        sig = build_signature_from_values("task_outputs", outputs_sig)
        task.task_outputs_signature_obj = sig
    else:
        # no outputs to sign: reuse the task signature as outputs signature
        task.task_outputs_signature_obj = task.task_signature_obj
def targets_to_str(obj_or_struct):
    """Map every target inside `obj_or_struct` to its string form;
    None entries are filtered out."""
    stringified = traverse(obj_or_struct, convert_f=target_to_str, filter_none=True)
    return stringified
def to_targets(obj_or_struct, from_string_kwargs=None):
    """Convert an object or nested structure into target objects.

    `from_string_kwargs` is forwarded to the per-element converter for
    string-to-target construction; None entries are filtered out.
    """
    convert = partial(_to_target, from_string_kwargs=from_string_kwargs)
    return traverse(obj_or_struct, convert_f=convert, filter_none=True)
def to_tasks(obj_or_struct):
    """Convert an object or nested structure into task objects
    (None entries are dropped)."""
    tasks = traverse(obj_or_struct, convert_f=_to_task, filter_none=True)
    return tasks
def normalize(self, value):
    """Normalize each element of the structure via the sub value type;
    without a sub value type, `value` is returned unchanged."""
    if not self.sub_value_type:
        return value
    return traverse(value, self.sub_value_type.normalize)
def load_runtime(self, value, **kwargs):
    """Load each element of the structure into its runtime form via the
    sub value type; without a sub value type, `value` is returned as-is.
    Extra keyword arguments are accepted for interface compatibility and ignored."""
    if not self.sub_value_type:
        return value
    return traverse(value, self.sub_value_type.load_runtime)