def _build_parameter(self, context="inline"):
    """
    Finalize ``self.parameter`` and return the resulting parameter definition.

    Resolves the value type, validator, empty default, ``load_on_build`` flag,
    value meta configuration and description, takes the next value of
    ``ParameterDefinition._total_counter`` as the parameter id and applies
    everything through ``self.modify``.

    :param context: forwarded to ``self._build_value_type``
    :return: the ``parameter`` attribute of the object returned by ``self.modify``
    """
    s = self.parameter  # type: ParameterDefinition
    update_kwargs = {}

    value_type = self._build_value_type(context)

    # explicit choices replace whatever validator was configured
    validator = s.validator
    if s.choices:
        validator = ChoiceValidator(s.choices)

    if is_not_defined(s.default) and s.empty_default:
        update_kwargs["default"] = value_type._generate_empty_default()

    if not is_defined(s.load_on_build):
        update_kwargs["load_on_build"] = value_type.load_on_build

    # create value meta
    if s.value_meta_conf is None:
        update_kwargs["value_meta_conf"] = ValueMetaConf(
            log_preview=s.log_preview,
            log_preview_size=s.log_preview_size,
            log_schema=s.log_schema,
            log_size=s.log_size,
            log_stats=s.log_stats,
            log_histograms=s.log_histograms,
        )

    # An explicit, non-empty description is used untouched. Otherwise fall back
    # to the per-kind default description and append the validator's description.
    # The undefined-sentinel check is kept separate from the truthiness check so
    # the fallback works regardless of how the sentinel evaluates as a boolean.
    description = s.description
    if is_not_defined(description) or not description:
        if s.is_output() and s.default_output_description:
            description = s.default_output_description
        elif not s.load_on_build and s.default_input_description:
            description = s.default_input_description
        else:
            description = s.default_description or ""

        # use the effective validator (it may come from ``choices``),
        # because that is the object whose description is read
        if validator:
            description = _add_description(description, validator.description)

    # ``description`` is passed to ``modify`` explicitly below, so it must NOT be
    # duplicated inside ``update_kwargs`` (that would be a repeated keyword).

    # We need to keep track of this to get the order right (see Task class)
    ParameterDefinition._total_counter += 1

    if s.kind == _ParameterKind.task_output:
        update_kwargs["significant"] = False

    updated = self.modify(
        value_type=value_type,
        value_type_defined=value_type,
        validator=validator,
        description=description,
        parameter_id=ParameterDefinition._total_counter,
        **update_kwargs
    )

    return updated.parameter
def partition(self, name=NOTHING, extension=NOTHING, config=NOTHING, **kwargs):
    """
    Build the target of a single partition located under ``self.path``.

    :param name: file name of the partition. if not provided - "part-%04d" % ID
    :param extension: extension. if not provided -> default extension will be used
    :param config: target config. if not provided -> ``self.config.as_file()``
    :param kwargs: forwarded to ``target``
    :return: FileTarget that represents the partition.
    """
    if is_not_defined(name):
        # auto-named partitions consume the next sequential id
        name = "part-%04d" % self._auto_partition_count
        self._auto_partition_count += 1

    # only if it's a call not from file,folder - we set it as file
    part_config = self.config.as_file() if is_not_defined(config) else config

    suffix = part_config.get_ext() if is_not_defined(extension) else extension
    file_name = name + suffix if suffix else name

    return target(self.path, file_name, config=part_config, fs=self._fs, **kwargs)
def file(self, name=NOTHING, extension=NOTHING, config=NOTHING, **kwargs):
    """
    Build a file target under this target by delegating to ``partition``.

    :param name: file name. if not provided - ``partition`` generates one
    :param extension: extension. if not provided -> default extension will be used
    :param config: target config. if not provided -> ``self.config`` is used;
        in both cases it is converted with ``as_file()`` before delegation
    :param kwargs: forwarded to ``partition``
    :return: whatever ``partition`` returns for the resolved arguments
    """
    if is_not_defined(config):
        base_config = self.config
    else:
        base_config = config

    file_config = base_config.as_file()
    return self.partition(
        name=name, extension=extension, config=file_config, **kwargs
    )
def initialize_required(self):
    """
    Collect the requirements of the task and return them through ``to_targets``.

    The result is built from a ``{"user": {}, "system": {}}`` mapping:

    * every input parameter whose traversed value is non-empty goes to the
      section chosen by ``_section``
    * "system"/"band": user outputs of a ``PipelineTask`` (must all be defined)
    * "system"/"pipelines": child pipelines keyed by ``task_id``
    * "system"/"required": the result of ``self.task._requires()``

    Empty "system" entries are not added.

    :raises: ``friendly_error.task_build.pipeline_task_has_unassigned_outputs``
        when a pipeline task has a user output without a value
    """
    required = {"user": {}, "system": {}}
    system_section = required["system"]

    # we take all parameters that are inputs (not outputs)
    # however Primitive parameters are inputs only if they are Target (deferred)
    for param, raw_value in self.params.get_param_values(input_only=True):
        if raw_value is None:
            continue
        found = traverse(
            raw_value, convert_f=_find_target, filter_none=True, filter_empty=True
        )
        if found:
            required[_section(param)][param.name] = found

    from dbnd import PipelineTask

    if isinstance(self.task, PipelineTask):
        band_outputs = {}
        for param, raw_value in self.params.get_param_values(
            output_only=True, user_only=True
        ):
            is_band = param.name == "task_band"
            if is_band or isinstance(param, FuncResultParameter):
                continue
            if is_not_defined(raw_value):
                raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                    task=self.task, param=param
                )
            band_outputs[param.name] = raw_value

        if band_outputs:
            system_section["band"] = band_outputs

    # find all child pipelines and make them upstreams to the task
    child_pipelines = {p.task_id: p for p in self._get_all_child_pipelines()}
    if child_pipelines:
        system_section["pipelines"] = child_pipelines

    # now may be user still use function _requires - so let add that to dependencies
    user_requires = self.task._requires()
    if user_requires:
        system_section["required"] = user_requires

    return to_targets(required)
def initialize_outputs(self):
    """
    The default output that this Task produces. Use outputs!
    Override only if you are writing "base" class

    Output parameters without a value get one from ``build_output`` (also set
    on the task). Every output except ``FuncResultParameter`` is collected into
    a "user"/"system" mapping; a non-empty ``self.task._output()`` replaces the
    "user" section. The final mapping is stored in ``self.task_outputs``.
    """
    task = self.task
    collected = {"user": {}, "system": {}}

    output_params = self.params.get_params_with_value(ParameterFilters.OUTPUTS)
    for param, param_value in output_params:
        if is_not_defined(param_value):
            param_value = param.build_output(task=task)
            setattr(self.task, param.name, param_value)

        if not isinstance(param, FuncResultParameter):
            owned_value = traverse_and_set_target(
                param_value, param._target_source(self.task)
            )
            collected[_section(param)][param.name] = owned_value

    custom_outputs = self.task._output()
    if custom_outputs:
        regular_user_outputs = collected["user"]
        if regular_user_outputs:
            warnings.warn(
                "Task %s has custom outputs in _output() function, all other outputs will be removed: %s"
                % (task, regular_user_outputs),
                stacklevel=2,
            )
        collected["user"] = custom_outputs

    # take ownership of all outputs and clean it, just in case
    # usually all outputs are assigned to task
    # just in case we have some "outputs" with Tasks
    as_targets = to_targets(collected)
    task_source = TargetSource(task_id=self.task_id)
    self.task_outputs = traverse_and_set_target(as_targets, target_source=task_source)
def initialize_band(self):
    """
    Call ``self.task.band()`` and post-process the task outputs.

    Steps, all inside one ``try`` block:

    1. snapshot the current value of every output parameter (except
       ``task_band`` and ``FuncResultParameter``)
    2. call ``band()`` (inside the airflow operators catcher context when
       airflow is enabled) and store its result on the task and on ``self``
    3. for a ``PipelineTask``: fail if a non-system output is still undefined
    4. for every output whose value object changed during ``band()``: convert
       it with ``to_targets`` and write it back

    On any exception a red "Failed to run ..." banner is logged as a warning.
    If ``self.task.task_decorator`` is truthy the original exception is
    re-raised, otherwise ``friendly_error.task_build.failed_to_call_band``
    is raised.
    """
    try:
        band_context = []
        if is_airflow_enabled():
            from dbnd_airflow.dbnd_task_executor.airflow_operators_catcher import (
                get_databand_op_catcher_dag,
            )

            band_context.append(get_databand_op_catcher_dag())

        # remember the values from before the band call, so changes can be detected
        original_param_values = []
        for param_value in self.task.task_params.get_param_values(
                ParameterFilters.OUTPUTS):
            if param_value.name == "task_band" or isinstance(
                    param_value.parameter, FuncResultParameter):
                continue
            original_param_values.append((param_value, param_value.value))

        with nested(*band_context):
            band = self.task.band()
            # this one would be normalized
            self.task._task_band_result = band
            self.task_band_result = band  # real value

        from dbnd import PipelineTask

        if isinstance(self.task, PipelineTask):
            # after .band has finished, all user outputs of the .band should be defined
            for param_value, _ in original_param_values:
                # we want to validate only user facing parameters
                # they should have assigned values by this moment,
                # pipeline task can not have None outputs, after band call
                if param_value.parameter.system:
                    continue
                if is_not_defined(param_value.value):
                    raise friendly_error.task_build.pipeline_task_has_unassigned_outputs(
                        task=self.task, param=param_value.parameter)

        # now let's normalize if user has changed outputs
        for param_value, original_value in original_param_values:
            # identity check: only values replaced by a different object are normalized
            if param_value.value is original_value:
                continue

            try:
                from dbnd._core.utils.task_utils import to_targets

                normalized_value = to_targets(param_value.value)
                param_value.update_param_value(normalized_value)
            except Exception as ex:
                raise friendly_error.task_build.failed_to_assign_param_value_at_band(
                    ex, param_value.parameter, param_value.value, self.task)

    except Exception as ex:
        # note: this handler also receives the friendly errors raised above
        logger.warning(
            self.visualiser.banner(
                msg="Failed to run %s" % _band_call_str(self.task),
                color="red",
                exc_info=sys.exc_info(),
            ))

        if self.task.task_decorator:
            # just re-raise, we already have an error from the "run" function
            raise

        raise friendly_error.task_build.failed_to_call_band(ex, self.task)
def __init__(
    self,
    task_name,
    task_definition,
    task_params,
    task_signature_obj=None,
    task_version=None,
):
    """
    Create a tracking task and register it in the current databand context.

    :param task_name: forwarded to the base class constructor
    :param task_definition: forwarded to the base class and stored on the task
    :param task_params: forwarded to the base class; undefined outputs are
        updated in it with the result of ``build_tracking_output``
    :param task_signature_obj: if falsy, a unique tracking signature is generated
    :param task_version: if falsy, inherited from the current (parent) task,
        or generated from the current UTC time when there is no parent
    """
    if not task_signature_obj:
        task_signature_obj = _generate_unique_tracking_signature()

    super(TrackingTask, self).__init__(
        task_name=task_name,
        task_definition=task_definition,
        task_signature_obj=task_signature_obj,
        task_params=task_params,
    )

    self.task_definition = task_definition  # type: TaskDefinition
    # we don't have signature for outputs
    self.task_outputs_signature_obj = self.task_signature_obj
    self.ctrl = TrackingTaskCtrl(self)

    user_frame = self.dbnd_context.user_code_detector.find_user_side_frame(1)
    self.task_call_source = [user_frame]

    parent = try_get_current_task()
    if not parent:
        # we need better definition of "what we use for tracking"
        self.task_version = task_version or utcnow().strftime("%Y%m%d_%H%M%S")
        self.task_target_date = utcnow().date()
        self.task_env = get_databand_context().env
        self.task_children_scope_params = {}
    else:
        parent.descendants.add_child(self.task_id)
        self.task_call_source.extend(parent.task_call_source)

        # inherit from parent if it has it
        self.task_version = task_version or parent.task_version
        self.task_target_date = parent.task_target_date
        self.task_env = parent.task_env

        # pass-through parent children scope params
        # task_children_scope_params will be used in case of any Task inside TrackedTask
        # for example tracked task creates Config objects
        self.task_children_scope_params = parent.task_children_scope_params

    self.task_outputs = {}
    for parameter, value in self._params.get_params_with_value(
        ParameterFilters.OUTPUTS
    ):
        if is_not_defined(value):
            tracking_target = self.build_tracking_output(parameter)
            task_params.update_param_value(parameter.name, tracking_target)
            # NOTE(review): ``value`` is not rebound here, so the undefined value
            # (not ``tracking_target``) is what is traversed below - confirm intent

        if not isinstance(parameter, FuncResultParameter):
            # This is used to keep backward compatibility for tracking luigi behaviour
            # This is not something we want to keep, at least not in this form
            self.task_outputs[parameter.name] = traverse_and_set_target(
                value, parameter._target_source(self)
            )

    self.ctrl._initialize_task()

    # so we can be found via task_id
    self.dbnd_context.task_instance_cache.register_task_instance(self)
def _get_result_parameter(self):
    """
    Build the result parameter of a decorated function.

    Two sources are combined: the ``result`` keyword of the decorator
    (``self.decorator_kwargs[RESULT_PARAM]``) and the return type guessed by
    ``guess_func_return_type`` from ``self.decorator_spec``.

    :return: ``{}`` when there is no result (``result=None`` in the decorator,
        or no ``result`` keyword and a ``None`` return spec); otherwise the
        value returned by ``build_parameter``
    :raises: ``friendly_error.task_parameters.dict_in_result_definition`` when
        the decorator ``result`` is a dict
    """
    # context string passed to build_parameter: "<decorator spec name>.<RESULT_PARAM>"
    context = "{}.{}".format(self.decorator_spec.name, RESULT_PARAM)

    return_spec = guess_func_return_type(self.decorator_spec)

    deco_spec = None
    # first of all, let's parse the definition we have
    if RESULT_PARAM in self.decorator_kwargs:
        # @task(result=...)
        deco_spec = self.decorator_kwargs[RESULT_PARAM]
        if isinstance(deco_spec, dict):
            raise friendly_error.task_parameters.dict_in_result_definition(
                deco_spec)

        # @task(result=None)
        if deco_spec is None:
            # user explicitly doesn't want to have a result value
            return {}

        if isinstance(deco_spec, six.string_types):
            # we have result = "output1,output2"
            # support both space and comma
            deco_spec = deco_spec.replace(",", " ").split()
            # a single name is handled as a single result, not as a list of one
            if len(deco_spec) == 1:
                deco_spec = deco_spec[0]
        elif isinstance(deco_spec, tuple):
            deco_spec = list(deco_spec)

        # user didn't specify - so we don't have any "hints"
        if is_not_defined(return_spec):
            return_spec = None
        elif return_spec is not None:
            # we will use type hints from the "-> ..." spec only if it has an exact match to our params
            return_spec = self._validate_return_spec(
                deco_spec, return_spec)
    else:
        # we don't have @task(result=)
        if return_spec is None:
            # .. -> None
            # user explicitly doesn't want to have a result value
            return {}
        # let's return the default parameter ( pickle in @task)
        if is_not_defined(return_spec):
            return build_parameter(self.decorator_spec.default_result, context)

        # so we have something in the return spec, let's use it
        if isinstance(return_spec, list):
            # we can get names from ->
            deco_spec = [r[0] for r in return_spec]
        else:
            # or we just use the default name
            deco_spec = RESULT_PARAM

    # so now we have 2 cases
    # 1. we have list of results -->
    if isinstance(deco_spec, list):
        result = []
        for i, deco_p in enumerate(deco_spec):
            value_type_hint = None
            if return_spec:
                # return_spec items are unpacked as (name, type hint); only the hint is used
                _, value_type_hint = return_spec[i]
            deco_p = self._get_result_parameter_part(
                p=deco_p, name_hint="result_%s" % i, value_type_hint=value_type_hint)
            result.append(deco_p)
        param = self._build_multiple_outputs_result(result)
    # 2. we have only one result-->
    else:
        param = self._get_result_parameter_part(
            p=deco_spec, name_hint=RESULT_PARAM, value_type_hint=return_spec)

    return build_parameter(param, context)