def cleanup_after_task_run(self, task):
    # type: (Task) -> None
    """Clear cached targets of ``task`` that no unfinished task run still reads.

    Every input and output target of ``task`` is a candidate for removal.
    Targets listed among the inputs of any task run whose state is not one
    of ``TaskRunState.final_states()`` are kept; the rest are passed to
    ``TARGET_CACHE.clear_for_targets``.
    """
    relations = task.ctrl.relations

    # Candidates: all input/output targets of the task that just ran.
    candidates = set(flatten([relations.task_inputs, relations.task_outputs]))

    # Inputs of task runs that have not reached a final state must survive.
    still_needed = set()
    for task_run in self.task_runs:
        if task_run.task_run_state not in TaskRunState.final_states():
            still_needed.update(flatten(task_run.task.ctrl.relations.task_inputs))

    TARGET_CACHE.clear_for_targets(candidates - still_needed)
def build_signature_from_values(name, struct):
    """Build a ``Signature`` from the distinct string forms of ``struct``'s values.

    The values produced by ``flatten(struct)`` are converted with ``str``,
    de-duplicated, sorted and joined with ``"|"``.  That joined text is stored
    as ``signature_source`` and its MD5 hex digest as ``signature``.
    """
    unique_values = {str(item) for item in flatten(struct)}
    source = "|".join(sorted(unique_values))
    digest = hashlib.md5(source.encode("utf-8")).hexdigest()
    return Signature(name=name, signature=digest, signature_source=source)
def find_non_consistent(targets):
    """Return the flattened targets failing ``exist_after_write_consistent()``.

    ``targets`` is a mapping; only its values are inspected.  The result is a
    flat list in mapping-iteration order.
    """
    return [
        target
        for _, value in targets.items()
        for target in flatten(value)
        if not target.exist_after_write_consistent()
    ]
def find_non_completed(targets):
    """Group the targets that do not exist by the key they were found under.

    Returns a ``defaultdict(list)`` mapping each key of ``targets`` that has
    at least one missing target to the list of those targets.  Keys whose
    targets all exist are not added.
    """
    missing = defaultdict(list)
    for name, value in targets.items():
        absent = [target for target in flatten(value) if not target.exists()]
        if absent:
            missing[name].extend(absent)
    return missing
def run(self):
    """Verify that every target in ``self.data`` exists.

    Raises the error built by ``friendly_error.task_data_source_not_exists``
    when at least one flattened target reports ``exists()`` as false.
    """
    missing = [target for target in flatten(self.data) if not target.exists()]
    if not missing:
        return
    raise friendly_error.task_data_source_not_exists(
        self, missing, downstream=self.task_dag.downstream
    )
def validate_task_inputs(self):
    """Fail early when the task's inputs are not available.

    Raises:
        DatabandConfigError: when ``self.task.ctrl.should_run()`` is false;
            the message lists the non-completed user outputs.
        The error built by ``friendly_error.task_data_source_not_exists``:
            when any flattened user input target does not exist.
    """
    if not self.task.ctrl.should_run():
        non_completed = find_non_completed(self.relations.task_outputs_user)
        details = non_completed_outputs_to_str(non_completed)
        message = (
            "You are missing some input tasks in your pipeline! \n\t%s\n"
            "The task execution was disabled for '%s'." % (details, self.task.task_id)
        )
        raise DatabandConfigError(message)

    absent_inputs = [
        target
        for target in flatten(self.relations.task_inputs_user)
        if not target.exists()
    ]
    if absent_inputs:
        raise friendly_error.task_data_source_not_exists(
            self, absent_inputs, downstream=[self.task]
        )
def initialize_dag_node(self):
    """Register this node's upstream tasks via ``self.set_upstream``.

    Upstream tasks are the truthy entries obtained from the task inputs plus
    every child task that currently has no downstream task.
    """
    # Tasks required through the declared inputs (falsy entries dropped).
    required = [
        upstream_task
        for upstream_task in flatten(to_tasks(self.relations.task_inputs))
        if upstream_task
    ]

    # Orphan children: nothing is waiting for them yet, so this task adopts
    # them.  A child may also be required through the inputs above; the set
    # below removes such duplicates.
    orphans = [
        child
        for child in self.task_meta.get_children()
        if not child.task_dag.downstream
    ]

    for upstream_task in set(required + orphans):
        self.set_upstream(upstream_task)
def _complete(self):
    """
    Report whether the task counts as complete.

    Outputs whose ``config.overwrite_target`` is set are ignored.  With no
    remaining outputs the result is ``self.task_band.exists()``, or ``False``
    (with a warning) when there is no task band.  Otherwise the result is
    ``True`` only if every remaining output exists; a mix of existing and
    missing outputs is logged as a warning.
    """
    checked_outputs = []
    for output in flatten(self.task_outputs):
        if output.config.overwrite_target:
            continue
        checked_outputs.append(output)

    if not checked_outputs:
        if self.task_band:
            return self.task_band.exists()
        warnings.warn(
            "Task %r without outputs has no custom complete() and no task band!"
            % self,
            stacklevel=2,
        )
        return False

    incomplete = [str(o) for o in checked_outputs if not o.exists()]
    incomplete_count = len(incomplete)

    # Some, but not all, of the outputs are missing.
    partially_written = 0 < incomplete_count < len(checked_outputs)
    if partially_written:
        complete = [str(o) for o in checked_outputs if o.exists()]
        message = (
            "Task {} has incomplete outputs! "
            "This means the task might fail every time. "
            "Complete outputs: {} "
            "Incomplete outputs: {} "
            "Hint: clean the environment or overwrite the output".format(
                self.task_meta.task_name,
                ", ".join(complete),
                ", ".join(incomplete),
            )
        )
        logger.warning(message)

    return incomplete_count == 0
def _complete(self):
    """
    Report whether every output of the task exists.

    Returns ``False`` (after emitting a warning) when the task has no
    outputs at all; otherwise ``True`` only if each flattened output
    reports ``exists()``.
    """
    task_outputs = flatten(self.task_outputs)

    if len(task_outputs) == 0:
        message = "Task %r without outputs has no custom complete() method" % self
        warnings.warn(message, stacklevel=2)
        return False

    # Stop at the first missing output.
    for output in task_outputs:
        if not output.exists():
            return False
    return True
def _complete(self):
    """
    Report whether the task counts as complete.

    Outputs whose ``config.overwrite_target`` is set are ignored.  With no
    remaining outputs the result is ``self.task_band.exists()``, or ``False``
    (with a warning) when there is no task band.  Otherwise the result is
    ``True`` only if every remaining output exists.  When only some outputs
    are missing, the error built by ``incomplete_output_found_for_task`` is
    raised if ``validate_task_outputs_on_build`` is enabled in the run
    settings, and logged as a warning otherwise.
    """
    checked_outputs = []
    for output in flatten(self.task_outputs):
        if output.config.overwrite_target:
            continue
        checked_outputs.append(output)

    if not checked_outputs:
        if self.task_band:
            return self.task_band.exists()
        warnings.warn(
            "Task %r without outputs has no custom complete() and no task band!"
            % self,
            stacklevel=2,
        )
        return False

    incomplete = [str(o) for o in checked_outputs if not o.exists()]
    incomplete_count = len(incomplete)

    # Some, but not all, of the outputs are missing.
    partially_written = 0 < incomplete_count < len(checked_outputs)
    if partially_written:
        complete = [str(o) for o in checked_outputs if o.exists()]
        error = incomplete_output_found_for_task(
            self.task_meta.task_name, complete, incomplete
        )
        if self.task_env.settings.run.validate_task_outputs_on_build:
            raise error
        logger.warning(str(error))

    return incomplete_count == 0
def exists(self):
    """Return ``True`` if every non-``None`` target in ``self.targets`` exists.

    ``None`` entries are skipped; checking stops at the first target whose
    ``exists()`` is false.
    """
    for target in flatten(self.targets):
        if target is not None and not target.exists():
            return False
    return True
def tensorflow_graph(task):
    """Describe the sub-DAG of ``task`` as a tensorboardX graph.

    One "op" node is produced per task in the sub-DAG and one "output" node
    per owned user output target.  Node names are scoped by the chain of
    parent tasks (first parent only).

    Returns:
        A ``(GraphDef, RunMetadata)`` tuple.
    """
    from tensorboardX.src.attr_value_pb2 import AttrValue
    from tensorboardX.src.graph_pb2 import GraphDef
    from tensorboardX.src.node_def_pb2 import NodeDef
    from tensorboardX.src.step_stats_pb2 import (
        RunMetadata,
        StepStats,
        DeviceStepStats,
        NodeExecStats,
        AllocatorMemoryUsed,
    )
    from tensorboardX.src.tensor_shape_pb2 import TensorShapeProto
    from tensorboardX.src.versions_pb2 import VersionDef

    tasks = task.ctrl.task_dag.subdag_tasks()

    # Map each task id to the tasks that list it among their children.
    parents = defaultdict(list)
    for parent_task in tasks:
        for t_child in parent_task.descendants.get_children():
            parents[t_child.task_id].append(parent_task)

    friendly_name = {t.task_id: t.task_af_id for t in tasks}

    def _get_name(t):
        # Walk up through the first parent of each task to build the scope;
        # tasks shared between several parents are not handled specially.
        cur_id = t.task_id
        cur_scope = [friendly_name[cur_id]]
        while parents[cur_id]:
            cur_id = parents[cur_id][0].task_id
            cur_scope.append(friendly_name[cur_id])
        return "/".join(reversed(cur_scope))

    # Scoped name for every task in the sub-DAG.
    names = {t.task_id: _get_name(t) for t in tasks}

    def _name(t, *path):
        return "/".join([names[t.task_id]] + list(path))

    def _op_name(t):
        return _name(t, t.task_id)

    def _target_name(owner_task, target):
        # Takes the task explicitly: every call site passes (task, target).
        return _name(owner_task, target.name or "output")

    nodes = []
    for current in tasks:
        tr = current.ctrl.relations
        assert isinstance(tr, TaskRelations)
        task_inputs = []
        op_name = _op_name(current)

        for target in flatten(tr.task_inputs_user):
            if not target.owner:
                continue
            task_inputs.append(_target_name(target.source_task, target))

        for target in flatten(tr.task_outputs_user):
            if not target.owner:
                continue
            if target.source_task != current:
                # An output produced by another task also feeds this op.
                task_inputs.append(_target_name(target.source_task, target))

            nodes.append(
                {
                    "name": _target_name(current, target),
                    "op": "output",
                    "task_inputs": [op_name],
                    "attrs": {
                        "path": AttrValue(s=str(target).encode(encoding="utf_8"))
                    },
                }
            )

        attrs = {}
        for k, v in current._params.get_params_serialized():
            attrs[k] = AttrValue(s=v.encode(encoding="utf_8"))

        nodes.append(
            {
                "name": op_name,
                "op": current.task_name,
                "task_inputs": task_inputs,
                "attrs": attrs,
            }
        )

    node_stats = []
    graph_nodes = []
    for node in nodes:
        attrs = node.get("attrs", {})
        # "outputsize" / "exec_stats" are optional keys; the nodes built
        # above do not set them.
        if "outputsize" in node:
            shapeproto = TensorShapeProto(
                dim=[TensorShapeProto.Dim(size=d) for d in node["outputsize"]]
            )
            attrs["_output_shapes"] = AttrValue(
                list=AttrValue.ListValue(shape=[shapeproto])
            )
        if "exec_stats" in node:
            node_stats.append(
                NodeExecStats(
                    node_name=node["name"],
                    all_start_micros=int(time.time() * 1e7),
                    all_end_rel_micros=42,
                    memory=[
                        AllocatorMemoryUsed(
                            allocator_name="cpu",
                            total_bytes=19950829,
                            peak_bytes=19950829,
                            live_bytes=19950829,
                        )
                    ],
                )
            )
        graph_nodes.append(
            NodeDef(
                name=node["name"],
                op=node["op"],
                input=node["task_inputs"],
                attr=attrs,
            )
        )

    stepstats = RunMetadata(
        step_stats=StepStats(
            dev_stats=[DeviceStepStats(device="/device:CPU:0", node_stats=node_stats)]
        )
    )
    return GraphDef(node=graph_nodes, versions=VersionDef(producer=22)), stepstats
def data_combine(inputs, sort=False):
    """Wrap the flattened targets of ``inputs`` in a single ``MultiTarget``.

    When ``sort`` is true the targets are ordered by their ``path``
    attribute before being combined.
    """
    combined = flatten(to_targets(inputs))
    if sort:
        combined = list(combined)
        combined.sort(key=lambda target: target.path)
    return MultiTarget(combined)