Example #1
    def cleanup_after_task_run(self, task):
        # type: (Task) -> None
        rels = task.ctrl.relations
        # potentially, all input/output targets of the current task could be removed
        targets_to_clean = set(flatten([rels.task_inputs, rels.task_outputs]))

        targets_in_use = set()
        # any target that still appears in the inputs of an unfinished task must not be removed
        for tr in self.task_runs:
            if tr.task_run_state in TaskRunState.final_states():
                continue
            # collect inputs that are still needed so they are excluded from targets_to_clean
            for target in flatten(tr.task.ctrl.relations.task_inputs):
                targets_in_use.add(target)

        TARGET_CACHE.clear_for_targets(targets_to_clean - targets_in_use)
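
The cleanup above reduces to plain set arithmetic: everything the finished task touched, minus anything an unfinished task still reads. A minimal illustration with hypothetical target names:

# hypothetical illustration of the set difference used above
targets_to_clean = {"t1.csv", "t2.csv", "t3.csv"}  # inputs/outputs of the finished task
targets_in_use = {"t2.csv"}  # still read by an unfinished task
# only targets referenced nowhere else are evicted from the cache
assert targets_to_clean - targets_in_use == {"t1.csv", "t3.csv"}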
Example #2
import hashlib


def build_signature_from_values(name, struct):
    values = {str(value) for value in flatten(struct)}
    signature_str = "|".join(sorted(values))
    signature = hashlib.md5(signature_str.encode("utf-8")).hexdigest()
    return Signature(name=name,
                     signature=signature,
                     signature_source=signature_str)
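
A minimal usage sketch, assuming flatten collapses nested structures into a flat list (the name and values below are hypothetical):

# hypothetical usage; assumes flatten([["a", "b"], ["c"]]) -> ["a", "b", "c"]
sig = build_signature_from_values("my_struct", [["a", "b"], ["c"]])
# values are stringified, deduplicated and sorted before hashing
print(sig.signature_source)  # "a|b|c"
print(sig.signature)  # md5 hex digest of "a|b|c"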
Example #3
def find_non_consistent(targets):
    non_consistent = []
    for v in targets.values():
        for partial_output in flatten(v):
            if not partial_output.exist_after_write_consistent():
                non_consistent.append(partial_output)

    return non_consistent
Example #4
from collections import defaultdict


def find_non_completed(targets):
    missing = defaultdict(list)
    for k, v in targets.items():
        for partial_output in flatten(v):
            if not partial_output.exists():
                missing[k].append(partial_output)

    return missing
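
A minimal sketch of how find_non_completed groups missing outputs per key; the stub class below is hypothetical and only mimics the exists() method the function relies on:

# hypothetical stub standing in for a real target
class StubTarget(object):
    def __init__(self, path, present):
        self.path = path
        self.present = present

    def exists(self):
        return self.present

targets = {"train": [StubTarget("/tmp/a", True), StubTarget("/tmp/b", False)]}
missing = find_non_completed(targets)
# only the absent output is reported, keyed by its group:
# {"train": [<StubTarget /tmp/b>]}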
Example #5
    def run(self):
        missing = []
        for partial_output in flatten(self.data):
            if not partial_output.exists():
                missing.append(partial_output)
        if missing:
            raise friendly_error.task_data_source_not_exists(
                self, missing, downstream=self.task_dag.downstream)
Example #6
    def validate_task_inputs(self):
        if not self.task.ctrl.should_run():
            missing = find_non_completed(self.relations.task_outputs_user)
            missing_str = non_completed_outputs_to_str(missing)
            raise DatabandConfigError(
                "You are missing some input tasks in your pipeline! \n\t%s\n"
                "The task execution was disabled for '%s'." %
                (missing_str, self.task.task_id))

        missing = []
        for partial_output in flatten(self.relations.task_inputs_user):
            if not partial_output.exists():
                missing.append(partial_output)
        if missing:
            raise friendly_error.task_data_source_not_exists(
                self, missing, downstream=[self.task])
Example #7
    def initialize_dag_node(self):
        # connect to all required tasks
        upstream = flatten(to_tasks(self.relations.task_inputs))
        upstream = list(filter(None, upstream))

        # take care of orphan tasks
        for child in self.task_meta.get_children():
            if not child.task_dag.downstream:
                # no other task is waiting for this child,
                # so we add it to this task's upstream.
                # the child may also be required by the parent task
                # without being added yet (either way, the set below deduplicates)
                upstream.append(child)

        upstream = set(upstream)
        for upstream_task in upstream:
            self.set_upstream(upstream_task)
Example #8
    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user-side task outputs;
        # system task outputs don't matter (whether they exist or not),
        # since the user doesn't see them
        outputs = [
            o for o in flatten(self.task_outputs) if not o.config.overwrite_target
        ]
        if len(outputs) == 0:
            if not self.task_band:
                warnings.warn(
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                    stacklevel=2,
                )
                return False
            else:
                return self.task_band.exists()

        incomplete_outputs = [str(o) for o in outputs if not o.exists()]

        num_of_incomplete_outputs = len(incomplete_outputs)

        if 0 < num_of_incomplete_outputs < len(outputs):
            complete_outputs = [str(o) for o in outputs if o.exists()]
            logger.warning(
                "Task {} has incomplete outputs! "
                "This means the task might fail every time. "
                "Complete outputs: {} "
                "Incomplete outputs: {} "
                "Hint: clean the environment or overwrite the output".format(
                    self.task_meta.task_name,
                    ", ".join(complete_outputs),
                    ", ".join(incomplete_outputs),
                )
            )

        return num_of_incomplete_outputs == 0
Example #9
    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user-side task outputs;
        # system task outputs don't matter (whether they exist or not),
        # since the user doesn't see them
        outputs = flatten(self.task_outputs)
        if len(outputs) == 0:
            warnings.warn(
                "Task %r without outputs has no custom complete() method" %
                self,
                stacklevel=2,
            )
            return False

        return all((o.exists() for o in outputs))
Example #10
    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user-side task outputs;
        # system task outputs don't matter (whether they exist or not),
        # since the user doesn't see them
        outputs = [
            o for o in flatten(self.task_outputs)
            if not o.config.overwrite_target
        ]
        if len(outputs) == 0:
            if not self.task_band:
                warnings.warn(
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                    stacklevel=2,
                )
                return False
            else:
                return self.task_band.exists()

        incomplete_outputs = [str(o) for o in outputs if not o.exists()]

        num_of_incomplete_outputs = len(incomplete_outputs)

        if 0 < num_of_incomplete_outputs < len(outputs):
            complete_outputs = [str(o) for o in outputs if o.exists()]
            exc = incomplete_output_found_for_task(self.task_meta.task_name,
                                                   complete_outputs,
                                                   incomplete_outputs)

            if self.task_env.settings.run.validate_task_outputs_on_build:
                raise exc
            else:
                logger.warning(str(exc))

        return num_of_incomplete_outputs == 0
Example #11
    def exists(self):
        return all(
            t.exists() for t in flatten(self.targets) if t is not None)
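
Every example here leans on the same flatten helper. A minimal sketch of what such a helper might look like, assuming it recurses into lists, tuples and dict values and wraps scalar leaves in a single-element list; this is an assumption for illustration, not dbnd's actual implementation:

def flatten(struct):
    # minimal sketch, not the real implementation:
    # recursively collapse lists/tuples/dicts into a flat list of leaves
    if isinstance(struct, dict):
        struct = list(struct.values())
    if isinstance(struct, (list, tuple)):
        result = []
        for item in struct:
            result.extend(flatten(item))
        return result
    return [struct]  # scalar / target leaf

# flatten({"a": [1, [2, 3]], "b": 4}) -> [1, 2, 3, 4]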
Example #12
def tensorflow_graph(task):
    import time
    from collections import defaultdict

    from tensorboardX.src.attr_value_pb2 import AttrValue
    from tensorboardX.src.graph_pb2 import GraphDef
    from tensorboardX.src.node_def_pb2 import NodeDef
    from tensorboardX.src.step_stats_pb2 import (
        RunMetadata,
        StepStats,
        DeviceStepStats,
        NodeExecStats,
        AllocatorMemoryUsed,
    )
    from tensorboardX.src.tensor_shape_pb2 import TensorShapeProto
    from tensorboardX.src.versions_pb2 import VersionDef

    # assert isinstance(task, Task)

    tasks = task.ctrl.task_dag.subdag_tasks()
    children = defaultdict(list)
    parents = defaultdict(list)
    for task in tasks:
        children[task.task_id] = task.descendants.get_children()
        for t_child in children[task.task_id]:
            parents[t_child.task_id].append(task)
    friendly_name = {t.task_id: t.task_af_id for t in tasks}

    def _get_name(t):
        cur_id = t.task_id
        cur_scope = [friendly_name[cur_id]]
        while parents[cur_id]:
            # right now just do a simple scope, don't take care of shared tasks
            cur_id = parents[cur_id][0].task_id
            cur_scope.append(friendly_name[cur_id])
        return "/".join(reversed(cur_scope))

    nodes = []

    # calculate scope for all tasks
    names = {}
    for task in tasks:
        names[task.task_id] = _get_name(task)

    # attrs = attrs.replace("'", ' ')  # singlequote will be escaped by tensorboard
    def _name(t, *path):
        _path = [names[t.task_id]]
        if path:
            _path.extend(list(path))
        return "/".join(_path)

    def _op_name(task):
        return _name(task, task.task_id)

    def _target_name(task, target):
        return _name(task, target.name or "output")

    for task in tasks:
        tr = task.ctrl.relations
        assert isinstance(tr, TaskRelations)

        task_inputs = []
        op_name = _op_name(task)

        for target in flatten(tr.task_inputs_user):
            if not target.owner:
                continue
            task_inputs.append(_target_name(target.source_task, target))

        for target in flatten(tr.task_outputs_user):
            if not target.owner:
                continue

            output_inputs = [op_name]  # current task
            if target.source_task != task:
                real_output = _target_name(target.source_task, target)
                output_inputs.append(real_output)
                task_inputs.append(real_output)
            node_info = {
                "name": _target_name(task, target),
                "op": "output",
                "task_inputs": [op_name],
                "attrs": {"path": AttrValue(s=str(target).encode(encoding="utf_8"))},
            }
            # node_info['outputsize'] = []
            nodes.append(node_info)

        attrs = {}
        for k, v in task._params.get_params_serialized():
            attrs[k] = AttrValue(s=v.encode(encoding="utf_8"))
        node_info = {
            "name": op_name,
            "op": task.task_name,
            "task_inputs": task_inputs,
            "attrs": attrs,
        }

        # node_info['outputsize'] = []
        nodes.append(node_info)

    # mapping = {}
    # for n in nodes:
    #     mapping[n['name']] = scope[n['name']] + '/' + \
    #                          n['op'] + '_' + n['name']
    # for n in nodes:
    #     n['name'] = mapping[n['name']]
    #     for i, s in enumerate(n['task_inputs']):
    #         n['task_inputs'][i] = mapping[s]

    node_stats = []
    graph_nodes = []
    for node in nodes:
        attrs = node.get("attrs", {})
        if "outputsize" in node.keys():
            shapeproto = TensorShapeProto(
                dim=[TensorShapeProto.Dim(size=d) for d in node["outputsize"]]
            )
            attrs["_output_shapes"] = AttrValue(
                list=AttrValue.ListValue(shape=[shapeproto])
            )

        if "exec_stats" in node:
            node_stats.append(
                NodeExecStats(
                    node_name=node["name"],
                    all_start_micros=int(time.time() * 1e7),
                    all_end_rel_micros=42,
                    memory=[
                        AllocatorMemoryUsed(
                            allocator_name="cpu",
                            total_bytes=19950829,
                            peak_bytes=19950829,
                            live_bytes=19950829,
                        )
                    ],
                )
            )

        graph_nodes.append(
            NodeDef(
                name=node["name"], op=node["op"], input=node["task_inputs"], attr=attrs
            )
        )

    stepstats = RunMetadata(
        step_stats=StepStats(
            dev_stats=[DeviceStepStats(device="/device:CPU:0", node_stats=node_stats)]
        )
    )
    return GraphDef(node=graph_nodes, versions=VersionDef(producer=22)), stepstats
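
Since tensorflow_graph returns a (GraphDef, RunMetadata) pair, the generated nodes can be inspected straight off the proto; the task below is assumed to be an already-built dbnd task:

# hypothetical usage; `task` is an already-built dbnd task
graph_def, run_metadata = tensorflow_graph(task)
for node in graph_def.node:
    print(node.name, node.op)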
Example #13
def data_combine(inputs, sort=False):
    targets = flatten(to_targets(inputs))
    if sort:
        targets = sorted(targets, key=lambda x: x.path)
    return MultiTarget(targets)
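
A minimal usage sketch, assuming to_targets accepts plain path strings and each resulting target exposes a .path attribute (the paths and the .targets attribute on MultiTarget are assumptions here):

# hypothetical usage; to_targets() is assumed to resolve path strings
combined = data_combine(["/data/part-01.csv", "/data/part-00.csv"], sort=True)
# targets inside the MultiTarget are now ordered by path
print([t.path for t in combined.targets])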