Example #1
def _create_dag_dict(tasks):
    dag_dict = {}
    for id_, task_info in tasks.items():
        # Add the task to the graph as a node.
        depends_on = ensure_list(task_info.get("depends_on", [])).copy()
        depends_on.extend(ensure_list(task_info.get("template", [])))
        depends_on.append(task_info["config"])

        dag_dict[id_] = depends_on

        # If the task produces anything, register the output as a node.
        for target in ensure_list(task_info.get("produces", [])):
            dag_dict[target] = [id_]

    return dag_dict
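For orientation, here is a minimal sketch of the adjacency mapping this function builds, based on a hypothetical two-task specification; the task names and paths are made up, and ensure_list is assumed to wrap scalars into one-element lists.

# Hypothetical task specification (illustrative only).
tasks = {
    "prepare_data": {
        "template": "prepare_data.py",
        "config": "src/prepare_data.yaml",
        "produces": "bld/data.pkl",
    },
    "estimate": {
        "template": "estimate.py",
        "config": "src/estimate.yaml",
        "depends_on": "bld/data.pkl",
        "produces": "bld/estimates.csv",
    },
}

# _create_dag_dict(tasks) maps every task to its dependencies, template and
# config, and every produced file back to the task that creates it:
#
# {
#     "prepare_data": ["prepare_data.py", "src/prepare_data.yaml"],
#     "bld/data.pkl": ["prepare_data"],
#     "estimate": ["bld/data.pkl", "estimate.py", "src/estimate.yaml"],
#     "bld/estimates.csv": ["estimate"],
# }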
Example #2
def save_hash_of_task_target(id_, dag):
    """Loop over the targets of a task and save the hashes of the files."""
    for target in ensure_list(dag.nodes[id_]["produces"]):
        paths = _path_to_file_or_directory_to_path_iterator(target)

        for path in paths:
            hash_ = _compute_hash_of_file(path, path.stat().st_mtime)
            create_or_update_hash(id_, path.as_posix(), hash_)
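The helpers _path_to_file_or_directory_to_path_iterator, _compute_hash_of_file, and create_or_update_hash are not shown here. As a rough sketch of what the hash computation might look like, a content hash salted with the modification time could be written as follows; this is an illustrative stand-in under that assumption, not the project's implementation.

import hashlib
from pathlib import Path


def compute_hash_of_file(path, mtime):
    """Hash the file's bytes together with its modification time (illustrative)."""
    hasher = hashlib.sha256()
    hasher.update(str(mtime).encode())
    hasher.update(Path(path).read_bytes())
    return hasher.hexdigest()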
Example #3
def load_config(debug=None, n_jobs=None, priority=None, config=None):
    if config is None:
        path = Path.cwd() / ".pipeline.yaml"

        if path.exists():
            config = read_yaml(path.read_text())
            config = {} if not config else config
            config["user_config_file"] = path.as_posix()
            config["user_config_directory"] = path.parent.as_posix()
        else:
            raise ValueError(
                "Cannot find '.pipeline.yaml' in current directory.")

    for key, default, default_parent in [
        ("project_directory", ".", "user_config_directory"),
        ("source_directory", "src", "project_directory"),
        ("build_directory", "bld", "project_directory"),
        ("hidden_build_directory", ".pipeline", "build_directory"),
        ("hidden_task_directory", ".tasks", "build_directory"),
    ]:
        config[key] = _generate_path(key, default, default_parent, config)

    custom_templates_dirs = ensure_list(config.get("custom_templates", []))
    config["custom_templates"] = [
        _generate_path(path, default_parent="project_directory", config=config)
        for path in custom_templates_dirs
    ]

    config["_is_debug"] = debug if debug is not None else False

    config["globals"] = config.get("globals", {})

    config["priority_scheduling"] = (config.get("priority_scheduling", False)
                                     if priority is None else priority)
    config["priority_discount_factor"] = config.get("priority_discount_factor",
                                                    0)

    if config["_is_debug"]:
        # Turn off parallelization if debug mode is requested.
        config["n_jobs"] = 1
    else:
        # The command-line input has precedence over the value in the config file.
        config["n_jobs"] = n_jobs if n_jobs is not None else config.get(
            "n_jobs", 1)

    Path(config["hidden_build_directory"]).mkdir(parents=True, exist_ok=True)
    config["db"] = config.get(
        "db",
        {
            "provider": "sqlite",
            "filename": config["hidden_build_directory"] + "/db.sql",
            "create_db": True,
        },
    )

    return config
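To illustrate the defaults, assume a minimal .pipeline.yaml in the project root. The file contents, the absolute paths, and the exact form of the resolved values are assumptions, since _generate_path is not shown.

# Hypothetical contents of /home/user/project/.pipeline.yaml:
#
#     source_directory: src
#     build_directory: bld
#     n_jobs: 4
#
# Under these assumptions, load_config() would return roughly:
config = {
    "user_config_file": "/home/user/project/.pipeline.yaml",
    "user_config_directory": "/home/user/project",
    "project_directory": "/home/user/project",
    "source_directory": "/home/user/project/src",
    "build_directory": "/home/user/project/bld",
    "hidden_build_directory": "/home/user/project/bld/.pipeline",
    "hidden_task_directory": "/home/user/project/bld/.tasks",
    "custom_templates": [],
    "_is_debug": False,
    "globals": {},
    "priority_scheduling": False,
    "priority_discount_factor": 0,
    "n_jobs": 4,
    "db": {
        "provider": "sqlite",
        "filename": "/home/user/project/bld/.pipeline/db.sql",
        "create_db": True,
    },
}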
Example #4
def execute_dag_parallelly(dag, env, config):
    n_jobs = config["n_jobs"]

    unfinished_tasks = _collect_unfinished_tasks(dag, env, config)

    padding = _compute_padding_to_prevent_task_description_from_moving(
        unfinished_tasks)

    scheduler = Scheduler(dag, unfinished_tasks, config["priority_scheduling"])
    submitted_tasks = {}

    with tqdm(
            total=len(unfinished_tasks),
            bar_format=TQDM_BAR_FORMAT,
    ) as t, ProcessPoolExecutor(n_jobs) as executor:
        while scheduler.are_tasks_left:
            # Add new tasks to the queue.
            n_proposals = (n_jobs - sum(not task.done()
                                        for task in submitted_tasks.values())
                           if config["priority_scheduling"] else -1)
            proposals = scheduler.propose(n_proposals)

            for id_ in ensure_list(proposals):
                save_hashes_of_task_dependencies(id_, env, dag, config)

                path = _preprocess_task(id_, dag, env, config)

                future = executor.submit(_execute_task, id_, path, config)
                future.add_done_callback(lambda x: t.update())
                submitted_tasks[id_] = future

                t.set_description(id_.ljust(padding))

            # Evaluate finished tasks.
            newly_finished_tasks = {
                id_
                for id_, task in submitted_tasks.items() if task.done()
            }

            # Check finished tasks for exceptions. Restricting the check to
            # tasks that are already done avoids blocking on future.exception(),
            # which waits until a future completes.
            exceptions = [
                str(submitted_tasks[id_].exception())
                for id_ in newly_finished_tasks
                if submitted_tasks[id_].exception()
            ]
            if exceptions:
                raise TaskError("\n\n".join(exceptions))

            for id_ in newly_finished_tasks:
                _process_task_targets(id_, dag)
                del submitted_tasks[id_]

            scheduler.process_finished(newly_finished_tasks)

            # A little bit of sleep time to wait for tasks to finish.
            time.sleep(0.1)
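_execute_task is not shown here. A plausible stand-in, assuming the rendered script is simply run in a subprocess with the interpreter chosen by file extension, could look like this (name and behaviour are assumptions):

import subprocess


def execute_task(id_, path, config):
    """Illustrative stand-in: run the rendered task script in a subprocess."""
    interpreter = "python" if str(path).endswith(".py") else "Rscript"
    subprocess.run([interpreter, str(path)], check=True)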
Example #5
def _check_missing_targets(id_, dag):
    targets = ensure_list(dag.nodes[id_]["produces"])
    missing_targets = [
        Path(target).as_posix() for target in targets
        if not Path(target).exists()
    ]
    if missing_targets:
        raise FileNotFoundError(
            f"Targets {missing_targets} were not produced by task '{id_}'."
        )

    save_hash_of_task_target(id_, dag)
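Assuming dag is a networkx.DiGraph, as the node-attribute access suggests, the missing-target check reduces to the following self-contained sketch (task name and paths are hypothetical):

from pathlib import Path

import networkx as nx

dag = nx.DiGraph()
dag.add_node("estimate", produces=["bld/estimates.csv", "bld/figure.png"])

missing_targets = [
    Path(target).as_posix()
    for target in dag.nodes["estimate"]["produces"]
    if not Path(target).exists()
]
# -> ["bld/estimates.csv", "bld/figure.png"] as long as neither file exists,
# in which case _check_missing_targets raises FileNotFoundError.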
Example #6
    def _create_task_dependency_dict(self, unfinished_tasks):
        """Create a task-dependency dictionary.

        For each unfinished task, this function collects the tasks which have to be
        executed in advance.

        """
        task_dict = {}
        for id_ in unfinished_tasks:
            task_dict[id_] = {
                preceding_task
                for dependency in ensure_list(self.dag.nodes[id_].get(
                    "depends_on", []))
                for preceding_task in self.dag.predecessors(dependency)
                if preceding_task in unfinished_tasks
            }

        return task_dict
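A small sketch of the resulting dictionary, assuming self.dag is a networkx.DiGraph in which task "a" produces a file that task "b" depends on (all names are made up):

import networkx as nx

dag = nx.DiGraph()
dag.add_node("a", produces="data.pkl")
dag.add_node("b", depends_on="data.pkl")
dag.add_edge("a", "data.pkl")
dag.add_edge("data.pkl", "b")

# With unfinished_tasks = {"a", "b"}, the dictionary built above would be
# {"a": set(), "b": {"a"}}: task "b" has to wait until "a" has run.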
Example #7
    def process_finished(self, finished_tasks):
        """Process finished tasks.

        The executor passes an id or a list of ids of finished tasks back to the
        scheduler. The scheduler removes the ids from the set of submitted tasks and
        removes the finished tasks from the dependency sets of all unfinished tasks in
        `task_dict`.

        Parameters
        ----------
        finished_tasks : str or list
            An id or a list of ids of finished tasks.

        """
        finished_tasks = ensure_list(finished_tasks)
        for id_ in finished_tasks:
            self.submitted_tasks.remove(id_)
            for id__ in self.task_dict:
                self.task_dict[id__].discard(id_)
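For illustration, a hypothetical scheduler state before and after one finished task is processed:

# Before: task "a" has been submitted, "b" and "c" are still waiting on it.
submitted_tasks = {"a"}
task_dict = {"b": {"a"}, "c": {"a", "b"}}

# process_finished("a") removes "a" from the submitted tasks and discards it
# from every dependency set, leaving:
#
# submitted_tasks == set()
# task_dict == {"b": set(), "c": {"b"}}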
Example #8
def _preprocess_task(id_, dag, env, config):
    file = render_task_template(id_, dag.nodes[id_], env, config)

    for target in ensure_list(dag.nodes[id_].get("produces", [])):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    if dag.nodes[id_]["template"].endswith(".py"):
        path = Path(config["hidden_task_directory"], id_ + ".py")
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(file)

    elif dag.nodes[id_]["template"].endswith(".r"):
        path = Path(config["hidden_task_directory"], id_ + ".r")
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(file)

    else:
        raise NotImplementedError("Only Python and R tasks are allowed.")

    return path
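render_task_template is not shown here. Under the assumption that env is a jinja2.Environment whose loader points at the configured template directories, a rough stand-in could be:

from jinja2 import Environment, FileSystemLoader


def render_task_template(id_, node, env, config):
    """Illustrative stand-in: fill the task's Jinja2 template with its node
    attributes and the global configuration."""
    template = env.get_template(node["template"])
    return template.render(**config["globals"], **node)


# A matching environment might be created roughly like this; the search paths
# are illustrative, the real code presumably uses config["custom_templates"].
env = Environment(loader=FileSystemLoader(["src", ".templates"]))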