def ignore_task(self, infiles, outfiles, params):
    """Return True if this task should be ignored.

    If the task is ignored, empty placeholder output file(s)
    are created so that downstream tasks are not blocked.
    """
    if self._ignore:
        m = str(outfiles)
        for ignore in IOTools.val2list(self._ignore):
            if ignore in m:
                E.warn("task {} will be ignored".format(self.__name__))
                for f in IOTools.val2list(outfiles):
                    E.info("creating empty file {}".format(f))
                    IOTools.touch_file(f)
                return True
    return False
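
The method above relies on external cgat-core style helpers (IOTools, E). Below is a minimal, self-contained sketch of the same ignore-and-touch pattern with stand-in helpers; the names here are illustrative, not the library's API:

import os

def val2list(value):
    # stand-in for IOTools.val2list: normalize a scalar or a list to a list
    return list(value) if isinstance(value, (list, tuple)) else [value]

def touch_file(filename):
    # stand-in for IOTools.touch_file: create an empty placeholder file
    with open(filename, "a"):
        pass

def should_ignore(outfiles, ignore_patterns):
    # a task is ignored if any pattern occurs in its output file names;
    # placeholder outputs are created so downstream tasks can still run
    m = str(outfiles)
    for pattern in val2list(ignore_patterns):
        if pattern in m:
            for f in val2list(outfiles):
                touch_file(f)
            return True
    return False

# "qc" matches the output name, so an empty file is created and True returned
print(should_ignore(["tool_qc.tsv"], ["qc"]))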
Example #2

def add_collations_to_pipeline(pipeline,
                               map_tool_to_runner,
                               collations,
                               tasks=None,
                               config=None,
                               **kwargs):
    """add collation tasks to a workflow pipeline.

    For each collation listed, a ruffus collate task is created
    that groups input files matching ``regex_in`` and writes
    output files following ``pattern_out``.
    """

    runners = []

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    for coll in collations:

        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(coll))

        coll_info = config[coll]

        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError("section {} is missing required keyword '{}'".format(
                    coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name", coll_info["runner"]).strip()

        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)

        # set the input regex and alias if configured (optional fields)
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])
        result_dir = coll + ".dir"

        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

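        # ruffus collate: group all upstream files matching regex_in
        # into a single job that writes `output`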
        metric_task = pipeline.collate(
            task_func=taskf,
            input=input_tasks,
            filter=filter_regex,
            output=output,
            **kwargs).mkdir(
                input_tasks,
                filter_regex,
                output_dir)

        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            # replace regex back-references (e.g. "\1") in the output
            # patterns with glob wildcards for the pass-through task
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(
                task_func=f,
                input=metric_task,
                output=output)

        runners.append(metric_task)

    return runners
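
A hedged usage sketch for the collation helper above. All names here (MetricRunner, the "metrics" section, the glob pattern) are illustrative assumptions; only the required keywords runner, regex_in and pattern_out come from the code itself:

import ruffus

# hypothetical runner class; real runners come from the task library
map_tool_to_runner = {"my_metric": MetricRunner}

config = {
    "setup": {},
    "input": {},
    # one configuration section per collation, named after it
    "metrics": {
        "runner": "my_metric",
        "glob": "*.dir/*.tsv",                 # used when tasks is None
        "regex_in": r"(\S+).dir/(\S+).tsv",    # groups upstream outputs
        "pattern_out": r"\2.summary",          # output name per group
    },
}

pipeline = ruffus.Pipeline(name="benchmark")
runners = add_collations_to_pipeline(
    pipeline, map_tool_to_runner,
    collations=["metrics"],
    config=config)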
Example #3

def add_tools_to_pipeline(pipeline,
                          map_tool_to_runner,
                          config=None,
                          input_files=None,
                          **kwargs):
    """add tools to a workflow pipeline.

    This function adds one task to the workflow for each
    combination of input files and tool.

    The configuration dictionary should contain the following
    sections:

    input:
       Configuration of input files. Key/value pairs and possibly
       hierarchical.

       The following keys are optional:
          regex
          alias
          group_regex
          group_alias

    tool:
       A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"}, "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner : dict
        Dictionary mapping tool names to runner functions in the
        :ref:`TaskLibrary`.
    config : dict
        Configuration dictionary.
    input_files : list
        Optional list of input files.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")

    if config["input"] is None:
        raise ValueError("input section is empty")

    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")

    is_test = "is_test" in config

    # update selected fields for testing purposes
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

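    # expand glob expressions such as "*.bam" in the input section
    # into concrete lists of input files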
    config_files = expand_globs(config["input"], is_test=is_test)

    if input_group_regex:
        config_files = group_files(config_files,
                                   input_group_regex,
                                   input_group_alias)

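    # one task will be created below for every tool/input combination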
    input_combos = build_combinations(config_files)
    tool_runners = []

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)

        taskf.register_input(input_files,
                             regex=input_regex,
                             alias=input_alias,
                             make_unique=make_unique,
                             is_test=is_test)

        if "name" in input_files:
            # create copy of input_files without name, do
            # not modify original as different tools require
            # the 'name'
            input_files = {x: y for x, y in input_files.items()
                           if x != "name"}

        result_dir = taskf.__name__ + ".dir"

        found = False

        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

        output, multiple_outputs, flexible_outputs, _suffix = \
            build_output(taskf, result_dir)
        if suffix is None:
            suffix = _suffix
        elif suffix != _suffix:
            raise ValueError(
                "tools produce output files of different type, "
                "got {}, expected {}".format(_suffix, suffix))

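        # ruffus merge: run this tool once over all of its input files;
        # mkdir creates the result directory before the task runs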
        tool_task = pipeline.merge(
            task_func=taskf,
            input=list(input_files.values()),
            output=output,
            **kwargs).mkdir(result_dir)

        # if there are multiple output files, split the task so that
        # each output file will be processed separately further down the
        # pipeline.
        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_split"
            tool_task = pipeline.split(
                task_func=f,
                input=tool_task,
                output=output)

        tool_runners.append(tool_task)

    # convenience target
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f,
                   input=tool_runners,
                   output=None)

    return suffix, tool_runners
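
A hedged end-to-end sketch mirroring the configuration shape from the docstring; the runner classes here are hypothetical stand-ins:

import ruffus

# hypothetical runner factories keyed by tool name
map_tool_to_runner = {"bwa_mem": BwaMemRunner, "isaac": IsaacRunner}

config = {
    "setup": {"ignore": ["isaac"]},   # tasks whose result dir matches are skipped
    "input": {"bam": "*.bam"},        # glob expressions, as in the docstring
    "tool": ["bwa_mem", "isaac"],
}

pipeline = ruffus.Pipeline(name="benchmark")
suffix, tool_runners = add_tools_to_pipeline(
    pipeline, map_tool_to_runner, config=config)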