Example #1: check_unique
def check_unique(tool_functions,
                 input_combos=None,
                 input_regex=None,
                 input_alias=None,
                 is_test=False):
    # compute a list of task names
    names = []
    if input_combos:
        for toolf, input_files in itertools.product(tool_functions,
                                                    input_combos):
            taskf = copy.copy(toolf)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)
    else:
        for toolf in tool_functions:
            taskf = copy.copy(toolf)
            taskf.register_input(regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)

    # count how often each generated task name occurs; a single
    # duplicate is enough to require hash-based name mangling
    counts = collections.Counter(names)
    for name, count in list(counts.items()):
        if count > 1:
            make_unique = True
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: {}={}".format(name, count))
            break
    else:
        make_unique = False

    return make_unique
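All four examples assume the surrounding module's imports (itertools, copy, collections, os, re, ruffus) and helpers (P, IOTools, EmptyRunner and the build_*/expand_* utilities). The duplicate check above reduces to counting the generated task names; a minimal, self-contained sketch of that idea, with made-up names standing in for those produced by register_input():

import collections

# hypothetical task names as register_input() might generate them
names = ["fastqc_sample1", "fastqc_sample1", "bwa_mem_sample2"]

counts = collections.Counter(names)
# a hash suffix is only needed if at least one name occurs more than once
make_unique = any(count > 1 for count in counts.values())
print(make_unique)  # True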
Example #2: add_metrics_to_pipeline
def add_metrics_to_pipeline(pipeline,
                            metrics,
                            map_metric_to_runner,
                            tool_runners,
                            suffix="tsv",
                            prefix=None,
                            config=None,
                            **kwargs):

    single_input_metric_functions = []

    # instantiate one metric runner per metric and per expanded
    # configuration combination
    for metric in metrics:
        metricc = map_metric_to_runner[metric.strip()]
        if metricc.name in config:
            conf = config[metricc.name]
        else:
            conf = {}

        conf = expand_generators(conf)
        configurations = build_combinations(conf)
        for configuration in configurations:
            single_input_metric_functions.append(metricc(**configuration))

    make_unique = check_unique(single_input_metric_functions,
                               input_combos=None,
                               input_regex=None,
                               input_alias=None)

    metric_runners = []
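    # wire every metric runner into the pipeline as a ruffus transform task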
    for taskf in single_input_metric_functions:

        ignore = config.get(taskf.name, {}).get("ignore", [])
        taskf.register_input(make_unique=make_unique)
        unique_name = taskf.__name__

        # make task name unique by adding 'prefix' as this method might
        # be called multiple times for straight, collated and split tasks
        if prefix:
            taskf.__name__ = prefix + taskf.__name__

        filter_regex = ruffus.regex("(.*)/(.*).{}".format(suffix))
        result_dir = os.path.join(unique_name + ".dir")
        output = r"\1/{}/{}.tsv".format(result_dir, taskf.name)

        found = False
        # Note that 'ignore' will only match the static parts of a task,
        # as result_dir contains a pattern that is filled in at runtime.
        for i in ignore:
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True

        if found:
            continue

        metric_task = pipeline.transform(
            task_func=taskf,
            input=tool_runners,
            filter=filter_regex,
            output=output,
            **kwargs)

        metric_runners.append(metric_task)

    f = EmptyRunner()
    if prefix:
        f.__name__ = prefix + "metrics"
    else:
        f.__name__ = "metrics"
    pipeline.merge(task_func=f,
                   input=metric_runners,
                   output=None)

    return metric_runners
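An illustrative call; this is not runnable as-is, since map_metric_to_runner and tool_runners stand in for objects supplied elsewhere by the task library, and the metric name and its configuration section are hypothetical:

import ruffus

pipeline = ruffus.Pipeline("metrics_example")

# hypothetical configuration, keyed by the runner's name attribute
config = {"flagstat_metric": {"option": "value"}}

metric_runners = add_metrics_to_pipeline(
    pipeline,
    metrics=["flagstat_metric"],
    map_metric_to_runner=map_metric_to_runner,  # supplied by the task library
    tool_runners=tool_runners,                  # e.g. tasks returned by add_tools_to_pipeline
    suffix="tsv",
    config=config)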
Example #3: add_collations_to_pipeline
def add_collations_to_pipeline(pipeline,
                               map_tool_to_runner,
                               collations,
                               tasks=None,
                               config=None,
                               **kwargs):

    runners = []

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    # add one collate task per collation section of the configuration
    for coll in collations:

        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(coll))

        coll_info = config[coll]

        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError("section {} is missing required keyword '{}'".format(
                    coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name", coll_info["runner"]).strip()

        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)

        # set the input regex and alias from the collation section, if given
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])
        result_dir = os.path.join(coll + ".dir")

        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.collate(
            task_func=taskf,
            input=input_tasks,
            filter=filter_regex,
            output=output,
            **kwargs).mkdir(
                input_tasks,
                filter_regex,
                output_dir)

        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(
                task_func=f,
                input=metric_task,
                output=output)

        runners.append(metric_task)

    return runners
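A hypothetical configuration section that satisfies the keyword check above (runner, regex_in, pattern_out); every name is illustrative, and pipeline, map_tool_to_runner and tool_runners are placeholders from the surrounding workflow:

config = {
    "setup": {},
    "input": {},
    # one section per entry passed in `collations`
    "merge_counts": {
        "runner": "table_merger",       # key into map_tool_to_runner
        "regex_in": r"(.*)/(.*)\.tsv",  # ruffus regex applied to the upstream outputs
        "pattern_out": "merged.tsv",    # output filename inside merge_counts.dir
    },
}

runners = add_collations_to_pipeline(
    pipeline,
    map_tool_to_runner=map_tool_to_runner,
    collations=["merge_counts"],
    tasks=tool_runners,
    config=config)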
Example #4: add_tools_to_pipeline
def add_tools_to_pipeline(pipeline,
                          map_tool_to_runner,
                          config=None,
                          input_files=None,
                          **kwargs):
    """add tools to a workflow pipeline.

    For each combination of input files and tool, this function
    adds a task to the workflow.

    The configuration dictionary should contain the following
    sections:

    input:
       Configuration of input files. Key/value pairs and possibly
       hierarchical.

       The following keys are optional:
          regex
          alias
          group_regex
          group_alias

    tool:
       A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"}, "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner: dict
        Dictionary mapping tools to functions in the
        :ref:`TaskLibrary`.
    config: dict
        Configuration dictionary.
    input_files: list
        List of (optional) input files.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")

    if config["input"] is None:
        raise ValueError("input section is empty")

    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")

    is_test = "is_test" in config

    # update selected fields for testing purposes
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

    config_files = expand_globs(config["input"], is_test=is_test)

    if input_group_regex:
        config_files = group_files(config_files,
                                   input_group_regex,
                                   input_group_alias)

    input_combos = build_combinations(config_files)
    tool_runners = []

    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None

    # add one task per (tool, input combination) pair
    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)

        taskf.register_input(input_files,
                             regex=input_regex,
                             alias=input_alias,
                             make_unique=make_unique,
                             is_test=is_test)

        if "name" in input_files:
            # create copy of input_files without name, do
            # not modify original as different tools require
            # the 'name'
            input_files = dict([(x, y) for x, y in list(input_files.items())
                                if x != "name"])

        result_dir = os.path.join(taskf.__name__ + ".dir")

        found = False

        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

        output, multiple_outputs, flexible_outputs, _suffix = \
            build_output(taskf, result_dir)
        if suffix is None:
            suffix = _suffix
        elif suffix != _suffix:
            raise ValueError(
                "tools produce output files of different type, "
                "got {}, expected {}".format(_suffix, suffix))

        tool_task = pipeline.merge(
            task_func=taskf,
            input=list(input_files.values()),
            output=output,
            **kwargs).mkdir(result_dir)

        # if there are multiple output files, split the task so that
        # each output file will be processed separately further down the
        # pipeline.
        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_split"
            tool_task = pipeline.split(
                task_func=f,
                input=tool_task,
                output=output)

        tool_runners.append(tool_task)

    # convenience target
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f,
                   input=tool_runners,
                   output=None)

    return suffix, tool_runners
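A compact end-to-end sketch putting the examples together; the pipeline name and configuration are placeholders that follow the layout described in the docstring above, and map_tool_to_runner comes from the task library:

import ruffus

pipeline = ruffus.Pipeline("daisy_example")

config = {
    "setup": {},
    "input": {"bam": "*.bam"},
    "tool": ["bwa_mem"],
}

suffix, tool_runners = add_tools_to_pipeline(
    pipeline,
    map_tool_to_runner,  # supplied by the task library
    config=config)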