import collections
import copy
import itertools
import os
import re

import ruffus

# The helpers used below (P, IOTools, EmptyRunner, build_combinations,
# build_output, build_tool_functions, expand_generators, expand_globs,
# group_files) are assumed to be provided by the enclosing package.


def check_unique(tool_functions, input_combos=None, input_regex=None,
                 input_alias=None, is_test=False):
    """return True if the task names derived from `tool_functions` and
    `input_combos` collide, in which case a hash identifier needs to be
    added to make names unique."""
    # compute a list of task names
    names = []
    if input_combos:
        for toolf, input_files in itertools.product(tool_functions,
                                                    input_combos):
            taskf = copy.copy(toolf)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)
    else:
        for toolf in tool_functions:
            taskf = copy.copy(toolf)
            taskf.register_input(regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)

    counts = collections.Counter(names)
    for name, count in list(counts.items()):
        if count > 1:
            make_unique = True
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: "
                "{}={}".format(name, count))
            break
    else:
        make_unique = False
    return make_unique
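# Illustrative sketch (hypothetical runner, not part of the library):
# two runners that resolve to the same task name force
# make_unique=True, so downstream code appends a hash to disambiguate.
#
#     class FakeRunner:
#         def register_input(self, *args, **kwargs):
#             self.__name__ = "fastqc"      # name ignores the input
#
#     assert check_unique([FakeRunner(), FakeRunner()]) is True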
def add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner,
                            tool_runners, suffix="tsv", prefix=None,
                            config=None, **kwargs):
    """add metric tasks to a workflow pipeline, one task per
    metric/configuration combination applied to `tool_runners`."""
    single_input_metric_functions = []
    for metric in metrics:
        metricc = map_metric_to_runner[metric.strip()]
        if metricc.name in config:
            conf = config[metricc.name]
        else:
            conf = {}
        conf = expand_generators(conf)
        configurations = build_combinations(conf)
        for configuration in configurations:
            single_input_metric_functions.append(metricc(**configuration))

    make_unique = check_unique(single_input_metric_functions,
                               input_combos=None,
                               input_regex=None,
                               input_alias=None)

    metric_runners = []
    for taskf in single_input_metric_functions:
        ignore = config.get(taskf.name, {}).get("ignore", [])
        taskf.register_input(make_unique=make_unique)
        unique_name = taskf.__name__
        # make the task name unique by adding 'prefix', as this method
        # might be called multiple times for straight, collated and
        # split tasks
        if prefix:
            taskf.__name__ = prefix + taskf.__name__

        filter_regex = ruffus.regex("(.*)/(.*).{}".format(suffix))
        result_dir = os.path.join(unique_name + ".dir")
        output = r"\1/{}/{}.tsv".format(result_dir, taskf.name)

        found = False
        # note that 'ignore' only works on the static parts of a task,
        # as result_dir contains a pattern that will be filled in at
        # run time.
        for i in ignore:
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.transform(
            task_func=taskf,
            input=tool_runners,
            filter=filter_regex,
            output=output,
            **kwargs)

        metric_runners.append(metric_task)

    # convenience target collecting all metric tasks
    f = EmptyRunner()
    if prefix:
        f.__name__ = prefix + "metrics"
    else:
        f.__name__ = "metrics"
    pipeline.merge(task_func=f, input=metric_runners, output=None)

    return metric_runners
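# Illustrative sketch (assumed shape, not part of the library): a
# metric section as add_metrics_to_pipeline() reads it. The metric name
# "bam_flagstats" and its option are hypothetical; the section keyed by
# the runner's name supplies keyword arguments for the runner (list
# values are expanded into configurations by build_combinations) and
# may carry an 'ignore' list, per the lookups in the code above.
EXAMPLE_METRIC_CONFIG = {
    "bam_flagstats": {
        "ignore": [],                     # substrings of task dirs to skip
        "min_mapping_quality": [0, 10],   # two configurations, one task each
    },
}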
def add_collations_to_pipeline(pipeline, map_tool_to_runner, collations,
                               tasks=None, config=None, **kwargs):
    """add collation tasks to a workflow pipeline. Each collation
    groups the outputs of upstream tasks and applies a runner to each
    group."""
    runners = []
    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    for coll in collations:
        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(coll))
        coll_info = config[coll]
        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError(
                    "section {} is missing required keyword '{}'".format(
                        coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name", coll_info["runner"]).strip()
        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)
        # set the alias automatically through a regex; both fields are
        # optional
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or a glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])

        result_dir = os.path.join(coll + ".dir")
        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.collate(
            task_func=taskf,
            input=input_tasks,
            filter=filter_regex,
            output=output,
            **kwargs).mkdir(input_tasks, filter_regex, output_dir)

        if multiple_outputs:
            # add a pass-through task so that downstream tasks see the
            # individual output files
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(
                task_func=f,
                input=metric_task,
                output=output)

        runners.append(metric_task)

    return runners
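# Illustrative sketch (assumed shape): a collation section as
# add_collations_to_pipeline() requires it. 'runner', 'regex_in' and
# 'pattern_out' are the mandatory keywords checked above; 'setup' and
# 'input' sections must exist because their 'ignore' lists are read
# directly. The runner name "plot_coverage" is hypothetical.
EXAMPLE_COLLATION_CONFIG = {
    "setup": {"ignore": []},
    "input": {"ignore": []},
    "coverage_summary": {
        "runner": "plot_coverage",
        "regex_in": r"(.*)/(.*).tsv",
        "pattern_out": r"\2.summary",
    },
}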
def add_tools_to_pipeline(pipeline, map_tool_to_runner, config=None,
                          input_files=None, **kwargs):
    """add tools to a workflow pipeline.

    For each combination of input and tool, this function adds a task
    to the workflow.

    The configuration dictionary should contain the following sections:

    input:
        Configuration of input files. Key/value pairs, possibly
        hierarchical. The following keys are optional: `regex`,
        `alias`, `group_regex`, `group_alias`.

    tool:
        A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"},
         "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner : dict
        Dictionary mapping tools to functions in the :ref:`TaskLibrary`.
    config : dict
        Configuration dictionary.
    input_files : list
        List of (optional) input files.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")
    if config["input"] is None:
        raise ValueError("input section is empty")

    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")
    is_test = "is_test" in config

    # update selected fields for testing purposes
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

    config_files = expand_globs(config["input"], is_test=is_test)
    if input_group_regex:
        config_files = group_files(config_files,
                                   input_group_regex,
                                   input_group_alias)

    input_combos = build_combinations(config_files)

    tool_runners = []
    ignore = config["setup"].get("ignore", [])
    ignore.extend(config["input"].get("ignore", []))

    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None
    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):
        # create a copy of the task function and give it a unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files,
                             regex=input_regex,
                             alias=input_alias,
                             make_unique=make_unique,
                             is_test=is_test)

        if "name" in input_files:
            # create a copy of input_files without 'name'; do not
            # modify the original, as different tools require the
            # 'name' field
            input_files = dict([(x, y) for x, y in list(input_files.items())
                                if x != "name"])

        result_dir = os.path.join(taskf.__name__ + ".dir")

        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(result_dir, i))
                found = True
        if found:
            continue

        output, multiple_outputs, flexible_outputs, _suffix = \
            build_output(taskf, result_dir)

        if suffix is None:
            suffix = _suffix
        elif suffix != _suffix:
            raise ValueError(
                "tools produce output files of different type, "
                "got {}, expected {}".format(_suffix, suffix))

        tool_task = pipeline.merge(
            task_func=taskf,
            input=list(input_files.values()),
            output=output,
            **kwargs).mkdir(result_dir)

        # if there are multiple output files, split the task so that
        # each output file will be processed separately further down
        # the pipeline.
        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_split"
            tool_task = pipeline.split(
                task_func=f,
                input=tool_task,
                output=output)

        tool_runners.append(tool_task)

    # convenience target collecting all tool tasks
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f, input=tool_runners, output=None)

    return suffix, tool_runners
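# Illustrative sketch (hypothetical names, not part of the library):
# wiring the tool stage into a ruffus pipeline. "bwa_mem" must be a key
# in map_tool_to_runner; the config mirrors the example in the
# docstring above, plus the 'setup' section the code reads.
#
#     pipeline = ruffus.Pipeline("benchmark")
#     config = {"setup": {},
#               "input": {"bam": "*.bam"},
#               "tool": ["bwa_mem"]}
#     suffix, tool_runners = add_tools_to_pipeline(
#         pipeline, map_tool_to_runner, config=config)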