def expand_generators(config):
    """Expand generator expressions in option lists.

    A generator expression is valid python syntax and has the
    following syntax::

        options: generate=["--chrom={}".format(x) for x in [1,2,3,4,5]]

    Arguments
    ---------
    config : dict
        Nested configuration dictionary. Modified in place.

    Returns
    -------
    config : dict
        The configuration with generator expressions replaced by the
        lists they evaluate to.

    Raises
    ------
    ValueError
        If a generator expression is not valid python syntax.
    """
    to_delete = []
    for d, key, value in IOTools.nested_iter(config):
        if not isinstance(value, str):
            continue
        if not value.startswith("generate="):
            continue
        expression = re.sub(r"^generate=\s*", "", value)
        # Strip a single pair of enclosing quotes, if present.
        # Bug fix: the original tested startswith("'") twice instead
        # of startswith/endswith, so a trailing character was stripped
        # even when there was no closing quote.
        if expression.startswith("'") and expression.endswith("'"):
            expression = expression[1:-1]
        try:
            # SECURITY: eval() executes arbitrary code taken from the
            # configuration file - only use with trusted config files.
            argument_list = eval(expression)
        except SyntaxError as ex:
            raise ValueError(
                "error occurred while evaluating generator "
                "expression {}: {}".format(expression, ex)) from ex
        if isinstance(d, list):
            # Lists are extended in place; the original entry is
            # removed afterwards. Deletion is deferred so the ongoing
            # iteration is not disturbed.
            d.extend(argument_list)
            to_delete.append((d, key))
        else:
            d[key] = argument_list

    # Delete in reverse order so earlier list indices stay valid.
    for d, key in to_delete[::-1]:
        del d[key]
    return config
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected starting with 'find'. These
    expressions will be evaluated in a shell and the results inserted
    into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression is evaluated to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files will
    be returned called "test1" and "test2".

    Arguments
    ---------
    config : dict
        Nested configuration dictionary. Modified in place.
    is_test : bool
        If True, expressions that expand to nothing are replaced by
        placeholder test filenames instead of raising.

    Returns
    -------
    config : dict
        The modified configuration.
    """
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            # branch 1: a shell "find" command
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    # NOTE(review): assumes the exception exposes the
                    # captured output via an ``output`` attribute
                    # (subprocess-style); other exception types would
                    # raise AttributeError here - confirm.
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            # branch 2: one or more comma-separated glob patterns
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    # flatten the list of per-pattern match lists
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            # branch 3: read filenames from one or more list files
            elif value.startswith("file="):
                filenames = [x.strip() for x in
                             value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        # keep non-empty, stripped lines only
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths
            # fall-through check: applies to whatever the branches left
            # in d[key]; for a plain non-matching string this is only
            # triggered when the string itself is empty.
            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions; keep only the first
                        # comma-separated pattern
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        # pick placeholder names matching the apparent
                        # file type of the original expression
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
def resolve_argument(argument, sep=","):
    """if argument is a container type (dict, list, tuple) resolve
    its contents to comma-separated list.

    Arguments
    ---------
    argument : object
        A dict (must contain exactly one entry), list, tuple or
        string. A string containing "filelist" is treated as a
        filename whose non-comment lines are the values.
    sep : str
        Separator used to join the resolved values.

    Returns
    -------
    str
        The joined values, or ``argument`` unchanged if it is a
        plain string.

    Raises
    ------
    ValueError
        If a dictionary with more than one entry is passed.
    """
    if isinstance(argument, dict):
        if len(argument) != 1:
            raise ValueError(
                "expected a single entry dictionary, got '{}'".format(
                    argument))
        # nested_iter yields (container, key, value); join the values
        return sep.join(x[2] for x in IOTools.nested_iter(argument))
    elif isinstance(argument, (list, tuple)):
        return sep.join(argument)
    # special treatment for output from run_collate_link_output
    elif "filelist" in argument:
        # bug fix: use a context manager so the file handle is closed
        # (the original leaked it)
        with IOTools.open_file(argument) as inf:
            f = [x.strip() for x in inf.readlines()
                 if not x.startswith("#")]
        return sep.join([x for x in f if x])
    return argument
def __call__(self, infiles, outfile, only_info=False):
    """Run the task on ``infiles`` producing ``outfile``.

    Saves meta information for ``outfile``, then builds the task
    parameters and executes :meth:`run`, recording the returned
    benchmark data.

    Note: the ``only_info`` parameter is ignored - its value is
    overridden from the global configuration below.
    """
    # NOTE: extras not implemented in ruffus 2.6.3, thus
    # use parameter:
    only_info = "only_info" in P.PARAMS
    if self.mountpoint:
        # revert mount redirection for arvados to allow redirection
        # on individual cluster nodes
        # NOTE(review): self.mountpoint is used as a regex pattern by
        # re.sub; regex metacharacters in the path would misbehave -
        # confirm mountpoints are plain paths.
        for d, key, value in IOTools.nested_iter(infiles):
            d[key] = re.sub(self.mountpoint, "arv=", value)
    self.instantiate_input(infiles)
    # persist meta information before any actual work is done
    self.save_meta(outfile, output_file=outfile)
    if only_info:
        # dry pass: only the meta information was refreshed
        E.warn("only_info - meta information has been updated")
        return
    params = self.build_params(output_file=outfile)
    benchmark = self.run(outfile, as_namedtuple(params))
    self.save_benchmark(outfile, benchmark)
def collect_file_meta_information(file_dict, nostats=False):
    """collect meta information on files

    Arg:
       file_dict(dict) : nested dictionary; leaf values are filenames
           or comma-separated strings of filenames.
       nostats(bool) : if True, skip filesystem access and report
           zeros for size and time stamps.

    Returns:
       info(list) : list of ordered dictionaries with keys
           "path", "abspath", "size", "modification_time",
           "creation_time".

    Raises:
       OSError : if a file can not be stat'ed (and nostats is False).
    """
    results = []
    for d, key, filenames in IOTools.nested_iter(file_dict):
        if filenames is None:
            continue
        if isinstance(filenames, str):
            filenames = filenames.split(",")
        filenames = [x.strip() for x in filenames]
        for filename in filenames:
            abspath = os.path.realpath(filename)
            if nostats:
                st_size, st_mtime, st_ctime = 0, 0, 0
            else:
                # Bug fix: stat once (EAFP) instead of the original
                # exists(abspath)+stat(filename) pair, which was racy
                # and inconsistently mixed abspath and filename.
                try:
                    s = os.stat(filename)
                except OSError as ex:
                    raise OSError(
                        "file {} does not exist".format(filename)) from ex
                st_size, st_mtime, st_ctime = (
                    s.st_size, s.st_mtime, s.st_ctime)
            results.append(
                collections.OrderedDict(
                    zip(("path", "abspath", "size",
                         "modification_time", "creation_time"),
                        (filename, abspath,
                         st_size, st_mtime, st_ctime))))
    return results
def main(argv=sys.argv):
    """Match existing result directories against the current
    benchmark configuration and rename (or symlink) them to the
    directory names the current configuration would produce.

    NOTE(review): ``argv=sys.argv`` is evaluated once at import time;
    conventional but worth confirming this is intended.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l", "--link", dest="link", action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False,
                        link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    # collect meta information of existing result directories
    old_data, new_data = [], []
    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    # rebuild the task list the current configuration would produce
    tool_functions = Workflow.build_tool_functions(map_tool_to_runner,
                                                   config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    # index: (property, value) -> result directories having it
    map_property_to_dir = collections.defaultdict(list)
    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):
        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        # NOTE(review): os.path.join with a single argument is a
        # no-op; the basename call alone would suffice - confirm.
        result_dir = os.path.basename(
            os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        # collect candidate directories sharing properties with the
        # old directory; the best match is the most frequent candidate
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                # unhashable values (e.g. lists) can not be keys
                pass

        counts = collections.Counter(targets)
        # NOTE(review): max() raises ValueError when no candidate
        # matched at all (empty counts) - confirm this is acceptable.
        max_count = max(counts.values())
        max_count_items = [x for x, y in list(counts.items())
                           if y == max_count]

        # ambiguous match: skip rather than guess
        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write(
            "\t".join(map(str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError(
                "directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()