Exemple #1
0
def expand_generators(config):
    """expand generator expressions in option lists.

    A generator expression are valid python syntax and
    has the following syntax::

      options: generate=["--chrom={}".format(x) for x in [1,2,3,4,5]]

    """

    to_delete = []
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("generate="):
                expression = re.sub("^generate=\s*", "", value)
                if expression.startswith("'") and expression.startswith("'"):
                    expression = expression[1:-1]
                try:
                    argument_list = eval(expression)
                except SyntaxError as ex:
                    raise ValueError(
                        "error occured while evaluating generator "
                        "expression {}: {}".format(expression, ex))
                if isinstance(d, list):
                    d.extend(argument_list)
                    to_delete.append((d, key))
                else:
                    d[key] = argument_list

    for d, key in to_delete[::-1]:
        del d[key]

    return config
Exemple #2
0
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected starting with 'find'. These
    expressions will be evaluated in a shell and the results insterted
    into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression is evaluated to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files will be
    returned called "test1" and "test2".
    """

    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip() for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths
            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
def resolve_argument(argument, sep=","):
    """if argument is a container type (dict, list, tuple)
    resolve its contents to comma-separated list.
    """
    if isinstance(argument, dict):
        if len(argument) != 1:
            raise ValueError(
                "expected a single entry dictionary, got '{}'".format(
                    argument))
        return sep.join(x[2] for x in IOTools.nested_iter(argument))
    elif isinstance(argument, list) or isinstance(argument, tuple):
        return sep.join(argument)
    # special treatment for output from run_collate_link_output
    elif "filelist" in argument:
        f = [
            x.strip() for x in IOTools.open_file(argument).readlines()
            if not x.startswith("#")
        ]
        return sep.join([x for x in f if x])

    return argument
Exemple #4
0
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        if self.mountpoint:
            # revert mount redirection for arvados to allow redirection
            # on individual cluster nodes
            for d, key, value in IOTools.nested_iter(infiles):
                d[key] = re.sub(self.mountpoint, "arv=", value)

        self.instantiate_input(infiles)
        self.save_meta(outfile, output_file=outfile)

        if only_info:
            E.warn("only_info - meta information has been updated")
            return

        params = self.build_params(output_file=outfile)
        benchmark = self.run(outfile, as_namedtuple(params))
        self.save_benchmark(outfile, benchmark)
def collect_file_meta_information(file_dict, nostats=False):
    """collect meta information on files

    Arg:
       file_dict(dict) : nested dictionary

    Returns:
       info(list)
    """
    results = []

    for d, key, filenames in IOTools.nested_iter(file_dict):
        if filenames is None:
            continue

        if isinstance(filenames, str):
            filenames = filenames.split(",")

        filenames = [x.strip() for x in filenames]
        for filename in filenames:
            abspath = os.path.realpath(filename)
            if nostats:
                st_size, st_mtime, st_ctime = 0, 0, 0
            else:
                if not os.path.exists(abspath):
                    raise OSError("file {} does not exist".format(filename))
                s = os.stat(filename)
                st_size, st_mtime, st_ctime = s.st_size, s.st_mtime, s.st_ctime

            results.append(
                collections.OrderedDict(
                    list(
                        zip(("path", "abspath", "size", "modification_time",
                             "creation_time"), (filename, abspath, st_size,
                                                st_mtime, st_ctime)))))

    return results
Exemple #6
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()