def _do_test_builder__pop_option(setter):
    """Check that popping a non-fixed option removes it from the call.

    `setter` is invoked as setter(builder, key[, value], fixed=False) to
    register each option before "-size" is popped again.
    """
    cmd_builder = AtomicCmdBuilder("find")
    for option in (("-empty",), ("-size", "1"), ("-name", "*.txt")):
        setter(cmd_builder, *option, fixed=False)

    cmd_builder.pop_option("-size")

    remaining_call = ["find", "-empty", "-name", "*.txt"]
    assert_equal(cmd_builder.call, remaining_call)
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Set up a 'vcf_filter | bgzip' pipeline from `infile` to `outfile`.

    Every contig listed under regions["HomozygousContigs"] is passed to the
    filter via --homozygous-chromosome; `options` are applied to the filter
    command with apply_options.
    """
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_value("%(IN_VCF)s")
    for contig_name in regions["HomozygousContigs"]:
        filter_cmd.add_option("--homozygous-chromosome", contig_name)
    filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
    apply_options(filter_cmd, options)

    # bgzip reads the filter output from STDIN and writes the final file
    compress_cmd = AtomicCmdBuilder(
        ["bgzip"], IN_STDIN=filter_cmd, OUT_STDOUT=outfile
    )

    pipeline = ParallelCmds([filter_cmd.finalize(), compress_cmd.finalize()])
    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
        command=pipeline,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_file, algorithm="auto", options=None, dependencies=()):
    """Set up a MAFFT alignment of `input_file`, written to `output_file`.

    Arguments:
    input_file   -- FASTA file passed to the aligner as its last argument.
    output_file  -- Destination of the command's STDOUT.
    algorithm    -- Key into _PRESETS (lower-cased before lookup); raises
                    KeyError for unknown presets.
    options      -- Optional mapping applied to the command via
                    apply_options; defaults to no extra options.
    dependencies -- Forwarded unchanged to CommandNode.__init__.
    """
    # A None sentinel avoids sharing one mutable dict between all calls
    if options is None:
        options = {}

    command = AtomicCmdBuilder(
        _PRESETS[algorithm.lower()] + ["%(IN_FASTA)s"],
        IN_FASTA=input_file,
        OUT_STDOUT=output_file,
        CHECK_VERSION=MAFFT_VERSION,
    )
    apply_options(command, options)

    self._output_file = output_file

    CommandNode.__init__(
        self,
        command=command.finalize(),
        description="<MAFFTNode (%s): '%s' -> '%s'>"
        % (algorithm, input_file, output_file),
        dependencies=dependencies,
    )
def customize(self, config, reference, input_files, output_file, directory, dependencies=()):
    """Build the 'mapDamage --rescale-only' command and gather node parameters.

    The BAM is read from MultiBAMInputNode.PIPE_FILE, and every file in
    `input_files` is registered on the builder via add_multiple_kwargs.
    Returns a dict with the (unfinalized) builder and the given arguments.
    """
    input_files = safe_coerce_to_tuple(input_files)

    call = [
        "mapDamage",
        "--rescale-only",
        "-i", "%(TEMP_IN_BAM)s",
        "-d", "%(TEMP_DIR)s",
        "-r", "%(IN_REFERENCE)s",
        "--rescale-out", "%(OUT_BAM)s",
    ]
    command = AtomicCmdBuilder(
        call,
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV="Stats_out_MCMC_correct_prob.csv",
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION,
    )
    command.add_multiple_kwargs(input_files)

    return dict(
        command=command,
        config=config,
        input_files=input_files,
        directory=directory,
        dependencies=dependencies,
    )
def test_builder__set__kwargs__overwriting():
    """Re-setting an existing kwarg must raise and leave the kwargs untouched."""
    cmd_builder = AtomicCmdBuilder("echo")
    cmd_builder.set_kwargs(IN_PATH="/a/b/")

    assert_raises(
        AtomicCmdBuilderError, cmd_builder.set_kwargs, IN_PATH="/dst/file"
    )
    assert_equal(cmd_builder.kwargs, {"IN_PATH": "/a/b/"})
def customize(self, config, reference, input_files, output_file, directory, dependencies=()):
    """Prepare the mapDamage rescaling command and return the node parameters.

    `input_files` is coerced to a tuple and registered on the builder with
    add_multiple_kwargs; mapDamage itself reads the BAM from
    MultiBAMInputNode.PIPE_FILE and writes the rescaled BAM to `output_file`.
    """
    input_files = safe_coerce_to_tuple(input_files)

    file_kwargs = {
        "TEMP_IN_BAM": MultiBAMInputNode.PIPE_FILE,
        "IN_REFERENCE": reference,
        "TEMP_OUT_LOG": "Runtime_log.txt",
        "TEMP_OUT_CSV": "Stats_out_MCMC_correct_prob.csv",
        "OUT_BAM": output_file,
        "CHECK_VERSION": MAPDAMAGE_VERSION,
    }
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--rescale-only",
            "-i",
            "%(TEMP_IN_BAM)s",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
            "--rescale-out",
            "%(OUT_BAM)s",
        ],
        **file_kwargs
    )
    command.add_multiple_kwargs(input_files)

    parameters = {"command": command, "config": config}
    parameters["input_files"] = input_files
    parameters["directory"] = directory
    parameters["dependencies"] = dependencies
    return parameters
def _bowtie2_template(call, prefix, iotype="IN", **kwargs):
    """Return a builder for `call` with the Bowtie2 index files registered.

    One '<iotype>_PREFIX_<POSTFIX>' kwarg is set per index file, each
    pointing at '<prefix>.<postfix>'.
    """
    builder = AtomicCmdBuilder(call, **kwargs)

    index_postfixes = (
        "1.bt2",
        "2.bt2",
        "3.bt2",
        "4.bt2",
        "rev.1.bt2",
        "rev.2.bt2",
    )
    for ext in index_postfixes:
        index_file = {"%s_PREFIX_%s" % (iotype, ext.upper()): prefix + "." + ext}
        builder.set_kwargs(**index_file)

    return builder
def test_builder__set_kwargs__after_finalize():
    """set_kwargs must raise once the builder is finalized, changing nothing."""
    cmd_builder = AtomicCmdBuilder("echo")
    cmd_builder.set_kwargs(IN_PATH="/a/b/")
    cmd_builder.finalize()

    assert_raises(
        AtomicCmdBuilderError, cmd_builder.set_kwargs, OUT_PATH="/dst/file"
    )
    assert_equal(cmd_builder.kwargs, {"IN_PATH": "/a/b/"})
def customize(cls, input_file, output_file, algorithm="auto", dependencies=()):
    """Build the MAFFT command for the given preset and return node parameters.

    The preset call is looked up in _PRESETS by the lower-cased `algorithm`;
    the FASTA file is appended as a value and STDOUT goes to `output_file`.
    """
    preset_call = _PRESETS[algorithm.lower()]

    command = AtomicCmdBuilder(preset_call)
    command.add_value("%(IN_FASTA)s")
    command.set_kwargs(
        IN_FASTA=input_file,
        OUT_STDOUT=output_file,
        CHECK_VERSION=MAFFT_VERSION,
    )

    return dict(command=command, dependencies=dependencies)
def test_builder__add_multiple_kwargs_with_template():
    """add_multiple_kwargs with a template names kwargs from it; call is unchanged."""
    cmd_builder = AtomicCmdBuilder("ls")
    added = cmd_builder.add_multiple_kwargs(
        ("file_a", "file_b"), template="OUT_BAM_%i"
    )

    wanted = dict(OUT_BAM_1="file_a", OUT_BAM_2="file_b")
    assert_equal(added, wanted)
    assert_equal(cmd_builder.kwargs, wanted)
    assert_equal(cmd_builder.call, ["ls"])
def test_builder__add_multiple_kwargs():
    """add_multiple_kwargs registers IN_FILE_NN kwargs without touching the call."""
    cmd_builder = AtomicCmdBuilder("ls")
    added = cmd_builder.add_multiple_kwargs(("file_a", "file_b"))

    wanted = dict(IN_FILE_01="file_a", IN_FILE_02="file_b")
    assert_equal(added, wanted)
    assert_equal(cmd_builder.kwargs, wanted)
    assert_equal(cmd_builder.call, ["ls"])
def test_builder__add_multiple_values():
    """add_multiple_values registers kwargs and appends one value per file."""
    cmd_builder = AtomicCmdBuilder("ls")
    added = cmd_builder.add_multiple_values(("file_a", "file_b"))

    wanted = dict(IN_FILE_01="file_a", IN_FILE_02="file_b")
    assert_equal(added, wanted)
    assert_equal(cmd_builder.kwargs, wanted)

    wanted_call = ["ls", "%(IN_FILE_01)s", "%(IN_FILE_02)s"]
    assert_equal(cmd_builder.call, wanted_call)
def merge_bam_files_command(input_files):
    """Return a finalized 'samtools merge -u -' command over `input_files`.

    The merged output is sent to a pipe (AtomicCmd.PIPE) rather than a file.
    """
    builder = AtomicCmdBuilder(
        ["samtools", "merge", "-u", "-"],
        OUT_STDOUT=AtomicCmd.PIPE,
        CHECK_VERSION=SAMTOOLS_VERSION,
    )
    builder.add_multiple_values(input_files)

    merge_cmd = builder.finalize()
    return merge_cmd
def test_builder__add_multiple_values_with_template():
    """add_multiple_values with a template uses it for both kwargs and values."""
    cmd_builder = AtomicCmdBuilder("ls")
    added = cmd_builder.add_multiple_values(
        ("file_a", "file_b"), template="OUT_BAM_%i"
    )

    wanted = dict(OUT_BAM_1="file_a", OUT_BAM_2="file_b")
    assert_equal(added, wanted)
    assert_equal(cmd_builder.kwargs, wanted)

    wanted_call = ["ls", "%(OUT_BAM_1)s", "%(OUT_BAM_2)s"]
    assert_equal(cmd_builder.call, wanted_call)
def test_builder__add_multiple_options_with_sep():
    """add_multiple_options with sep joins key and value into one argument."""
    cmd_builder = AtomicCmdBuilder("ls")
    added = cmd_builder.add_multiple_options("-i", ("file_a", "file_b"), sep="=")

    wanted = dict(IN_FILE_01="file_a", IN_FILE_02="file_b")
    assert_equal(added, wanted)
    assert_equal(cmd_builder.kwargs, wanted)

    wanted_call = ["ls", "-i=%(IN_FILE_01)s", "-i=%(IN_FILE_02)s"]
    assert_equal(cmd_builder.call, wanted_call)
def test_builder__add_multiple_kwargs_multiple_times():
    """Repeated add_multiple_kwargs calls continue the IN_FILE_NN numbering."""
    cmd_builder = AtomicCmdBuilder("ls")

    first = cmd_builder.add_multiple_kwargs(("file_a",))
    assert_equal(first, {"IN_FILE_01": "file_a"})

    second = cmd_builder.add_multiple_kwargs(("file_b",))
    assert_equal(second, {"IN_FILE_02": "file_b"})

    combined = dict(IN_FILE_01="file_a", IN_FILE_02="file_b")
    assert_equal(cmd_builder.kwargs, combined)
    assert_equal(cmd_builder.call, ["ls"])
def test_builder__add_multiple_kwargs_multiple_times():
    """Each add_multiple_kwargs call returns only its own, newly numbered kwargs."""
    cmd_builder = AtomicCmdBuilder("ls")

    # Each step: (files added, kwargs expected back from that single call)
    steps = (
        (("file_a",), {"IN_FILE_01": "file_a"}),
        (("file_b",), {"IN_FILE_02": "file_b"}),
    )
    for filenames, wanted in steps:
        assert_equal(cmd_builder.add_multiple_kwargs(filenames), wanted)

    assert_equal(
        cmd_builder.kwargs, {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
    )
    assert_equal(cmd_builder.call, ["ls"])
def __init__(
    self,
    reference,
    input_files,
    output_directory,
    title="mapDamage",
    options=None,
    dependencies=(),
):
    """Set up 'samtools merge | mapDamage --no-stats' to plot damage patterns.

    Arguments:
    reference        -- Reference sequence passed to mapDamage via -r.
    input_files      -- BAM files merged and piped into mapDamage.
    output_directory -- Folder in which the output tables/plots are placed.
    title            -- Plot title passed via -t.
    options          -- Optional mapping applied to the mapDamage command via
                        apply_options; defaults to no extra options.
    dependencies     -- Forwarded unchanged to CommandNode.__init__.
    """
    # A None sentinel avoids sharing one mutable dict between all calls
    if options is None:
        options = {}

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t",
            title,
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(
            output_directory, "Fragmisincorporation_plot.pdf"
        ),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory, "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
    )
    apply_options(command, options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(merge.input_files), output_directory,),
        dependencies=dependencies,
    )
def test_builder__add_option__overwrite():
    """add_option appends every option, even when a key is repeated."""
    cmd_builder = AtomicCmdBuilder("find")
    for option in (("-name", "*.txt"), ("-or",), ("-name", "*.bat")):
        cmd_builder.add_option(*option)

    wanted_call = ["find", "-name", "*.txt", "-or", "-name", "*.bat"]
    assert_equal(cmd_builder.call, wanted_call)
def customize(self, config, reference, input_files, output_directory, title="mapDamage", dependencies=()):
    """Build the 'mapDamage --no-stats' plotting command and node parameters.

    The BAM is read from MultiBAMInputNode.PIPE_FILE; all output tables and
    plots are placed in `output_directory`. Returns a dict with the
    (unfinalized) builder, config, input files and dependencies.
    """
    input_files = safe_coerce_to_tuple(input_files)

    def _in_output_dir(filename):
        # All tracked output files live directly in the output directory
        return os.path.join(output_directory, filename)

    call = [
        "mapDamage",
        "--no-stats",
        # Prevent references with many contigs from using excessive
        # amounts of memory, at the cost of per-contig statistics:
        "--merge-reference-sequences",
        "-t", title,
        "-i", "%(TEMP_IN_BAM)s",
        "-d", "%(TEMP_DIR)s",
        "-r", "%(IN_REFERENCE)s",
    ]
    command = AtomicCmdBuilder(
        call,
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=_in_output_dir("3pGtoA_freq.txt"),
        OUT_FREQ_5p=_in_output_dir("5pCtoT_freq.txt"),
        OUT_COMP_USER=_in_output_dir("dnacomp.txt"),
        OUT_PLOT_FRAG=_in_output_dir("Fragmisincorporation_plot.pdf"),
        OUT_PLOT_LEN=_in_output_dir("Length_plot.pdf"),
        OUT_LENGTH=_in_output_dir("lgdistribution.txt"),
        OUT_MISINCORP=_in_output_dir("misincorporation.txt"),
        OUT_LOG=_in_output_dir("Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
    )
    command.add_multiple_kwargs(input_files)

    return dict(
        command=command,
        config=config,
        input_files=input_files,
        dependencies=dependencies,
    )
def _get_bwa_template(call, prefix, iotype="IN", **kwargs):
    """Return a builder for `call` with the BWA index files registered.

    One '<iotype>_PREFIX_<EXT>' kwarg is set per index file. BWA versions
    before 0.6.0 additionally get the 'rbwt', 'rpac' and 'rsa' files.
    """
    try:
        needs_reverse_index = BWA_VERSION.version < (0, 6, 0)
    except versions.VersionRequirementError:
        # Ignored here, handled elsewhere
        needs_reverse_index = False

    index_exts = ["amb", "ann", "bwt", "pac", "sa"]
    if needs_reverse_index:
        index_exts += ["rbwt", "rpac", "rsa"]

    builder = AtomicCmdBuilder(call, **kwargs)
    for ext in index_exts:
        index_file = {"%s_PREFIX_%s" % (iotype, ext.upper()): prefix + "." + ext}
        builder.set_kwargs(**index_file)

    return builder
def customize(self, reference, directory, dependencies=()):
    """Build the 'mapDamage --stats-only' command and return node parameters.

    Intermediate files are tracked as TEMP_OUT_* kwargs, while the MCMC
    results and 'dnacomp_genome.csv' are tracked as outputs in `directory`.
    """
    def _in_directory(filename):
        # Final output files live directly in the destination directory
        return os.path.join(directory, filename)

    call = [
        "mapDamage",
        "--stats-only",
        "-r", "%(IN_REFERENCE)s",
        "-d", "%(TEMP_DIR)s",
    ]
    command = AtomicCmdBuilder(
        call,
        IN_REFERENCE=reference,
        TEMP_OUT_FREQ_3p="3pGtoA_freq.txt",
        TEMP_OUT_FREQ_5p="5pCtoT_freq.txt",
        # NOTE(review): "******" looks like a masked/placeholder filename
        # rather than a real one -- confirm the intended value.
        TEMP_OUT_COMP_USER="******",
        TEMP_OUT_MISINCORP="misincorporation.txt",
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        OUT_COMP_GENOME=_in_directory("dnacomp_genome.csv"),
        OUT_MCMC_PROBS=_in_directory("Stats_out_MCMC_correct_prob.csv"),
        OUT_MCMC_HIST=_in_directory("Stats_out_MCMC_hist.pdf"),
        OUT_MCMC_ITER=_in_directory("Stats_out_MCMC_iter.csv"),
        OUT_MCMC_ITERSUM=_in_directory("Stats_out_MCMC_iter_summ_stat.csv"),
        OUT_MCMC_POSTPRED=_in_directory("Stats_out_MCMC_post_pred.pdf"),
        OUT_MCMC_TRACE=_in_directory("Stats_out_MCMC_trace.pdf"),
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
        CHECK_R_INLINE=rtools.requirement("inline"),
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_RCPP=rtools.requirement("Rcpp"),
        CHECK_R_GAM=rtools.requirement("gam"),
        CHECK_R_RCPPGSL=rtools.requirement("RcppGSL"),
    )

    return dict(command=command, dependencies=dependencies)
def _bowtie2_template(call, prefix, iotype="IN", **kwargs):
    """Return a builder for `call` that also tracks the Bowtie2 index files.

    One '<iotype>_PREFIX_<POSTFIX>' entry per index file is added to the
    keyword arguments before the builder is created.
    """
    index_postfixes = (
        "1.bt2",
        "2.bt2",
        "3.bt2",
        "4.bt2",
        "rev.1.bt2",
        "rev.2.bt2",
    )
    kwargs.update(
        ("%s_PREFIX_%s" % (iotype, ext.upper()), prefix + "." + ext)
        for ext in index_postfixes
    )

    return AtomicCmdBuilder(call, **kwargs)
def __init__(self, input_file, k_groups, output_root, samples=None, dependencies=()):
    """Set up an 'admixture' run on `input_file` for `k_groups` groups.

    Arguments:
    input_file   -- Path to the .bed file; the .bim and .fam files are
                    located by swapping the extension.
    k_groups     -- Number of groups; must be 2 or 3.
    output_root  -- Folder in which the .P, .Q and .log files are placed.
    samples      -- Optional mapping whose values are rows with a
                    'Group(<k_groups>)' column; the run is supervised if any
                    row has a value other than '-' in that column.
    dependencies -- Forwarded unchanged to CommandNode.__init__.
    """
    # Validate before the value is used to build the group column name
    assert k_groups in (2, 3), k_groups

    self._samples = samples
    self._input_file = input_file
    self._k_groups = k_groups

    group_key = "Group(%i)" % (self._k_groups,)
    # .values() (rather than the Python 2 only .itervalues()) works on both
    # Python 2 and Python 3 mappings
    self._supervised = bool(samples) and any(
        (row[group_key] != '-') for row in samples.values()
    )

    prefix = os.path.splitext(os.path.basename(input_file))[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    cmd = AtomicCmdBuilder(
        "admixture",
        IN_FILE_BED=input_file,
        IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
        IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
        TEMP_OUT_FILE_BED=prefix + ".bed",
        TEMP_OUT_FILE_BIM=prefix + ".bim",
        TEMP_OUT_FILE_FAM=prefix + ".fam",
        TEMP_OUT_FILE_POP=prefix + ".pop",
        OUT_P=output_prefix + ".P",
        OUT_Q=output_prefix + ".Q",
        OUT_STDOUT=output_prefix + ".log",
        CHECK_VERSION=ADMIXTURE_VERSION,
        set_cwd=True,
    )

    # Random seed for this run
    cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

    if self._supervised:
        cmd.set_option("--supervised")

    cmd.add_value("%(TEMP_OUT_FILE_BED)s")
    cmd.add_value(int(k_groups))

    CommandNode.__init__(
        self,
        description="<Admixture -> '%s.*''>" % (output_prefix,),
        command=cmd.finalize(),
        dependencies=dependencies,
    )
def _build_paleomix_command(*args, **kwargs):
    """Return an AtomicCmdBuilder for a regular 'paleomix ...' command.

    The call is the current interpreter followed by the paleomix main script
    and `args`; the script is also tracked as the AUX_PALEOMIX file.
    """
    main_script = paleomix.main.__file__
    call = (sys.executable, main_script) + args

    return AtomicCmdBuilder(call, AUX_PALEOMIX=main_script, **kwargs)
def _new_bwa_command(call, prefix, iotype="IN", **kwargs):
    """Return a builder for a BWA `call` with version check and index files.

    The prefix is validated with _check_bwa_prefix, after which CHECK_BWA and
    one '<iotype>_PREFIX_<EXT>' entry per index file are added to the kwargs.
    """
    _check_bwa_prefix(prefix)

    kwargs["CHECK_BWA"] = BWA_VERSION
    kwargs.update(
        ("%s_PREFIX_%s" % (iotype, ext.upper()), prefix + "." + ext)
        for ext in ("amb", "ann", "bwt", "pac", "sa")
    )

    return AtomicCmdBuilder(call, **kwargs)
def __init__(
    self,
    reference,
    input_files,
    output_file,
    directory,
    options=None,
    dependencies=(),
):
    """Set up 'samtools merge | mapDamage --rescale-only' writing `output_file`.

    Arguments:
    reference    -- Reference sequence passed to mapDamage via -r.
    input_files  -- BAM files merged and piped into mapDamage.
    output_file  -- Destination of the rescaled BAM.
    directory    -- Stored on the node as self._directory.
    options      -- Optional mapping applied to the mapDamage command via
                    apply_options; defaults to no extra options.
    dependencies -- Forwarded unchanged to CommandNode.__init__.
    """
    # A None sentinel avoids sharing one mutable dict between all calls
    if options is None:
        options = {}

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--rescale-only",
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
            "--rescale-out",
            "%(OUT_BAM)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV=stats_out_fname,
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION,
    )
    apply_options(command, options)

    self._directory = directory

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (rescale): %s -> %r>"
        % (describe_files(merge.input_files), output_file,),
        dependencies=dependencies,
    )
def new(*args, **kwargs):
    """Return an AtomicCmdBuilder for a tool run through the 'paleomix' script.

    The call consists of the current Python interpreter, the paleomix main
    script and the positional `args`; no further arguments are added. The
    script itself is tracked as AUX_PALEOMIX. Calling new("cat") therefore
    corresponds to running ["paleomix", "cat"].
    """
    main_script = paleomix.main.__file__
    call = (sys.executable, main_script) + args

    return AtomicCmdBuilder(call, AUX_PALEOMIX=main_script, **kwargs)
def _get_common_parameters(options, threads=1):
    """Return an AdapterRemoval builder with the shared default options set.

    Arguments:
    options -- Mapping of user-supplied options. A "--adapter-list" entry,
               if present, is removed from this mapping and registered on
               the command as a tracked input file instead.
    threads -- Value passed via --threads.

    Returns the (unfinalized) AtomicCmdBuilder.
    """
    cmd = AtomicCmdBuilder("AdapterRemoval", CHECK_VERSION=_VERSION_CHECK)

    # Gzip compress FASTQ files
    cmd.set_option("--gzip")

    # Trim Ns at read ends
    cmd.set_option("--trimns", fixed=False)
    # Trim low quality scores
    cmd.set_option("--trimqualities", fixed=False)

    # Fix number of threads to ensure consistency when scheduling node
    cmd.set_option("--threads", threads)

    # Ensure that any user-specified list of adapters is tracked
    adapter_list = options.pop("--adapter-list", None)
    if adapter_list is not None:
        # Options and kwargs are set on the builder itself; the builder is
        # not known to expose 'cmd' / 'command' attributes to delegate to.
        cmd.set_option("--adapter-list", "%(IN_ADAPTER_LIST)s")
        cmd.set_kwargs(IN_ADAPTER_LIST=adapter_list)

    return cmd
def test_builder__finalize__calls_atomiccmd():
    """finalize() must construct AtomicCmd from the builder's call and kwargs."""
    expected_args = (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],)
    expected_kwargs = {
        "IN_FILE": "/in/file",
        "OUT_FILE": "/out/file",
        "set_cwd": True,
    }
    invocations = []

    class _AtomicCmdMock(object):
        # Stand-in for AtomicCmd that checks and records its construction
        def __init__(self, *args, **kwargs):
            assert_equal(args, expected_args)
            assert_equal(kwargs, expected_kwargs)
            invocations.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        cmd_builder = AtomicCmdBuilder("echo", set_cwd=True)
        cmd_builder.add_option("-out", "%(OUT_FILE)s")
        cmd_builder.add_value("%(IN_FILE)s")
        cmd_builder.set_kwargs(OUT_FILE="/out/file", IN_FILE="/in/file")

        cmd_builder.finalize()

    assert invocations
def customize(self, config, reference, input_files, output_directory, title="mapDamage", dependencies=()):
    """Prepare the mapDamage plotting command and return the node parameters.

    `input_files` is coerced to a tuple and registered on the builder with
    add_multiple_kwargs; mapDamage reads the BAM from
    MultiBAMInputNode.PIPE_FILE and writes its results to `output_directory`.
    """
    input_files = safe_coerce_to_tuple(input_files)

    # Output files tracked for the node; all are placed in output_directory
    output_files = {
        "OUT_FREQ_3p": "3pGtoA_freq.txt",
        "OUT_FREQ_5p": "5pCtoT_freq.txt",
        "OUT_COMP_USER": "dnacomp.txt",
        "OUT_PLOT_FRAG": "Fragmisincorporation_plot.pdf",
        "OUT_PLOT_LEN": "Length_plot.pdf",
        "OUT_LENGTH": "lgdistribution.txt",
        "OUT_MISINCORP": "misincorporation.txt",
        "OUT_LOG": "Runtime_log.txt",
    }
    file_kwargs = dict(
        (key, os.path.join(output_directory, filename))
        for key, filename in output_files.items()
    )

    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t",
            title,
            "-i",
            "%(TEMP_IN_BAM)s",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
        ],
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
        **file_kwargs
    )
    command.add_multiple_kwargs(input_files)

    parameters = {"command": command, "config": config}
    parameters["input_files"] = input_files
    parameters["dependencies"] = dependencies
    return parameters
def customize(cls, input_alignment, output_tree, dependencies=()):
    """Build the 'parsimonator' command and return it as node parameters.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    output_tree     -- Filename for the output newick tree.
    """
    command = AtomicCmdBuilder("parsimonator", set_cwd=True)

    command.set_option("-s", "%(TEMP_OUT_ALN)s")
    command.set_option("-n", "output")

    # Random seed for the stepwise addition process; not fixed, so the
    # option may be overwritten later
    seed = int(random.random() * 2**31 - 1)
    command.set_option("-p", seed, fixed=False)

    tracked_files = {
        # Auto-delete: Symlinks
        "TEMP_OUT_ALN": os.path.basename(input_alignment),
        # Input files, are not used directly
        "IN_ALIGNMENT": input_alignment,
        # Final output file, are not created directly
        "OUT_TREE": output_tree,
    }
    command.set_kwargs(**tracked_files)

    return dict(command=command)
def test_builder__finalize__calls_atomiccmd():
    """finalize() must pass the built call and kwargs on to AtomicCmd."""
    expected_args = (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],)
    expected_kwargs = dict(IN_FILE="/in/file", OUT_FILE="/out/file", set_cwd=True)
    invocations = []

    class _AtomicCmdMock:
        # Stand-in for AtomicCmd that checks and records its construction
        def __init__(self, *args, **kwargs):
            assert_equal(args, expected_args)
            assert_equal(kwargs, expected_kwargs)
            invocations.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        cmd_builder = AtomicCmdBuilder("echo", set_cwd=True)
        cmd_builder.add_option("-out", "%(OUT_FILE)s")
        cmd_builder.add_value("%(IN_FILE)s")
        cmd_builder.set_kwargs(OUT_FILE="/out/file", IN_FILE="/in/file")

        cmd_builder.finalize()

    assert invocations
def customize(cls, input_alignment, output_tree, dependencies=()):
    """Prepare a 'parsimonator' run, executed with set_cwd=True.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    output_tree     -- Filename for the output newick tree.
    """
    builder = AtomicCmdBuilder("parsimonator", set_cwd=True)

    for key, value in (("-s", "%(TEMP_OUT_ALN)s"), ("-n", "output")):
        builder.set_option(key, value)

    # Random seed for the stepwise addition process
    builder.set_option("-p", int(random.random() * 2**31 - 1), fixed=False)

    builder.set_kwargs(
        # Auto-delete: Symlinks
        TEMP_OUT_ALN=os.path.basename(input_alignment),
        # Input files, are not used directly
        IN_ALIGNMENT=input_alignment,
        # Final output file, are not created directly
        OUT_TREE=output_tree,
    )

    return {"command": builder}
def customize(cls, pileup, infile, outfile, regions, dependencies=()):
    """Build the 'cat | vcf_filter | bgzip' builders and return them by name.

    Every contig listed under regions["HomozygousContigs"] is passed to the
    filter via --homozygous-chromosome. The returned dict maps "commands" to
    the three (unfinalized) builders.
    """
    # Reads the input VCF and pipes it onwards
    reader = factory.new("cat")
    reader.add_value("%(IN_VCF)s")
    reader.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)

    # Filters the piped VCF using the pileup
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_option("--pileup", "%(IN_PILEUP)s")
    for contig_name in regions["HomozygousContigs"]:
        filter_cmd.add_option("--homozygous-chromosome", contig_name)
    filter_cmd.set_kwargs(
        IN_PILEUP=pileup, IN_STDIN=reader, OUT_STDOUT=AtomicCmd.PIPE
    )

    # Compresses the filtered VCF into the final output file
    compress_cmd = AtomicCmdBuilder(
        ["bgzip"], IN_STDIN=filter_cmd, OUT_STDOUT=outfile
    )

    commands = {"cat": reader, "filter": filter_cmd, "bgzip": compress_cmd}
    return {"commands": commands}
def __init__(self, config, reference, input_bam, output_bam, tags, min_mapq=0, filter_unmapped=False, dependencies=()):
    """Set up 'samtools view | AddOrReplaceReadGroups | samtools calmd'.

    Reads are optionally filtered by `min_mapq` (-q) and, if
    `filter_unmapped` is set, by flag 0x4 (-F); the read-group tags ID, SM,
    LB, PU and PL are taken from `tags`, and the result of calmd is written
    to `output_bam`.
    """
    view_cmd = AtomicCmdBuilder(
        ("samtools", "view", "-bu"),
        IN_BAM=input_bam,
        OUT_STDOUT=AtomicCmd.PIPE,
    )
    if min_mapq:
        view_cmd.set_option("-q", min_mapq, sep="")
    if filter_unmapped:
        view_cmd.set_option("-F", "0x4", sep="")
    view_cmd.add_value("%(IN_BAM)s")

    picard_cmd = picard.picard_command(config, "AddOrReplaceReadGroups")
    picard_options = (
        ("INPUT", "/dev/stdin"),
        # Output is written to a named pipe, since the JVM may, in some
        # cases, emit warning messages to stdout, resulting in a malformed
        # BAM.
        ("OUTPUT", "%(TEMP_OUT_BAM)s"),
        ("COMPRESSION_LEVEL", "0"),
        # Ensure that the BAM is sorted; this is required by the pipeline,
        # and needs to be done before calling calmd (avoiding pathologic
        # runtimes).
        ("SORT_ORDER", "coordinate"),
    )
    for key, value in picard_options:
        picard_cmd.set_option(key, value, sep="=")

    # All tags are overwritten; ID is set since the default (e.g. '1')
    # causes problems with pysam due to type inference (is read as a length
    # 1 string, but written as a character).
    for tag_name in ("ID", "SM", "LB", "PU", "PL"):
        picard_cmd.set_option(tag_name, tags[tag_name], sep="=")

    picard_cmd.set_kwargs(IN_STDIN=view_cmd, TEMP_OUT_BAM="bam.pipe")

    calmd_cmd = AtomicCmdBuilder(
        ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
        IN_REF=reference,
        TEMP_IN_BAM="bam.pipe",
        OUT_STDOUT=output_bam,
    )

    finalized = []
    for builder in (view_cmd, picard_cmd, calmd_cmd):
        finalized.append(builder.finalize())

    PicardNode.__init__(
        self,
        command=ParallelCmds(finalized),
        description="<Cleanup BAM: %s -> '%s'>" % (input_bam, output_bam),
        dependencies=dependencies,
    )
def customize(cls, input_alignment, input_partition, output_alignment, output_partition, dependencies=()):
    """Build the 'raxmlHPC -f c' command and return it as node parameters.

    The alignment and partition files are referenced through fixed names in
    the temporary directory ("RAxML_alignment" / "RAxML_partitions").
    """
    builder = AtomicCmdBuilder("raxmlHPC")

    fixed_options = (
        # Read and (in the case of empty columns) reduce input
        ("-f", "c"),
        # Output files are saved with a .Pypeline postfix, and subsequently
        # renamed
        ("-n", "Pypeline"),
        # Model required, but not used
        ("-m", "GTRGAMMA"),
        # Ensures that output is saved to the temporary directory
        ("-w", "%(TEMP_DIR)s"),
        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder. In addition, it may be
        # necessary to remove the .reduced files if created
        ("-s", "%(TEMP_IN_ALIGNMENT)s"),
        ("-q", "%(TEMP_IN_PARTITION)s"),
    )
    for key, value in fixed_options:
        builder.set_option(key, value)

    builder.set_kwargs(
        IN_ALIGNMENT=input_alignment,
        IN_PARTITION=input_partition,
        TEMP_IN_ALIGNMENT="RAxML_alignment",
        TEMP_IN_PARTITION="RAxML_partitions",
        TEMP_OUT_INFO="RAxML_info.Pypeline",
        OUT_ALIGNMENT=output_alignment,
        OUT_PARTITION=output_partition,
        CHECK_VERSION=RAXML_VERSION,
    )

    return dict(command=builder)
def test_builder__finalize__returns_singleton():
    """Repeated finalize() calls must return the very same object."""
    cmd_builder = AtomicCmdBuilder("echo")

    first = cmd_builder.finalize()
    second = cmd_builder.finalize()

    assert first is second
def customize(cls, input_alignment, output_template, input_partition=None, threads=1, dependencies=()):
    """Build the RAxML rapid-bootstrapping command and return node parameters.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    input_partition -- A set of partitions in a format readable by RAxML.
    output_template -- A template string used to construct final filenames.
                       Should consist of a full path, including a single
                       '%s', which is replaced with the variable part of
                       RAxML output files (e.g. 'info', 'bestTree', ...).
                       Example destination: '/disk/project/SN013420.RAxML.%s'
                       Example output: '/disk/project/SN013420.RAxML.bestTree'
    threads         -- If greater than 1, 'raxmlHPC-PTHREADS' is run with
                       '-T <threads>'; otherwise plain 'raxmlHPC' is used.
    """
    use_pthreads = threads > 1
    if use_pthreads:
        builder = AtomicCmdBuilder("raxmlHPC-PTHREADS")
        builder.set_option("-T", threads)
    else:
        builder = AtomicCmdBuilder("raxmlHPC")
    version = RAXML_PTHREADS_VERSION if use_pthreads else RAXML_VERSION

    fixed_options = (
        # Perform rapid bootstrapping
        ("-f", "a"),
        # Output files are saved with a .PALEOMIX postfix, and subsequently
        # renamed
        ("-n", "PALEOMIX"),
        # Ensures that output is saved to the temporary directory
        ("-w", "%(TEMP_DIR)s"),
        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder. In addition, it may be
        # necessary to remove the .reduced files if created
        ("-s", "%(TEMP_OUT_ALN)s"),
    )
    for key, value in fixed_options:
        builder.set_option(key, value)

    if input_partition is not None:
        partition_name = os.path.basename(input_partition)
        builder.set_option("-q", "%(TEMP_OUT_PART)s")
        builder.set_kwargs(
            IN_PARTITION=input_partition,
            TEMP_OUT_PART=partition_name,
            TEMP_OUT_PART_R=partition_name + ".reduced",
        )

    alignment_name = os.path.basename(input_alignment)
    builder.set_kwargs(
        # Auto-delete: Symlinks and .reduced files that RAxML may generate
        TEMP_OUT_ALN=alignment_name,
        TEMP_OUT_ALN_R=alignment_name + ".reduced",
        # Input files, are not used directly
        IN_ALIGNMENT=input_alignment,
        # Final output files, are not created directly
        OUT_INFO=output_template % "info",
        OUT_BESTTREE=output_template % "bestTree",
        OUT_BOOTSTRAP=output_template % "bootstrap",
        OUT_BIPART=output_template % "bipartitions",
        OUT_BIPARTLABEL=output_template % "bipartitionsBranchLabels",
        CHECK_VERSION=version,
    )

    # The remaining options are not fixed, and may therefore be overwritten.
    # Default model of NT substitution
    builder.set_option("-m", "GTRGAMMAI", fixed=False)
    # Enable rapid bootstrapping and set its random seed. May be set to a
    # fixed value to allow replicability.
    bootstrap_seed = int(random.random() * 2**31 - 1)
    builder.set_option("-x", bootstrap_seed, fixed=False)
    # Random seed for parsimony inference. May be set to a fixed value to
    # allow replicability.
    parsimony_seed = int(random.random() * 2**31 - 1)
    builder.set_option("-p", parsimony_seed, fixed=False)
    # Terminate bootstrapping upon convergence, rather than after a fixed
    # number of repetitions
    builder.set_option("-N", "autoMRE", fixed=False)

    return dict(command=builder)
def test_builder__set_kwargs__called_twice():
    """Separate set_kwargs calls with distinct keys accumulate on the builder."""
    cmd_builder = AtomicCmdBuilder("echo")
    for new_kwargs in ({"OUT_PATH": "/dst/file"}, {"IN_PATH": "/a/b/"}):
        cmd_builder.set_kwargs(**new_kwargs)

    wanted = dict(IN_PATH="/a/b/", OUT_PATH="/dst/file")
    assert_equal(cmd_builder.kwargs, wanted)
def test_builder__add_value__two_values():
    """Values added with add_value are appended to the call in order."""
    cmd_builder = AtomicCmdBuilder("ls")
    for value in ("%(IN_FILE)s", "%(OUT_FILE)s"):
        cmd_builder.add_value(value)

    wanted_call = ["ls", "%(IN_FILE)s", "%(OUT_FILE)s"]
    assert_equal(cmd_builder.call, wanted_call)
def test_builder__pop_option__missing_key():
    """Popping an option that was never set must raise KeyError."""
    cmd_builder = AtomicCmdBuilder("find")
    cmd_builder.set_option("-size", 0)

    unknown_key = "-isize"
    assert_raises(KeyError, cmd_builder.pop_option, unknown_key)
def __init__(self, input_alignment, input_partitions, output_tree, dependencies=()):
    """Set up a 'raxmlHPC -y' run producing a parsimony starting tree.

    The alignment and partition files are referenced through fixed names in
    the temporary directory ("RAxML_alignment" / "RAxML_partitions"), and the
    resulting tree is tracked as `output_tree`.
    """
    builder = AtomicCmdBuilder("raxmlHPC")

    # Compute a randomized parsimony starting tree
    builder.set_option("-y")

    fixed_options = (
        # Output files are saved with a .Pypeline postfix, and subsequently
        # renamed
        ("-n", "Pypeline"),
        # Model required, but not used
        ("-m", "GTRGAMMA"),
        # Ensures that output is saved to the temporary directory
        ("-w", "%(TEMP_DIR)s"),
    )
    for key, value in fixed_options:
        builder.set_option(key, value)

    # Random seed; not fixed, so it may be set explicitly to allow
    # replicability
    seed = int(random.random() * 2 ** 31 - 1)
    builder.set_option("-p", seed, fixed=False)

    # Symlink to sequence and partitions, to prevent the creation of
    # *.reduced files outside temp folder
    builder.set_option("-s", "%(TEMP_OUT_ALIGNMENT)s")
    builder.set_option("-q", "%(TEMP_OUT_PARTITION)s")

    builder.set_kwargs(
        IN_ALIGNMENT=input_alignment,
        IN_PARTITION=input_partitions,
        # TEMP_OUT_ is used to automatically remove these files
        TEMP_OUT_ALIGNMENT="RAxML_alignment",
        TEMP_OUT_PARTITION="RAxML_partitions",
        TEMP_OUT_INFO="RAxML_info.Pypeline",
        OUT_TREE=output_tree,
        CHECK_VERSION=RAXML_VERSION,
    )

    self._input_alignment = input_alignment
    self._input_partitions = input_partitions
    self._output_tree = output_tree

    description = "<RAxMLParsimonyTree: '%s' -> '%s'>" % (
        input_alignment,
        output_tree,
    )
    CommandNode.__init__(
        self,
        command=builder.finalize(),
        description=description,
        dependencies=dependencies,
    )
def test_builder__set_option__overwrite_fixed():
    """Overwriting an option set with the default (fixed) mode must raise."""
    cmd_builder = AtomicCmdBuilder("find")
    cmd_builder.set_option("-name", "*.txt")

    assert_raises(
        AtomicCmdBuilderError, cmd_builder.set_option, "-name", "*.bat"
    )
def test_builder__set_option():
    """set_option appends the key and its value to the call."""
    cmd_builder = AtomicCmdBuilder("find")
    cmd_builder.set_option("-name", "*.txt")

    wanted_call = ["find", "-name", "*.txt"]
    assert_equal(cmd_builder.call, wanted_call)
def customize(cls, input_alignment, input_partition, output_file, dependencies=()):
    """Build the 'parse-examl' command and return it as node parameters.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    input_partition -- A set of partitions in a format readable by RAxML.
    output_file     -- Filename for the output binary sequence.
    """
    builder = AtomicCmdBuilder("parse-examl", set_cwd=True)

    fixed_options = (
        ("-s", "%(TEMP_OUT_ALN)s"),
        ("-q", "%(TEMP_OUT_PART)s"),
        # Output file will be named output.binary, and placed in the CWD
        ("-n", "output"),
    )
    for key, value in fixed_options:
        builder.set_option(key, value)

    # Substitution model; not fixed, so it may be overwritten
    builder.set_option("-m", "DNA", fixed=False)

    builder.set_kwargs(
        # Auto-delete: Symlinks
        TEMP_OUT_PART=os.path.basename(input_partition),
        TEMP_OUT_ALN=os.path.basename(input_alignment),
        # Input files, are not used directly
        IN_ALIGNMENT=input_alignment,
        IN_PARTITION=input_partition,
        # Final output file, are not created directly
        OUT_BINARY=output_file,
        CHECK_EXAML=PARSER_VERSION,
    )

    return dict(command=builder)
def customize(cls, input_alignment, input_partition, output_file, dependencies=()):
    """Prepare a 'parse-examl' run, executed with set_cwd=True.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    input_partition -- A set of partitions in a format readable by RAxML.
    output_file     -- Filename for the output binary sequence.
    """
    parser_cmd = AtomicCmdBuilder("parse-examl", set_cwd=True)

    parser_cmd.set_option("-s", "%(TEMP_OUT_ALN)s")
    parser_cmd.set_option("-q", "%(TEMP_OUT_PART)s")
    # Output file will be named output.binary, and placed in the CWD
    parser_cmd.set_option("-n", "output")
    # Substitution model
    parser_cmd.set_option("-m", "DNA", fixed=False)

    tracked_files = {
        # Auto-delete: Symlinks
        "TEMP_OUT_PART": os.path.basename(input_partition),
        "TEMP_OUT_ALN": os.path.basename(input_alignment),
        # Input files, are not used directly
        "IN_ALIGNMENT": input_alignment,
        "IN_PARTITION": input_partition,
        # Final output file, are not created directly
        "OUT_BINARY": output_file,
        "CHECK_EXAML": PARSER_VERSION,
    }
    parser_cmd.set_kwargs(**tracked_files)

    return {"command": parser_cmd}
def _do_test_builder__add_or_set_option__after_finalize(setter):
    """Check that `setter` raises when used on an already finalized builder.

    `setter` is invoked as setter(builder, "-size", "1").
    """
    cmd_builder = AtomicCmdBuilder("find")
    cmd_builder.finalize()

    option = ("-size", "1")
    assert_raises(AtomicCmdBuilderError, setter, cmd_builder, *option)
def test_builder__pop_option__last_option():
    """pop_option removes the most recently added option with the given key."""
    cmd_builder = AtomicCmdBuilder("find")
    for size in ("0", "1"):
        cmd_builder.add_option("-size", size, fixed=False)

    cmd_builder.pop_option("-size")

    wanted_call = ["find", "-size", "0"]
    assert_equal(cmd_builder.call, wanted_call)
def test_builder__set_option__overwrite():
    """An option set with fixed=False may be replaced by a later set_option."""
    cmd_builder = AtomicCmdBuilder("find")
    cmd_builder.set_option("-name", "*.txt", fixed=False)
    cmd_builder.set_option("-name", "*.bat")

    wanted_call = ["find", "-name", "*.bat"]
    assert_equal(cmd_builder.call, wanted_call)
def customize(cls, input_alignment, input_partitions, output_tree, dependencies=()):
    """Build the 'raxmlHPC -y' command and return it as node parameters.

    The alignment and partition files are referenced through fixed names in
    the temporary directory ("RAxML_alignment" / "RAxML_partitions"), and the
    resulting tree is tracked as `output_tree`.
    """
    builder = AtomicCmdBuilder("raxmlHPC")

    # Compute a randomized parsimony starting tree
    builder.set_option("-y")
    # Output files are saved with a .Pypeline postfix, and subsequently renamed
    builder.set_option("-n", "Pypeline")
    # Model required, but not used
    builder.set_option("-m", "GTRGAMMA")
    # Ensures that output is saved to the temporary directory
    builder.set_option("-w", "%(TEMP_DIR)s")

    # Random seed. May be set to a fixed value to allow replicability.
    seed = int(random.random() * 2**31 - 1)
    builder.set_option("-p", seed, fixed=False)

    # Symlink to sequence and partitions, to prevent the creation of
    # *.reduced files outside temp folder
    builder.set_option("-s", "%(TEMP_OUT_ALIGNMENT)s")
    builder.set_option("-q", "%(TEMP_OUT_PARTITION)s")

    tracked_files = {
        "IN_ALIGNMENT": input_alignment,
        "IN_PARTITION": input_partitions,
        # TEMP_OUT_ is used to automatically remove these files
        "TEMP_OUT_ALIGNMENT": "RAxML_alignment",
        "TEMP_OUT_PARTITION": "RAxML_partitions",
        "TEMP_OUT_INFO": "RAxML_info.Pypeline",
        "OUT_TREE": output_tree,
        "CHECK_VERSION": RAXML_VERSION,
    }
    builder.set_kwargs(**tracked_files)

    return dict(command=builder)
def __init__(self, input_file, k_groups, output_root, samples=None, dependencies=()):
    """Set up an 'admixture' command for the given .bed file.

    Arguments:
    input_file   -- Path to the input .bed file; the matching .bim and .fam
                    paths are derived from it via fileutils.swap_ext.
    k_groups     -- Number of groups; must be 2 or 3.
    output_root  -- Folder in which the '<prefix>.<k_groups>.P', '.Q' and
                    '.log' outputs are placed, where <prefix> is the
                    basename of input_file without its extension.
    samples      -- Optional mapping whose values are rows indexable by
                    'Group(<k_groups>)'. If any row has a value other than
                    '-' for that key, '--supervised' is passed to admixture.
    dependencies -- Passed through to CommandNode.

    Raises AssertionError if k_groups is not 2 or 3.
    """
    # Validate up front, before k_groups is used to build keys and filenames
    assert k_groups in (2, 3), k_groups

    self._samples = samples
    self._input_file = input_file
    self._k_groups = k_groups

    group_key = "Group(%i)" % (self._k_groups,)
    # .values() is used instead of .itervalues(), since the latter does not
    # exist on Python 3 dicts; .values() is available on both 2 and 3.
    self._supervised = samples and any(
        (row[group_key] != "-") for row in samples.values()
    )

    prefix = os.path.splitext(os.path.basename(input_file))[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    cmd = AtomicCmdBuilder(
        "admixture",
        IN_FILE_BED=input_file,
        IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
        IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
        # Temporary files named after the input, without folder components
        TEMP_OUT_FILE_BED=prefix + ".bed",
        TEMP_OUT_FILE_BIM=prefix + ".bim",
        TEMP_OUT_FILE_FAM=prefix + ".fam",
        TEMP_OUT_FILE_POP=prefix + ".pop",
        OUT_P=output_prefix + ".P",
        OUT_Q=output_prefix + ".Q",
        OUT_STDOUT=output_prefix + ".log",
        CHECK_VERSION=ADMIXTURE_VERSION,
        set_cwd=True,
    )

    # Random seed for this run
    cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))
    if self._supervised:
        cmd.set_option("--supervised")

    # Positional arguments: input file, followed by the number of groups
    cmd.add_value("%(TEMP_OUT_FILE_BED)s")
    cmd.add_value(int(k_groups))

    CommandNode.__init__(
        self,
        # The original description had an unbalanced, doubled closing quote
        description="<Admixture -> '%s.*'>" % (output_prefix,),
        command=cmd.finalize(),
        dependencies=dependencies,
    )
def __init__(
    self,
    input_alignment,
    output_template,
    input_partition=None,
    model="GTRGAMMAI",
    replicates="autoMRE",
    threads=1,
    dependencies=(),
):
    """Set up a RAxML rapid-bootstrapping command.

    Arguments:
    input_alignment -- An alignment file in a format readable by RAxML.
    output_template -- A template string used to construct final filenames.
                       Should consist of a full path, including a single
                       '%s', which is replaced with the variable part of
                       RAxML output files (e.g. 'info', 'bestTree', ...).
                       Example destination: '/disk/project/SN013420.RAxML.%s'
                       Example output: '/disk/project/SN013420.RAxML.bestTree'
    input_partition -- Optional set of partitions in a format readable by
                       RAxML; '-q' is only passed if this is not None.
    model           -- Value passed to '-m'.
    replicates      -- Value passed to '-N'.
    threads         -- If greater than 1, 'raxmlHPC-PTHREADS' is run with
                       '-T threads'; otherwise 'raxmlHPC' is run.
    dependencies    -- Passed through to CommandNode.
    """
    multithreaded = threads > 1
    if multithreaded:
        executable, version = "raxmlHPC-PTHREADS", RAXML_PTHREADS_VERSION
    else:
        executable, version = "raxmlHPC", RAXML_VERSION

    builder = AtomicCmdBuilder(executable)
    if multithreaded:
        builder.set_option("-T", threads)

    leading_options = (
        # Perform rapid bootstrapping
        ("-f", "a"),
        # Output files are saved with a .PALEOMIX postfix, and subsequently renamed
        ("-n", "PALEOMIX"),
        # Ensures that output is saved to the temporary directory
        ("-w", "%(TEMP_DIR)s"),
        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside the temp folder. In addition, it may be
        # necessary to remove the .reduced files if created
        ("-s", "%(TEMP_OUT_ALN)s"),
    )
    for flag, value in leading_options:
        builder.set_option(flag, value)

    if input_partition is not None:
        partition_name = os.path.basename(input_partition)
        builder.set_option("-q", "%(TEMP_OUT_PART)s")
        builder.set_kwargs(
            IN_PARTITION=input_partition,
            TEMP_OUT_PART=partition_name,
            TEMP_OUT_PART_R=partition_name + ".reduced",
        )

    alignment_name = os.path.basename(input_alignment)
    # Final output files; these are not created directly by the command
    output_suffixes = {
        "OUT_INFO": "info",
        "OUT_BESTTREE": "bestTree",
        "OUT_BOOTSTRAP": "bootstrap",
        "OUT_BIPART": "bipartitions",
        "OUT_BIPARTLABEL": "bipartitionsBranchLabels",
    }
    output_kwargs = {}
    for key, suffix in output_suffixes.items():
        output_kwargs[key] = output_template % suffix

    builder.set_kwargs(
        # Auto-delete: Symlinks and .reduced files that RAxML may generate
        TEMP_OUT_ALN=alignment_name,
        TEMP_OUT_ALN_R=alignment_name + ".reduced",
        # Input file; the command itself is pointed at TEMP_OUT_ALN
        IN_ALIGNMENT=input_alignment,
        CHECK_VERSION=version,
        **output_kwargs
    )

    # Model of substitution, taken from the 'model' argument
    builder.set_option("-m", model, fixed=False)

    # '-x' enables rapid bootstrapping and sets its random seed; '-p' sets
    # the random seed for parsimony inference. Both are non-fixed, and may be
    # replaced with fixed values to allow replicability.
    for seed_flag in ("-x", "-p"):
        builder.set_option(seed_flag, int(random.random() * 2 ** 31 - 1), fixed=False)

    # With the default 'autoMRE', bootstrapping terminates upon convergence,
    # rather than after N repetitions
    builder.set_option("-N", replicates, fixed=False)

    # NOTE: input_partition may be None here; the list is stored as-is
    self._symlinks = [input_alignment, input_partition]
    self._template = os.path.basename(output_template)

    description = "<RAxMLRapidBS: '%s' -> '%s'>" % (
        input_alignment,
        output_template % ("*",),
    )
    CommandNode.__init__(
        self,
        command=builder.finalize(),
        description=description,
        threads=threads,
        dependencies=dependencies,
    )