def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """Set up GATK IndelRealigner together with a 'samtools calmd' command.

    Arguments:
      config -- object providing 'jar_root' and 'jre_options'.
      reference -- file passed to GATK via '-R' and to calmd; the same
                   path with a '.dict' extension is registered as input.
      intervals -- file passed to GATK via '-targetIntervals'.
      infiles -- one or more input files (coerced to a tuple).
      outfile -- path passed to GATK via '-o'.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._basename = os.path.basename(outfile)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    realigner = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)

    # Options are applied in the same order as they are listed here
    valued_options = (
        ("-T", "IndelRealigner"),
        ("-R", "%(IN_REFERENCE)s"),
        ("-targetIntervals", "%(IN_INTERVALS)s"),
        ("-o", "%(OUT_BAMFILE)s"),
        ("--bam_compression", 0),
    )
    for flag, value in valued_options:
        realigner.set_option(flag, value)
    realigner.set_option("--disable_bam_indexing")
    _set_input_files(realigner, infiles)

    realigner.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        IN_INTERVALS=intervals,
        OUT_BAMFILE=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    # calmd reads a temporary file named like the output file and writes
    # its STDOUT to '<basename>.calmd' in the temporary folder
    calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
    calmd = AtomicCmd(
        calmd_call,
        TEMP_IN_BAM=self._basename,
        IN_REF=reference,
        TEMP_OUT_STDOUT=self._basename + ".calmd",
        CHECK_VERSION=SAMTOOLS_VERSION,
    )

    CommandNode.__init__(
        self,
        description="<GATK Indel Realigner (aligning): %s -> %r>"
        % (describe_files(infiles), outfile),
        command=ParallelCmds([realigner.finalize(), calmd]),
        dependencies=dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Queue the three TreeMix plotting commands as one sequential node.

    Arguments:
      samples -- file passed to the plotting commands as IN_SAMPLES.
      prefix -- TreeMix prefix; also passed on as an absolute path.
      output_prefix -- prefix of the '_tree', '_residuals' (PDF/PNG) and
                       '_variance.txt' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    abs_prefix = os.path.abspath(prefix)
    temp_name = os.path.basename(output_prefix)

    steps = []
    # First the tree with migration edges, then the heatmap of residuals
    for mode, suffix in (("plot_tree", "_tree"),
                         ("plot_residuals", "_residuals")):
        steps.append(
            self._plot_command(
                prefix,
                mode,
                abs_prefix,
                "%(IN_SAMPLES)s",
                "%(TEMP_OUT_PREFIX)s",
                IN_SAMPLES=samples,
                TEMP_OUT_PREFIX=temp_name + suffix,
                OUT_PDF=output_prefix + suffix + ".pdf",
                OUT_PNG=output_prefix + suffix + ".png",
            )
        )

    # Text file containing % of variance explained by model
    steps.append(
        self._plot_command(
            prefix,
            "variance",
            abs_prefix,
            "%(OUT_TXT)s",
            OUT_TXT=output_prefix + "_variance.txt",
        )
    )

    CommandNode.__init__(
        self,
        description="<PlotTreemix -> '%s.*'>" % (output_prefix,),
        command=SequentialCmds(tuple(steps)),
        dependencies=dependencies,
    )
def __init__(self, config, reference, infiles, outfile, threads=1, dependencies=()):
    """Set up the GATK RealignerTargetCreator command.

    Arguments:
      config -- object providing 'jar_root' and 'jre_options'.
      reference -- file passed via '-R'; the same path with a '.dict'
                   extension is registered as input.
      infiles -- one or more input files (coerced to a tuple).
      outfile -- path passed to GATK via '-o'.
      threads -- requested thread count, adjusted by _get_max_threads.
      dependencies -- forwarded unchanged to CommandNode.
    """
    threads = _get_max_threads(reference, threads)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    trainer = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    for flag, value in (
        ("-T", "RealignerTargetCreator"),
        ("-R", "%(IN_REFERENCE)s"),
        ("-o", "%(OUT_INTERVALS)s"),
        ("-nt", threads),
    ):
        trainer.set_option(flag, value)
    _set_input_files(trainer, infiles)

    trainer.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        OUT_INTERVALS=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    label = "<GATK Indel Realigner (training): %s -> %r>" % (
        describe_files(infiles),
        outfile,
    )
    CommandNode.__init__(
        self,
        threads=threads,
        description=label,
        command=trainer.finalize(),
        dependencies=dependencies,
    )
def __init__(self, output_prefix, tfam, tped, indep_filter=None, indep_parameters=None, plink_parameters=None, dependencies=()):
    """Set up a 'plink --make-bed' run converting TPED/TFAM to BED files.

    Arguments:
      output_prefix -- prefix of the .bed/.bim/.fam/.log output files.
      tfam, tped -- input files passed via '--tfam' and '--tped'.
      indep_filter, indep_parameters -- accepted but not used here.
      plink_parameters -- extra options, parsed by self._parse_parameters.
      dependencies -- forwarded unchanged to CommandNode.
    """
    basename = os.path.basename(output_prefix)

    call = ["plink", "--make-bed", "--noweb"]
    call += ["--tped", "%(IN_TPED)s"]
    call += ["--tfam", "%(IN_TFAM)s"]
    call += ["--out", "%(TEMP_OUT_PREFIX)s"]
    call.extend(self._parse_parameters(plink_parameters))

    plink = AtomicCmd(
        call,
        IN_TPED=tped,
        IN_TFAM=tfam,
        TEMP_OUT_PREFIX=basename,
        OUT_BED=output_prefix + ".bed",
        OUT_BIM=output_prefix + ".bim",
        OUT_FAM=output_prefix + ".fam",
        OUT_LOG=output_prefix + ".log",
        TEMP_OUT_NOSEX=basename + ".nosex",
        TEMP_OUT_NOF=basename + ".nof",
        CHECK_VERSION=PLINK_VERSION,
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<BuildBEDFiles -> '%s.*'>" % (output_prefix,),
        command=plink,
        dependencies=dependencies,
    )
def __init__(self, infile, outfile, genome, from_start=0, from_end=0, strand_relative=False, dependencies=()):
    """Set up a 'bedtools slop' command writing to `outfile` via STDOUT.

    Arguments:
      infile -- file passed via '-i'.
      outfile -- file receiving the command's STDOUT.
      genome -- file passed via '-g'.
      from_start, from_end -- values for '-l' and '-r'; both must be of
          the same type, and float values additionally enable '-pct'.
      strand_relative -- if true, '-s' is added to the call.
      dependencies -- forwarded unchanged to CommandNode.

    Raises ValueError if from_start and from_end differ in type.
    """
    if type(from_start) != type(from_end):
        raise ValueError("Parameters 'from_start' and 'from_end' should "
                         "be of same type!")

    slop_call = ["bedtools", "slop"]
    slop_call += ["-i", "%(IN_FILE)s"]
    slop_call += ["-g", "%(IN_GENOME)s"]
    slop_call += ["-l", str(from_start)]
    slop_call += ["-r", str(from_end)]

    # Optional flags, appended in this fixed order
    optional_flags = (
        ("-s", strand_relative),
        ("-pct", type(from_start) is float),
    )
    for flag, enabled in optional_flags:
        if enabled:
            slop_call.append(flag)

    slop = AtomicCmd(
        slop_call,
        IN_FILE=infile,
        IN_GENOME=genome,
        OUT_STDOUT=outfile,
        CHECK_VERSION=BEDTOOLS_VERSION,
    )

    CommandNode.__init__(
        self,
        description="<SlopBed: '%s' -> '%s'>" % (infile, outfile),
        command=slop,
        dependencies=dependencies,
    )
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Set up the 'zonkey_tped' command writing TPED/TFAM files.

    Arguments:
      output_root -- folder in which the 'common.tfam', 'common.summary',
                     'incl_ts.tped' and 'excl_ts.tped' files are placed.
      table -- input table passed as a positional value.
      bamfile -- input BAM passed as a positional value.
      downsample -- value for '--downsample'; when false, the '.bai'
                    index of `bamfile` is registered as an input.
      dependencies -- forwarded unchanged to CommandNode.
    """
    builder = factory.new("zonkey_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    for positional in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        builder.add_value(positional)

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    builder.set_kwargs(
        OUT_TFAM=os.path.join(output_root, "common.tfam"),
        OUT_SUMMARY=os.path.join(output_root, "common.summary"),
        OUT_TPED_INCL_TS=os.path.join(output_root, "incl_ts.tped"),
        OUT_TPED_EXCL_TS=os.path.join(output_root, "excl_ts.tped"),
        IN_TABLE=table,
        IN_BAM=bamfile,
    )

    label = "<BuildTPEDFiles -> %r>" % (os.path.join(output_root, '*'),)
    CommandNode.__init__(
        self,
        description=label,
        command=builder.finalize(),
        dependencies=dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Set up the 'pca.r' Rscript call that plots a PCA as PDF and PNG.

    Arguments:
      samples -- file passed to the script as IN_SAMPLES.
      prefix -- prefix of the '.eval' and '.evec' inputs; passed to the
                script as an absolute path.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    pca_script = rtools.rscript("zonkey", "pca.r")
    arguments = [
        "Rscript",
        pca_script,
        os.path.abspath(prefix),
        "%(IN_SAMPLES)s",
        "%(TEMP_OUT_PREFIX)s",
    ]

    plot = AtomicCmd(
        arguments,
        AUX_SCRIPT=pca_script,
        IN_FILE_EVAL=prefix + ".eval",
        IN_FILE_EVEC=prefix + ".evec",
        IN_SAMPLES=samples,
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_LABELS=rtools.requirement("ggrepel"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<PlotPCA -> '%s.*'>" % (output_prefix,),
        command=plot,
        dependencies=dependencies,
    )
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Index a BAM file through a symlink in the temporary folder.

    Three commands are run in sequence: 'ln -s' of the BAM, 'samtools
    index' on the link, and 'mv' of the resulting index to its final
    location (`infile` with its extension swapped for `index_format`).

    Arguments:
      infile -- the BAM file to index.
      index_format -- either '.bai' or '.csi' (adds '-c' to the call).
      dependencies -- forwarded unchanged to CommandNode.

    Raises ValueError for any other index_format.
    """
    link_name = os.path.basename(infile)

    if index_format == '.bai':
        required_version = SAMTOOLS_VERSION
        extra_args = []
    elif index_format == '.csi':
        required_version = SAMTOOLS_VERSION_1x
        extra_args = ["-c"]
    else:
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))
    index_call = ["samtools", "index"] + extra_args + ["%(TEMP_IN_BAM)s"]

    steps = (
        AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                  IN_BAM=infile,
                  TEMP_OUT_BAM=link_name,
                  set_cwd=True),
        AtomicCmd(index_call,
                  TEMP_IN_BAM=link_name,
                  CHECK_SAM=required_version),
        AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                  TEMP_IN_BAM=link_name + index_format,
                  OUT_BAM=swap_ext(infile, index_format)),
    )

    CommandNode.__init__(
        self,
        description="<BAMIndex (%s): '%s'>"
        % (index_format[1:].upper(), infile),
        command=SequentialCmds(steps),
        dependencies=dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Build the sequential TreeMix plotting commands for this node.

    Arguments:
      samples -- file passed to the plotting commands as IN_SAMPLES.
      prefix -- TreeMix prefix; also passed on as an absolute path.
      output_prefix -- prefix shared by all output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    abs_prefix = os.path.abspath(prefix)
    basename = os.path.basename(output_prefix)
    plot_args = ("%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s")

    # TreeMix plots with migration edges
    tree_files = {
        "IN_SAMPLES": samples,
        "TEMP_OUT_PREFIX": basename + "_tree",
        "OUT_PDF": output_prefix + "_tree.pdf",
        "OUT_PNG": output_prefix + "_tree.png",
    }
    plot_tree = self._plot_command(prefix, "plot_tree", abs_prefix,
                                   *plot_args, **tree_files)

    # Heatmap showing TreeMix residuals
    residual_files = {
        "IN_SAMPLES": samples,
        "TEMP_OUT_PREFIX": basename + "_residuals",
        "OUT_PDF": output_prefix + "_residuals.pdf",
        "OUT_PNG": output_prefix + "_residuals.png",
    }
    plot_residuals = self._plot_command(prefix, "plot_residuals", abs_prefix,
                                        *plot_args, **residual_files)

    # Text file containing % of variance explained by model
    variance = self._plot_command(prefix, "variance", abs_prefix,
                                  "%(OUT_TXT)s",
                                  OUT_TXT=output_prefix + "_variance.txt")

    CommandNode.__init__(
        self,
        description="<PlotTreemix -> '%s.*'>" % (output_prefix,),
        command=SequentialCmds((plot_tree, plot_residuals, variance)),
        dependencies=dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Create the Rscript command ('pca.r') that renders the PCA plots.

    Arguments:
      samples -- file passed to the script as IN_SAMPLES.
      prefix -- prefix of the '.eval' / '.evec' input files.
      output_prefix -- prefix of the '.pdf' / '.png' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    abs_prefix = os.path.abspath(prefix)
    script = rtools.rscript("zonkey", "pca.r")

    # Files and requirements tracked for the command
    files = dict(
        AUX_SCRIPT=script,
        IN_FILE_EVAL=prefix + ".eval",
        IN_FILE_EVEC=prefix + ".evec",
        IN_SAMPLES=samples,
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
    )
    requirements = dict(
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_LABELS=rtools.requirement("directlabels"),
    )
    files.update(requirements)

    command = AtomicCmd(
        ["Rscript", script, abs_prefix,
         "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s"],
        set_cwd=True,
        **files
    )

    CommandNode.__init__(
        self,
        description="<PlotPCA -> '%s.*'>" % (output_prefix,),
        command=command,
        dependencies=dependencies,
    )
def __init__(self, input_prefix, output_prefix, tfam, parameters=None, dependencies=()):
    """Set up 'plink --freq --missing' followed by gzip of the result.

    Arguments:
      input_prefix -- prefix of the .bed/.bim/.fam input files; also
                      passed to plink via '--bfile'.
      output_prefix -- prefix of the '.frq.strat.*' output files.
      tfam -- stored on the node as self._tfam; not used in this method.
      parameters -- optional string of extra plink options, split on
                    whitespace and appended to the call.
      dependencies -- forwarded unchanged to CommandNode.
    """
    basename = os.path.basename(output_prefix)

    plink_cmd = ["plink", "--freq", "--missing", "--noweb",
                 "--bfile", input_prefix,
                 "--within", "%(TEMP_OUT_CLUST)s",
                 "--out", "%(TEMP_OUT_PREFIX)s"]
    if parameters:
        plink_cmd.extend(parameters.split())

    plink = AtomicCmd(plink_cmd,
                      IN_BED=input_prefix + ".bed",
                      IN_BIM=input_prefix + ".bim",
                      IN_FAM=input_prefix + ".fam",
                      TEMP_OUT_CLUST="samples.clust",
                      OUT_NOSEX=output_prefix + ".frq.strat.nosex",
                      OUT_LOG=output_prefix + ".frq.strat.log",
                      TEMP_OUT_PREFIX=basename,
                      CHECK_VERSION=PLINK_VERSION)

    # The '.frq.strat' table is compressed in the temporary folder
    gzip = AtomicCmd(["gzip", "%(TEMP_IN_FREQ)s"],
                     TEMP_IN_FREQ=basename + ".frq.strat",
                     OUT_FREQ=output_prefix + ".frq.strat.gz")

    # FIXME: the original note at this point was left unfinished ("Can be")
    self._tfam = tfam
    self._basename = basename

    # The closing '>' was missing from this description, unlike every
    # other node description in this file
    CommandNode.__init__(self,
                         description="<BuildFreqFiles -> '%s.*'>"
                         % (output_prefix,),
                         command=SequentialCmds((plink, gzip)),
                         dependencies=dependencies)
def __init__(self, samples, treefile, bootstraps, output_prefix, dependencies=()):
    """Set up the 'tinytree.r' Rscript call that draws a phylogeny.

    Arguments:
      samples -- file passed to the script as IN_SAMPLES.
      treefile -- registered as IN_FILE and stored as self._treefile.
      bootstraps -- registered as IN_BOOTSTRAPS and stored as
                    self._bootstraps.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._treefile = treefile
    self._bootstraps = bootstraps

    tree_script = rtools.rscript("zonkey", "tinytree.r")
    call = (
        "Rscript",
        tree_script,
        "%(TEMP_OUT_FILE)s",
        "%(IN_SAMPLES)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    draw = AtomicCmd(
        call,
        AUX_RSCRIPT=tree_script,
        IN_SAMPLES=samples,
        IN_FILE=treefile,
        IN_BOOTSTRAPS=bootstraps,
        TEMP_OUT_FILE="rerooted.newick",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_TREE_PDF=output_prefix + ".pdf",
        OUT_TREE_PNG=output_prefix + ".png",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_RSCRIPT_APE=rtools.requirement("ape"),
        CHECK_RSCRIPT_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_RSCRIPT_GRID=rtools.requirement("grid"),
    )

    CommandNode.__init__(
        self,
        description="<DrawPhylogeny -> '%s.*'>" % (output_prefix,),
        command=draw,
        dependencies=dependencies,
    )
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Set up 'vcf_filter' feeding into 'bgzip', run in parallel.

    Arguments:
      infile -- VCF file given to vcf_filter as a positional value.
      outfile -- file receiving the STDOUT of bgzip.
      regions -- mapping; each entry of regions["HomozygousContigs"] is
                 added as a '--homozygous-chromosome' option.
      options -- user options applied to the vcf_filter command.
      dependencies -- forwarded unchanged to CommandNode.
    """
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_value("%(IN_VCF)s")
    for name in regions["HomozygousContigs"]:
        filter_cmd.add_option("--homozygous-chromosome", name)
    filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)

    apply_options(filter_cmd, options)

    # bgzip takes the filter command itself as its STDIN
    compress_cmd = AtomicCmdBuilder(
        ["bgzip"],
        IN_STDIN=filter_cmd,
        OUT_STDOUT=outfile,
    )

    commands = ParallelCmds([filter_cmd.finalize(), compress_cmd.finalize()])
    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
        command=commands,
        dependencies=dependencies,
    )
def __init__(self, contigs, mapping, input_file, output_prefix, dependencies=()):
    """Set up the 'coverage.r' Rscript call that plots coverage.

    Arguments:
      contigs -- stored as self._contigs; not otherwise used here.
      mapping -- dict; stored inverted (values -> keys) as self._mapping.
      input_file -- registered as IN_FILE and stored on the node.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._contigs = contigs
    # Inverted lookup: original values become the keys
    self._mapping = dict((value, key) for key, value in mapping.items())
    self._input_file = input_file

    coverage_script = rtools.rscript("zonkey", "coverage.r")
    call = (
        "Rscript",
        coverage_script,
        "%(TEMP_OUT_TABLE)s",
        "%(TEMP_OUT_PREFIX)s",
    )
    plot = AtomicCmd(
        call,
        AUX_RSCRIPT=coverage_script,
        IN_FILE=input_file,
        TEMP_OUT_TABLE="contigs.table",
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        set_cwd=True,
    )

    label = "<CoveragePlot -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=label,
                         command=plot,
                         dependencies=dependencies)
def __init__(
    self,
    input_file_1,
    output_file,
    reference,
    prefix,
    input_file_2=None,
    threads=1,
    algorithm="mem",
    mapping_options={},
    cleanup_options={},
    dependencies=(),
):
    """Set up a BWA 'mem' or 'bwasw' mapping followed by a cleanup command.

    Arguments:
      input_file_1 -- first input file (always required).
      output_file -- output handed to the cleanup command.
      reference -- reference handed to the cleanup command and used to
                   cap the number of threads.
      prefix -- BWA prefix passed on the command line.
      input_file_2 -- optional second input file.
      threads -- requested thread count, adjusted by _get_max_threads.
      algorithm -- either "mem" or "bwasw".
      mapping_options -- user options applied to the BWA command.
      cleanup_options -- user options applied to the cleanup command.
      dependencies -- forwarded unchanged to CommandNode.

    Raises NotImplementedError for any other algorithm.
    """
    supported = ("mem", "bwasw")
    if algorithm not in supported:
        raise NotImplementedError("BWA algorithm %r not implemented"
                                  % (algorithm,))

    threads = _get_max_threads(reference, threads)

    mapper = _new_bwa_command(
        ("bwa", algorithm, prefix, "%(IN_FILE_1)s"),
        prefix,
        IN_FILE_1=input_file_1,
        OUT_STDOUT=AtomicCmd.PIPE,
    )
    if input_file_2:
        mapper.add_value("%(IN_FILE_2)s")
        mapper.set_kwargs(IN_FILE_2=input_file_2)

    mapper.set_option("-t", threads)
    # Mark alternative hits as secondary; required by e.g. Picard
    mapper.set_option("-M")

    cleaner = _new_cleanup_command(
        mapper,
        output_file,
        reference,
        paired_end=input_file_1 and input_file_2,
    )

    apply_options(mapper, mapping_options)
    apply_options(cleaner, cleanup_options)

    mode_suffix = "_PE" if input_file_2 else "_SE"
    label = _get_node_description(
        name="BWA",
        algorithm="%s%s" % (algorithm.upper(), mode_suffix),
        input_files_1=input_file_1,
        input_files_2=input_file_2,
        prefix=prefix,
    )

    CommandNode.__init__(
        self,
        command=ParallelCmds([mapper.finalize(), cleaner.finalize()]),
        description=label,
        threads=threads,
        dependencies=dependencies,
    )
def __init__(self, samples, treefile, bootstraps, output_prefix, dependencies=()):
    """Create the Rscript command ('tinytree.r') that draws a phylogeny.

    Arguments:
      samples -- file passed to the script as IN_SAMPLES.
      treefile -- registered as IN_FILE; kept as self._treefile.
      bootstraps -- registered as IN_BOOTSTRAPS; kept as self._bootstraps.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      dependencies -- forwarded unchanged to CommandNode.
    """
    rscript = rtools.rscript("zonkey", "tinytree.r")

    # R and the R packages checked before the command is run
    requirements = {
        "CHECK_RSCRIPT": RSCRIPT_VERSION,
        "CHECK_RSCRIPT_APE": rtools.requirement("ape"),
        "CHECK_RSCRIPT_GGPLOT2": rtools.requirement("ggplot2"),
        "CHECK_RSCRIPT_GRID": rtools.requirement("grid"),
    }

    cmd = AtomicCmd(
        ("Rscript", rscript,
         "%(TEMP_OUT_FILE)s", "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s"),
        AUX_RSCRIPT=rscript,
        IN_SAMPLES=samples,
        IN_FILE=treefile,
        IN_BOOTSTRAPS=bootstraps,
        TEMP_OUT_FILE="rerooted.newick",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_TREE_PDF=output_prefix + ".pdf",
        OUT_TREE_PNG=output_prefix + ".png",
        **requirements
    )

    self._treefile = treefile
    self._bootstraps = bootstraps

    label = "<DrawPhylogeny -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=label,
                         command=cmd,
                         dependencies=dependencies)
def __init__(
    self,
    target_name,
    input_file,
    output_file,
    prefix,
    regions_file=None,
    dependencies=(),
):
    """Set up the 'depths' command producing a depth histogram.

    Arguments:
      target_name -- value passed via '--target-name'.
      input_file -- BAM file given as the first positional value.
      output_file -- output given as the second positional value.
      prefix -- mapping; prefix["IndexFormat"] is read only when a
                regions file is given.
      regions_file -- optional file passed via '--regions-file'; also
                      requires the index '<input_file><IndexFormat>'.
      dependencies -- forwarded unchanged to CommandNode.
    """
    # Only looked up when regions are used; otherwise mirrors regions_file
    index_format = prefix["IndexFormat"] if regions_file else regions_file

    depths = factory.new("depths")
    for positional in ("%(IN_BAM)s", "%(OUT_FILE)s"):
        depths.add_value(positional)
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(OUT_FILE=output_file, IN_BAM=input_file)

    if regions_file:
        depths.set_option("--regions-file", "%(IN_REGIONS)s")
        depths.set_kwargs(
            IN_REGIONS=regions_file,
            TEMP_IN_INDEX=input_file + index_format,
        )

    label = "<DepthHistogram: %s -> '%s'>" % (input_file, output_file)
    CommandNode.__init__(self,
                         command=depths.finalize(),
                         description=label,
                         dependencies=dependencies)
def __init__(self, parameters):
    """Create the node from a parameters object.

    Reads `command`, `input_alignment`, `output_alignment` and
    `dependencies` from `parameters`; the kwargs of the command are kept
    as self._kwargs.
    """
    builder = parameters.command
    self._kwargs = builder.kwargs

    finalized = builder.finalize()
    label = "<RAxMLReduce: '%s' -> '%s'>" % (
        parameters.input_alignment,
        parameters.output_alignment,
    )
    CommandNode.__init__(
        self,
        command=finalized,
        description=label,
        dependencies=parameters.dependencies,
    )
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Create a BAM index by linking, indexing and moving the result.

    Arguments:
      infile -- the BAM file to index.
      index_format -- '.bai' or '.csi'; selects the samtools arguments
                      and the version requirement.
      dependencies -- forwarded unchanged to CommandNode.

    Raises ValueError if index_format is neither '.bai' nor '.csi'.
    """
    basename = os.path.basename(infile)

    if index_format == '.bai':
        samtools_version = SAMTOOLS_VERSION
        samtools_call = ["samtools", "index", "%(TEMP_IN_BAM)s"]
    elif index_format == '.csi':
        samtools_version = SAMTOOLS_VERSION_1x
        samtools_call = ["samtools", "index", "-c", "%(TEMP_IN_BAM)s"]
    else:
        message = "Unknown format type %r; expected .bai or .csi"
        raise ValueError(message % (index_format,))

    # 1. Link the BAM into the temporary folder
    link_call = ["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"]
    cmd_link = AtomicCmd(link_call,
                         IN_BAM=infile,
                         TEMP_OUT_BAM=basename,
                         set_cwd=True)

    # 2. Index the linked file
    cmd_index = AtomicCmd(samtools_call,
                          TEMP_IN_BAM=basename,
                          CHECK_SAM=samtools_version)

    # 3. Move the index to its final destination
    move_call = ["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"]
    cmd_rename = AtomicCmd(move_call,
                           TEMP_IN_BAM=basename + index_format,
                           OUT_BAM=swap_ext(infile, index_format))

    format_name = index_format[1:].upper()
    CommandNode.__init__(
        self,
        description="<BAMIndex (%s): '%s'>" % (format_name, infile),
        command=SequentialCmds((cmd_link, cmd_index, cmd_rename)),
        dependencies=dependencies,
    )
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Set up the 'build_tped' command writing TPED/TFAM files.

    Arguments:
      output_root -- folder receiving the four output files.
      table -- input table passed as a positional value.
      bamfile -- input BAM passed as a positional value.
      downsample -- value for '--downsample'; when false, the '.bai'
                    index of `bamfile` is registered as an input.
      dependencies -- forwarded unchanged to CommandNode.
    """
    def in_root(filename):
        # Location of an output file inside output_root
        return os.path.join(output_root, filename)

    tped = factory.new("build_tped")
    tped.set_option("--name", "Sample")
    tped.set_option("--downsample", downsample)
    tped.add_value("%(TEMP_DIR)s")
    tped.add_value("%(IN_TABLE)s")
    tped.add_value("%(IN_BAM)s")

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        tped.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    tped.set_kwargs(
        OUT_TFAM=in_root("common.tfam"),
        OUT_SUMMARY=in_root("common.summary"),
        OUT_TPED_INCL_TS=in_root("incl_ts.tped"),
        OUT_TPED_EXCL_TS=in_root("excl_ts.tped"),
        IN_TABLE=table,
        IN_BAM=bamfile,
    )

    CommandNode.__init__(
        self,
        description="<BuildTPEDFiles -> %r>" % (in_root('*'),),
        command=tped.finalize(),
        dependencies=dependencies,
    )
def __init__(self, infile, bedfile, outfile, padding, options={}, dependencies=()):
    """Set up the 'vcf_to_fasta' command writing to `outfile` via STDOUT.

    Arguments:
      infile -- VCF passed via '--genotype'; '<infile>.tbi' is also
                registered as an input.
      bedfile -- file passed via '--intervals'.
      outfile -- file receiving the command's STDOUT.
      padding -- value passed via '--padding'.
      options -- user options applied to the command.
      dependencies -- forwarded unchanged to CommandNode.
    """
    builder = factory.new("vcf_to_fasta")
    for flag, value in (
        ("--padding", padding),
        ("--genotype", "%(IN_VCFFILE)s"),
        ("--intervals", "%(IN_INTERVALS)s"),
    ):
        builder.set_option(flag, value)

    builder.set_kwargs(IN_VCFFILE=infile,
                       IN_TABIX=infile + ".tbi",
                       IN_INTERVALS=bedfile,
                       OUT_STDOUT=outfile)

    apply_options(builder, options)

    CommandNode.__init__(
        self,
        description="<BuildRegions: '%s' -> '%s'>" % (infile, outfile),
        command=builder.finalize(),
        dependencies=dependencies,
    )
def __init__(self, output_prefix, tfam, tped, indep_filter=None, indep_parameters=None, plink_parameters=None, dependencies=()):
    """Create the 'plink --make-bed' command for TPED/TFAM conversion.

    Arguments:
      output_prefix -- prefix of the .bed/.bim/.fam/.log output files.
      tfam, tped -- input files passed via '--tfam' and '--tped'.
      indep_filter, indep_parameters -- accepted but not used here.
      plink_parameters -- extra options, parsed by self._parse_parameters.
      dependencies -- forwarded unchanged to CommandNode.
    """
    temp_prefix = os.path.basename(output_prefix)

    arguments = [
        "plink",
        "--make-bed",
        "--noweb",
        "--tped", "%(IN_TPED)s",
        "--tfam", "%(IN_TFAM)s",
        "--out", "%(TEMP_OUT_PREFIX)s",
    ]
    for extra in self._parse_parameters(plink_parameters):
        arguments.append(extra)

    # Final outputs share output_prefix; the rest stays in the temp folder
    tracked = {
        "IN_TPED": tped,
        "IN_TFAM": tfam,
        "TEMP_OUT_PREFIX": temp_prefix,
        "OUT_BED": output_prefix + ".bed",
        "OUT_BIM": output_prefix + ".bim",
        "OUT_FAM": output_prefix + ".fam",
        "OUT_LOG": output_prefix + ".log",
        "TEMP_OUT_NOSEX": temp_prefix + ".nosex",
        "TEMP_OUT_NOF": temp_prefix + ".nof",
        "CHECK_VERSION": PLINK_VERSION,
    }
    command = AtomicCmd(arguments, set_cwd=True, **tracked)

    label = "<BuildBEDFiles -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=label,
                         command=command,
                         dependencies=dependencies)
def __init__(self, input_file, output_prefix, order, samples, dependencies=()):
    """Set up the 'admixture.r' Rscript call that plots admixture results.

    Arguments:
      input_file -- file passed to the script as IN_FILE.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      order -- iterable stored as a tuple with "Sample" appended.
      samples -- registered as IN_SAMPLES and stored as self._samples.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._samples = samples
    self._order = tuple(order) + ("Sample",)

    admix_script = rtools.rscript("zonkey", "admixture.r")
    call = (
        "Rscript",
        admix_script,
        "%(IN_FILE)s",
        "%(TEMP_OUT_NAMES)s",
        "%(TEMP_OUT_PREFIX)s",
    )
    plot = AtomicCmd(
        call,
        AUX_RSCRIPT=admix_script,
        IN_FILE=input_file,
        IN_SAMPLES=samples,
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        TEMP_OUT_NAMES="samples.txt",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_RESHAPE2=rtools.requirement("reshape2"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<AdmixturePlot -> '%s.*'>" % (output_prefix,),
        command=plot,
        dependencies=dependencies,
    )
def __init__(self, infile, index_format=".bai", dependencies=()):
    """Set up 'samtools index' writing the index next to `infile`.

    Arguments:
      infile -- the BAM file to index.
      index_format -- ".bai" or ".csi" (adds '-c'); appended to `infile`
                      to form the index filename.
      dependencies -- forwarded unchanged to CommandNode.

    Raises ValueError if index_format is neither ".bai" nor ".csi".
    """
    call = ["samtools", "index"]
    if index_format == ".bai":
        pass
    elif index_format == ".csi":
        call.append("-c")
    else:
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))
    call.extend(("%(IN_BAM)s", "%(OUT_IDX)s"))

    indexer = AtomicCmd(call,
                        IN_BAM=infile,
                        OUT_IDX=infile + index_format,
                        CHECK_SAM=SAMTOOLS_VERSION)

    format_name = index_format[1:].upper()
    CommandNode.__init__(self,
                         description="<BAMIndex (%s): '%s'>"
                         % (format_name, infile),
                         command=indexer,
                         dependencies=dependencies)
def __init__(self, input_file, output_prefix, order, samples, dependencies=()):
    """Create the Rscript command ('admixture.r') for the admixture plot.

    Arguments:
      input_file -- file passed to the script as IN_FILE.
      output_prefix -- prefix of the '.pdf' and '.png' output files.
      order -- iterable; kept as a tuple with "Sample" appended.
      samples -- registered as IN_SAMPLES; kept as self._samples.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._samples = samples
    self._order = tuple(order) + ("Sample",)

    script = rtools.rscript("zonkey", "admixture.r")

    # Files and requirements tracked for the command
    tracked = {
        "AUX_RSCRIPT": script,
        "IN_FILE": input_file,
        "IN_SAMPLES": samples,
        "OUT_PDF": output_prefix + ".pdf",
        "OUT_PNG": output_prefix + ".png",
        "TEMP_OUT_NAMES": "samples.txt",
        "TEMP_OUT_PREFIX": os.path.basename(output_prefix),
        "CHECK_R": RSCRIPT_VERSION,
        "CHECK_R_GGPLOT2": rtools.requirement("ggplot2"),
        "CHECK_R_RESHAPE2": rtools.requirement("reshape2"),
    }
    cmd = AtomicCmd(
        ("Rscript", script,
         "%(IN_FILE)s", "%(TEMP_OUT_NAMES)s", "%(TEMP_OUT_PREFIX)s"),
        set_cwd=True,
        **tracked
    )

    label = "<AdmixturePlot -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=label,
                         command=cmd,
                         dependencies=dependencies)
def __init__(self, config, input_bams, command, index_format=None, description=None, threads=1, dependencies=()):
    """Wrap `command`, adding a Picard MergeSamFiles step for several BAMs.

    Arguments:
      config -- passed on to picard_command.
      input_bams -- one or more BAM files; stored as a tuple.
      command -- command to run; combined with the merge command in a
                 ParallelCmds group when more than one BAM is given.
      index_format -- None, ".bai" or ".csi"; only allowed for one BAM.
      description, threads, dependencies -- forwarded to CommandNode.

    Raises ValueError if no BAMs are given, if an index is requested for
    more than one BAM, or if the index format is unknown.
    """
    self._input_bams = safe_coerce_to_tuple(input_bams)
    self._index_format = index_format

    needs_merge = len(self._input_bams) > 1
    if not self._input_bams:
        raise ValueError("No input BAM files specified!")
    if needs_merge and index_format:
        raise ValueError("BAM index cannot be required for > 1 file")
    if index_format not in (None, ".bai", ".csi"):
        raise ValueError("Unknown index format %r" % (index_format,))

    if needs_merge:
        merge = picard_command(config, "MergeSamFiles")
        # Validation is mostly left to manual ValidateSamFile runs; this
        # is because .csi indexed BAM records can have "invalid" bins.
        for key, value in (
            ("SO", "coordinate"),
            ("COMPRESSION_LEVEL", 0),
            ("OUTPUT", "%(TEMP_OUT_BAM)s"),
            ("VALIDATION_STRINGENCY", "LENIENT"),
        ):
            merge.set_option(key, value, sep="=")

        merge.add_multiple_options("I", input_bams, sep="=")
        merge.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)

        command = ParallelCmds([merge.finalize(), command])

    CommandNode.__init__(
        self,
        command=command,
        description=description,
        threads=threads,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_file, algorithm="auto", options={}, dependencies=()):
    """Set up a MAFFT command writing the alignment to `output_file`.

    Arguments:
      input_file -- FASTA file appended to the preset command line.
      output_file -- file receiving the command's STDOUT; also stored as
                     self._output_file.
      algorithm -- key (lower-cased) selecting a call from _PRESETS.
      options -- user options applied to the command.
      dependencies -- forwarded unchanged to CommandNode.
    """
    preset_call = _PRESETS[algorithm.lower()]
    mafft = AtomicCmdBuilder(preset_call + ["%(IN_FASTA)s"],
                             IN_FASTA=input_file,
                             OUT_STDOUT=output_file,
                             CHECK_VERSION=MAFFT_VERSION)
    apply_options(mafft, options)

    self._output_file = output_file

    label = "<MAFFTNode (%s): '%s' -> '%s'>" % (algorithm, input_file,
                                                 output_file)
    CommandNode.__init__(self,
                         command=mafft.finalize(),
                         description=label,
                         dependencies=dependencies)
def __init__(self, input_prefix, output_prefix, nchroms, dependencies=()):
    """Set up a 'smartpca' run driven by a temporary parameters file.

    Arguments:
      input_prefix -- prefix of the .bed/.bim/.fam input files; stored
                      as self._input_prefix.
      output_prefix -- prefix of the .log/.evec/.eval/.deleted_snps
                       output files; stored as self._output_prefix.
      nchroms -- stored as self._nchroms; not used in this method.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._input_prefix = input_prefix
    self._output_prefix = output_prefix
    self._nchroms = nchroms

    cmd = AtomicCmd(
        ("smartpca", "-p", "%(TEMP_OUT_PARAMS)s"),
        # 'parameters.txt' is declared as a temporary output of the command
        TEMP_OUT_PARAMS="parameters.txt",
        IN_FILE_BED=input_prefix + ".bed",
        IN_FILE_BIM=input_prefix + ".bim",
        IN_FILE_FAM=input_prefix + ".fam",
        OUT_STDOUT=output_prefix + ".log",
        OUT_EVEC=output_prefix + ".evec",
        OUT_EVAL=output_prefix + ".eval",
        OUT_SNPS=output_prefix + ".deleted_snps",
        CHECK_VERSION=SMARTPCA_VERSION,
        set_cwd=True,
    )

    # The closing single quote was missing from this description, which
    # left the quoted output pattern unbalanced
    CommandNode.__init__(
        self,
        description="<SmartPCA -> '%s.*'>" % (output_prefix,),
        command=cmd,
        dependencies=dependencies,
    )
def __init__(self, control_file, sequence_file, trees_file, output_tar, exclude_groups=(), dependencies=()):
    """Set up 'codeml' followed by a 'tar' call archiving its results.

    Arguments:
      control_file, sequence_file, trees_file -- inputs registered for
          codeml; each is also stored on the node.
      output_tar -- archive created by the tar command.
      exclude_groups -- stored as a frozenset in self._exclude_groups.
      dependencies -- forwarded unchanged to CommandNode.
    """
    self._exclude_groups = safe_coerce_to_frozenset(exclude_groups)
    self._control_file = control_file
    self._sequence_file = sequence_file
    self._trees_file = trees_file

    codeml_files = dict(
        IN_CONTROL_FILE=control_file,
        IN_SEQUENCE_FILE=sequence_file,
        IN_TREES_FILE=trees_file,
        TEMP_OUT_CTL="template.ctl",
        TEMP_OUT_SEQS="template.seqs",
        TEMP_OUT_TREES="template.trees",
        TEMP_OUT_STDOUT="template.stdout",
        TEMP_OUT_STDERR="template.stderr",
        TEMP_OUT_4FOLD="4fold.nuc",
        # Prevent prompts from blocking
        IN_STDIN="/dev/null",
        set_cwd=True,
    )
    codeml = AtomicCmd(["codeml", "template.ctl"],
                       **dict(codeml_files,
                              **CodemlNode._get_codeml_files("TEMP_OUT_CODEML")))

    # One '%(KEY)s' placeholder per file that goes into the archive
    archived = CodemlNode._get_codeml_files("TEMP_IN_CODEML")
    placeholders = ["%%(%s)s" % (key,) for key in archived]
    tar = AtomicCmd(["tar", "cvzf", "%(OUT_FILE)s"] + placeholders,
                    OUT_FILE=output_tar,
                    set_cwd=True,
                    **archived)

    CommandNode.__init__(self,
                         description="<CodemlNode: %r -> %r>"
                         % (sequence_file, output_tar),
                         command=SequentialCmds([codeml, tar]),
                         dependencies=dependencies)
def __init__(self, input_file, output_prefix, threads=1, options={}, dependencies=()):
    """Set up a single-end adapter-removal command.

    Arguments:
      input_file -- reads passed via '--file1'.
      output_prefix -- prefix of the '.settings', '.truncated.gz' and
                       '.discarded.gz' output files.
      threads -- passed to _get_common_parameters and to CommandNode.
      options -- user options; used for the common parameters and then
                 applied to the command.
      dependencies -- forwarded unchanged to CommandNode.
    """
    # Parameters shared between the SE and PE variants
    builder = _get_common_parameters(threads=threads, options=options)

    # Prefix for output files, ensure that all end up in temp folder
    builder.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

    def gzipped(kind):
        # '<output_prefix>.<kind>.gz'
        return (output_prefix + ".%s.gz") % (kind,)

    builder.set_kwargs(
        TEMP_OUT_BASENAME=os.path.basename(output_prefix),
        OUT_SETTINGS=output_prefix + ".settings",
        OUT_MATE_1=gzipped("truncated"),
        OUT_DISCARDED=gzipped("discarded"),
    )

    builder.set_option("--file1", "%(IN_READS_1)s")
    builder.set_kwargs(IN_READS_1=input_file)

    apply_options(builder, options)

    label = "<AdapterRM (SE): %s -> '%s.*'>" % (
        fileutils.describe_files(input_file),
        output_prefix,
    )
    CommandNode.__init__(self,
                         command=builder.finalize(),
                         threads=threads,
                         description=label,
                         dependencies=dependencies)
def __init__(self, parameters):
    """Create the node from a parameters object.

    Reads `directory`, `command` and `dependencies` from `parameters`;
    the directory is kept as self._directory.
    """
    self._directory = parameters.directory

    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<mapDamage (model): %r>" % (parameters.directory,),
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Create the node from a parameters object.

    Reads `command`, `input_file`, `prefix` and `dependencies` from
    `parameters`.
    """
    finalized = parameters.command.finalize()
    label_fields = (parameters.input_file, parameters.prefix)

    CommandNode.__init__(
        self,
        command=finalized,
        description="<BWA Index '%s' -> '%s.*'>" % label_fields,
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Create the node from a parameters object.

    Reads `command`, `infile`, `outfile` and `dependencies` from
    `parameters`.
    """
    finalized = parameters.command.finalize()
    label_fields = (parameters.infile, parameters.outfile)

    CommandNode.__init__(
        self,
        description="<BuildRegions: '%s' -> '%s'>" % label_fields,
        command=finalized,
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Initialise the node from `parameters`.

    The `directory` attribute is stored as self._directory and used in
    the description; `command` is finalized and `dependencies` passed on.
    """
    directory = parameters.directory
    self._directory = directory

    node_args = {
        "command": parameters.command.finalize(),
        "description": "<mapDamage (model): %r>" % (directory,),
        "dependencies": parameters.dependencies,
    }
    CommandNode.__init__(self, **node_args)
def __init__(self, data, input_file, output_prefix, m=0, k=100, outgroup=(), dependencies=()):
    """Set up a 'treemix' run and record its parameters on the node.

    Arguments:
      data -- object providing 'contigs' and 'settings'; only read when
              `k` is given as a (key, path) tuple.
      input_file -- file passed to treemix via '-i'.
      output_prefix -- prefix of all output files.
      m -- value passed via '-m'.
      k -- either an int passed via '-k', or a tuple of two strings
           stored as (self._k_field, self._k_file).
      outgroup -- names joined by ',' and passed via '-root' if non-empty.
      dependencies -- forwarded unchanged to CommandNode.

    Raises ValueError if `k` is neither an int nor a tuple of strings.
    """
    treemix_call = ["treemix",
                    "-i", "%(IN_FILE)s",
                    "-o", "%(TEMP_OUT_PREFIX)s",
                    "-global",
                    "-m", m]
    if outgroup:
        treemix_call += ["-root", ",".join(outgroup)]

    self._param_m = m
    self._param_outgroup = outgroup
    self._params_file = output_prefix + ".parameters.txt"

    if isinstance(k, int):
        treemix_call += ["-k", k]
        self._param_k = k
        self._k_file = None
        self._k_field = None
    elif isinstance(k, tuple) and all(isinstance(v, str) for v in k):
        # No '-k' is added here; the values needed later are stored instead
        self._k_field, self._k_file = k
        self._genome_size = sum(contig["Size"]
                                for contig in data.contigs.itervalues())
        self._snp_distance = data.settings["SNPDistance"]
    else:
        raise ValueError("k must be int or (key, path) in TreemixNode")

    params_hash = hash_params(k=k,
                              m=m,
                              global_set=True,
                              outgroup=tuple(sorted(outgroup)))
    self._parameters_hash = "%s.%s" % (output_prefix, params_hash)

    output_files = {
        "OUT_FILE_COV": output_prefix + ".cov.gz",
        "OUT_FILE_COVSE": output_prefix + ".covse.gz",
        "OUT_FILE_EDGES": output_prefix + ".edges.gz",
        "OUT_FILE_LLIK": output_prefix + ".llik",
        "OUT_FILE_MODELCOV": output_prefix + ".modelcov.gz",
        "OUT_FILE_TREEOUT": output_prefix + ".treeout.gz",
        "OUT_FILE_VERTICES": output_prefix + ".vertices.gz",
        "OUT_FILE_PARAMS": self._params_file,
        "OUT_FILE_PARAMS_HASH": self._parameters_hash,
    }
    treemix = AtomicCmd(treemix_call,
                        IN_FILE=input_file,
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        CHECK_VERSION=TREEMIX_VERSION,
                        set_cwd=True,
                        **output_files)

    CommandNode.__init__(self,
                         description="<Treemix -> '%s.*'>"
                         % (output_prefix,),
                         command=treemix,
                         dependencies=dependencies)
def __init__(
    self,
    input_binary,
    initial_tree,
    output_template,
    model="GAMMA",
    threads=1,
    dependencies=(),
):
    """Set up an ExaML run.

    Arguments:
      input_binary -- A binary alignment file in a format readable by
          ExaML; passed via '-s'.
      initial_tree -- Tree file passed via '-t'.
      output_template -- Template used to construct the final filenames:
          a full path containing a single '%s', which is replaced with
          the variable part of each output filename.
      model -- Value passed via '-m' (default "GAMMA").
      threads -- Passed to the MPI command builder and to CommandNode.
      dependencies -- Forwarded unchanged to CommandNode.

    Example destination: '/disk/project/SN013420.RAxML.%s'
    Example output:      '/disk/project/SN013420.RAxML.bestTree'
    """
    # TODO: Make MPIParams!
    examl = AtomicMPICmdBuilder("examl", threads=threads)

    # '-w %(TEMP_DIR)s' ensures that output is saved to the temporary
    # directory
    for flag, value in (
        ("-w", "%(TEMP_DIR)s"),
        ("-s", "%(IN_ALN)s"),
        ("-t", "%(IN_TREE)s"),
        ("-n", "Pypeline"),
    ):
        examl.set_option(flag, value)

    # Only generated by newer versions of ExaML
    model_file = os.path.basename(output_template % "modelFile")

    examl.set_kwargs(
        IN_ALN=input_binary,
        IN_TREE=initial_tree,
        # Final output files, are not created directly
        OUT_INFO=output_template % "info",
        OUT_BESTTREE=output_template % "result",
        OUT_BOOTSTRAP=output_template % "log",
        TEMP_OUT_MODELFILE=model_file,
        CHECK_EXAML=EXAML_VERSION,
    )

    # Use the GAMMA model of NT substitution by default
    examl.set_option("-m", model)

    self._dirname, self._template = os.path.split(output_template)
    self._dirname = os.path.dirname(output_template)
    self._template = os.path.basename(output_template)

    label = "<ExaML (%i thread(s)): '%s' -> '%s'>" % (
        threads,
        input_binary,
        output_template,
    )
    CommandNode.__init__(self,
                         command=examl.finalize(),
                         description=label,
                         threads=threads,
                         dependencies=dependencies)
def __init__(self, parameters):
    """Create the node from a parameters object.

    Stores the absolute path of `parameters.input_alignment` in
    self._symlinks and the basename of `parameters.output_tree` in
    self._output_tree, then finalizes `parameters.command`.
    """
    alignment = parameters.input_alignment
    tree = parameters.output_tree

    self._symlinks = [os.path.abspath(alignment)]
    self._output_tree = os.path.basename(tree)

    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<Parsimonator: '%s' -> '%s'>" % (alignment, tree),
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Initialise the node from `parameters`.

    self._symlinks holds the absolute path of the input alignment and
    self._output_tree the basename of the output tree; the command of
    `parameters` is finalized and its dependencies passed on.
    """
    self._symlinks = [os.path.abspath(parameters.input_alignment)]
    self._output_tree = os.path.basename(parameters.output_tree)

    finalized = parameters.command.finalize()
    label = "<Parsimonator: '%s' -> '%s'>" % (
        parameters.input_alignment,
        parameters.output_tree,
    )
    CommandNode.__init__(self,
                         command=finalized,
                         description=label,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Create the node from a parameters object.

    Copies `input_alignment`, `input_partitions` and `output_tree` from
    `parameters` onto the node, then finalizes `parameters.command`.
    """
    for name in ("input_alignment", "input_partitions", "output_tree"):
        setattr(self, "_" + name, getattr(parameters, name))

    label = "<RAxMLParsimonyTree: '%s' -> '%s'>" % (
        parameters.input_alignment,
        parameters.output_tree,
    )
    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description=label,
        dependencies=parameters.dependencies,
    )
def __init__(self, output_prefix, tfam, tped, indep_filter=None, indep_parameters=None, plink_parameters=None, dependencies=()):
    """Set up two plink runs: pruning of SNPs, then filtered BED output.

    The first command runs plink with '--<indep_filter>' and writes
    'indep.*' files in the temporary folder; the second runs
    'plink --make-bed --extract indep.prune.in'.

    Arguments:
      output_prefix -- prefix of the .bed/.bim/.fam/.log output files.
      tfam, tped -- input files passed via '--tfam' and '--tped'.
      indep_filter -- 'indep', 'indep-pairphase' or 'indep-pairwise'.
      indep_parameters -- three values appended after the filter option.
      plink_parameters -- extra options, parsed by self._parse_parameters
                          and appended to both calls.
      dependencies -- forwarded unchanged to CommandNode.
    """
    assert indep_filter in ('indep', 'indep-pairphase', 'indep-pairwise'), indep_filter
    assert len(indep_parameters) == 3, indep_parameters

    parameters = self._parse_parameters(plink_parameters)
    shared_inputs = ["--tped", "%(IN_TPED)s", "--tfam", "%(IN_TFAM)s"]

    # Step 1: determine which SNPs to keep
    prune_call = ["plink", "--noweb"] + shared_inputs
    prune_call += ["--out", "%(TEMP_OUT_PREFIX)s", '--' + indep_filter]
    prune_call.extend(indep_parameters)
    prune_call.extend(parameters)

    prune = AtomicCmd(
        prune_call,
        IN_TFAM=tfam,
        IN_TPED=tped,
        TEMP_OUT_PREFIX="indep",
        TEMP_OUT_LOG="indep.log",
        TEMP_OUT_NOSEX="indep.nosex",
        TEMP_OUT_PRUNE_IN="indep.prune.in",
        TEMP_OUT_PRUNE_OUT="indep.prune.out",
        set_cwd=True,
    )

    # Step 2: build BED files from the SNPs listed in 'indep.prune.in'
    basename = os.path.basename(output_prefix)
    extract_call = ["plink", "--noweb", "--make-bed"] + shared_inputs
    extract_call += ["--extract", "%(TEMP_IN_PRUNE)s"]
    extract_call += ["--out", "%(TEMP_OUT_PREFIX)s"]

    extract = AtomicCmd(
        extract_call + parameters,
        IN_TFAM=tfam,
        IN_TPED=tped,
        TEMP_OUT_PREFIX=basename,
        TEMP_IN_PRUNE="indep.prune.in",
        TEMP_OUT_NOSEX=basename + ".nosex",
        TEMP_OUT_LOG=basename + ".log",
        OUT_LOG=output_prefix + ".log",
        OUT_BED=output_prefix + ".bed",
        OUT_BIM=output_prefix + ".bim",
        OUT_FAM=output_prefix + ".fam",
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<BuildFilteredBEDFiles -> '%s.*'>" % (output_prefix,),
        command=SequentialCmds((prune, extract)),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Create the node from a parameters object.

    Reads `algorithm`, `input_file`, `output_file`, `command` and
    `dependencies` from `parameters`; the output file is kept as
    self._output_file.
    """
    self._output_file = parameters.output_file

    label_fields = (
        parameters.algorithm,
        parameters.input_file,
        parameters.output_file,
    )
    label = "<MAFFTNode (%s): '%s' -> '%s'>" % label_fields

    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description=label,
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Stores the input VCF path on the instance, finalizes the command held
    by `parameters`, and initialises the CommandNode base with it together
    with a description and the dependencies.
    """
    self._in_vcf = parameters.infile_vcf

    finalized = parameters.command.finalize()
    endpoints = (parameters.infile_vcf, parameters.outfile)

    CommandNode.__init__(self,
                         description="<VCFPileup: '%s' -> '%s'>" % endpoints,
                         command=finalized,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Stores the absolute paths of the input alignment and input partition
    in `self._symlinks`, and the base name of the output file in
    `self._output_file`, before initialising the CommandNode base with
    the finalized command, a description and the dependencies.
    """
    alignment_path = os.path.abspath(parameters.input_alignment)
    partition_path = os.path.abspath(parameters.input_partition)
    self._symlinks = [alignment_path, partition_path]
    self._output_file = os.path.basename(parameters.output_file)

    finalized = parameters.command.finalize()
    endpoints = (parameters.input_alignment, parameters.output_file)
    description = "<ExaMLParser: '%s' -> '%s'>" % endpoints

    CommandNode.__init__(self,
                         command=finalized,
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    The output file is kept on the instance; the CommandNode base receives
    the finalized command, the dependencies, and a description built from
    the algorithm, input file and output file in `parameters`.
    """
    self._output_file = parameters.output_file

    template = "<MAFFTNode (%s): '%s' -> '%s'>"
    description = template % (parameters.algorithm,
                              parameters.input_file,
                              parameters.output_file)

    CommandNode.__init__(self,
                         command=parameters.command.finalize(),
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, output_prefix, tfam, tped, indep_filter=None,
             indep_parameters=None, plink_parameters=None, dependencies=()):
    """Set up two sequential plink calls producing '<output_prefix>.*'.

    The first call runs the selected '--indep*' option on the tped/tfam
    pair and declares temporary 'indep.*' outputs. The second call reads
    the same inputs with '--make-bed' and '--extract' on the temporary
    'indep.prune.in' file, and declares the .log/.bed/.bim/.fam outputs
    under `output_prefix`.

    `indep_filter` must be 'indep', 'indep-pairphase' or 'indep-pairwise'
    and `indep_parameters` must hold exactly three values; both are checked
    with assertions, so the None defaults are not usable. The result of
    self._parse_parameters(plink_parameters) is appended to both calls.
    """
    assert indep_filter in ('indep', 'indep-pairphase', 'indep-pairwise'), \
        indep_filter
    assert len(indep_parameters) == 3, indep_parameters

    extra_args = self._parse_parameters(plink_parameters)

    # Step 1: run the '--indep*' filter; outputs are temporary 'indep.*'.
    indep_call = ["plink", "--noweb",
                  "--tped", "%(IN_TPED)s",
                  "--tfam", "%(IN_TFAM)s",
                  "--out", "%(TEMP_OUT_PREFIX)s",
                  '--' + indep_filter]
    for extension in (indep_parameters, extra_args):
        indep_call.extend(extension)

    indep_files = {
        "IN_TFAM": tfam,
        "IN_TPED": tped,
        "TEMP_OUT_PREFIX": "indep",
        "TEMP_OUT_LOG": "indep.log",
        "TEMP_OUT_NOSEX": "indep.nosex",
        "TEMP_OUT_PRUNE_IN": "indep.prune.in",
        "TEMP_OUT_PRUNE_OUT": "indep.prune.out",
    }
    indep_step = AtomicCmd(indep_call, set_cwd=True, **indep_files)

    # Step 2: extract the entries listed in 'indep.prune.in' into BED files.
    out_name = os.path.basename(output_prefix)
    extract_call = ["plink", "--noweb", "--make-bed",
                    "--tped", "%(IN_TPED)s",
                    "--tfam", "%(IN_TFAM)s",
                    "--extract", "%(TEMP_IN_PRUNE)s",
                    "--out", "%(TEMP_OUT_PREFIX)s"] + extra_args

    extract_files = {
        "IN_TFAM": tfam,
        "IN_TPED": tped,
        "TEMP_OUT_PREFIX": out_name,
        "TEMP_IN_PRUNE": "indep.prune.in",
        "TEMP_OUT_NOSEX": out_name + ".nosex",
        "TEMP_OUT_LOG": out_name + ".log",
        "OUT_LOG": output_prefix + ".log",
        "OUT_BED": output_prefix + ".bed",
        "OUT_BIM": output_prefix + ".bim",
        "OUT_FAM": output_prefix + ".fam",
    }
    extract_step = AtomicCmd(extract_call, set_cwd=True, **extract_files)

    description = "<BuildFilteredBEDFiles -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=description,
                         command=SequentialCmds((indep_step, extract_step)),
                         dependencies=dependencies)
def __init__(self, input_file, destination, dependencies=()):
    """Create a node that runs 'md5sum' on `input_file`.

    The command's standard output is declared as `destination`
    (via OUT_STDOUT), and `dependencies` is forwarded to CommandNode.
    """
    checksum_call = ("md5sum", "%(IN_FILE)s")
    checksum_cmd = AtomicCmd(checksum_call,
                             IN_FILE=input_file,
                             OUT_STDOUT=destination)

    CommandNode.__init__(self,
                         description="<MD5Sum %s -> %s>" % (input_file,
                                                            destination),
                         command=checksum_cmd,
                         dependencies=dependencies)
def __init__(self, input_prefix, output_prefix, tfam, parameters=None,
             dependencies=()):
    """Set up a plink '--freq --missing' run followed by gzip.

    The plink call reads the binary fileset at `input_prefix`
    (.bed/.bim/.fam), takes its '--within' argument from the temporary
    file 'samples.clust', and writes under the base name of
    `output_prefix`. The second command gzips the temporary
    '<basename>.frq.strat' file; the declared final output is
    '<output_prefix>.frq.strat.gz'. The two commands run sequentially.

    `parameters`, when non-empty, is a whitespace-separated string of
    extra plink arguments appended to the command line. `tfam` and the
    output base name are stored on the instance for later use.
    """
    basename = os.path.basename(output_prefix)

    plink_cmd = [
        "plink", "--freq", "--missing", "--noweb",
        "--bfile", os.path.abspath(input_prefix),
        "--within", "%(TEMP_OUT_CLUST)s",
        "--out", "%(TEMP_OUT_PREFIX)s",
    ]
    if parameters:
        plink_cmd.extend(parameters.split())

    plink = AtomicCmd(
        plink_cmd,
        IN_BED=input_prefix + ".bed",
        IN_BIM=input_prefix + ".bim",
        IN_FAM=input_prefix + ".fam",
        TEMP_OUT_CLUST="samples.clust",
        TEMP_OUT_IMISS=basename + ".imiss",
        TEMP_OUT_LMISS=basename + ".lmiss",
        OUT_NOSEX=output_prefix + ".frq.strat.nosex",
        OUT_LOG=output_prefix + ".frq.strat.log",
        TEMP_OUT_PREFIX=basename,
        CHECK_VERSION=PLINK_VERSION,
        set_cwd=True,
    )

    gzip = AtomicCmd(
        ["gzip", "%(TEMP_IN_FREQ)s"],
        TEMP_IN_FREQ=basename + ".frq.strat",
        OUT_FREQ=output_prefix + ".frq.strat.gz",
    )

    self._tfam = tfam
    self._basename = basename

    CommandNode.__init__(
        self,
        # The description is wrapped in '<...>' like the other nodes.
        description="<BuildFreqFiles -> '%s.*'>" % (output_prefix,),
        command=SequentialCmds((plink, gzip)),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Stores the input alignment and input partition in `self._symlinks`
    and the base name of the output template in `self._template`. The
    description shows the output template with "*" substituted for its
    single placeholder. The thread count and dependencies are taken from
    `parameters`.
    """
    self._symlinks = [parameters.input_alignment,
                      parameters.input_partition]
    self._template = os.path.basename(parameters.output_template)

    finalized = parameters.command.finalize()
    source = parameters.input_alignment
    wildcard_output = parameters.output_template % ("*",)
    description = "<RAxMLRapidBS: '%s' -> '%s'>" % (source, wildcard_output)

    CommandNode.__init__(self,
                         command=finalized,
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Splits the output template into its directory and base name, stored
    as `self._dirname` and `self._template`, then initialises the
    CommandNode base with the finalized command, a description (thread
    count, input binary, output template), the thread count and the
    dependencies.
    """
    template_path = parameters.output_template
    self._dirname = os.path.dirname(template_path)
    self._template = os.path.basename(template_path)

    finalized = parameters.command.finalize()
    details = (parameters.threads,
               parameters.input_binary,
               parameters.output_template)
    description = "<ExaML (%i thread(s)): '%s' -> '%s'>" % details

    CommandNode.__init__(self,
                         command=finalized,
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    The directory and base name of the output template are kept as
    `self._dirname` and `self._template`. The CommandNode base is given
    the finalized command, the thread count, the dependencies, and a
    description listing thread count, input binary and output template.
    """
    self._dirname = os.path.dirname(parameters.output_template)
    self._template = os.path.basename(parameters.output_template)

    finalized = parameters.command.finalize()
    template = "<ExaML (%i thread(s)): '%s' -> '%s'>"
    description = template % (parameters.threads,
                              parameters.input_binary,
                              parameters.output_template)

    CommandNode.__init__(self,
                         command=finalized,
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(self, infile, dependencies=()):
    """Create a node running 'samtools faidx' on `infile`.

    The command is given the base name of `infile` as a temporary input,
    while `infile` itself is declared as the input and '<infile>.fai' as
    the output. `infile` is also stored on the instance.
    """
    self._infile = infile

    faidx_call = ["samtools", "faidx", "%(TEMP_IN_FASTA)s"]
    faidx_files = {
        "TEMP_IN_FASTA": os.path.basename(infile),
        "IN_FASTA": infile,
        # NOTE: the key is named OUT_TBI although the file is a '.fai' index.
        "OUT_TBI": infile + ".fai",
        "CHECK_SAM": SAMTOOLS_VERSION,
    }
    faidx_cmd = AtomicCmd(faidx_call, **faidx_files)

    CommandNode.__init__(self,
                         description="<FastaIndex: '%s'>" % (infile,),
                         command=faidx_cmd,
                         dependencies=dependencies)
def __init__(self, data, input_file, output_prefix, m=0, k=100,
             outgroup=(), dependencies=()):
    """Set up a 'treemix' run writing files under `output_prefix`.

    `m` is passed to '-m'; a non-empty `outgroup` is comma-joined and
    passed to '-root'. `k` is either an int, which is passed to '-k', or
    a tuple of two strings unpacked as (field, file). In the tuple case no
    '-k' option is added here; the summed "Size" of data.contigs and
    data.settings["SNPDistance"] are stored on the instance instead.

    Raises ValueError if `k` is neither an int nor a tuple of strings.
    """
    treemix_call = ["treemix",
                    "-i", "%(IN_FILE)s",
                    "-o", "%(TEMP_OUT_PREFIX)s",
                    "-global",
                    "-m", m]
    if outgroup:
        treemix_call.extend(("-root", ",".join(outgroup)))

    self._param_m = m
    self._param_outgroup = outgroup
    self._params_file = output_prefix + ".parameters.txt"

    if isinstance(k, int):
        # A fixed value is handed straight to treemix.
        treemix_call.extend(("-k", k))
        self._param_k = k
        self._k_file = self._k_field = None
    elif isinstance(k, tuple) and all(isinstance(v, str) for v in k):
        # (field, file) pair; '-k' is not set on the command line here.
        self._k_field, self._k_file = k
        self._genome_size = sum(contig["Size"]
                                for contig in data.contigs.itervalues())
        self._snp_distance = data.settings["SNPDistance"]
    else:
        raise ValueError("k must be int or (key, path) in TreemixNode")

    params_digest = hash_params(k=k, m=m, global_set=True,
                                outgroup=tuple(sorted(outgroup)))
    self._parameters_hash = "%s.%s" % (output_prefix, params_digest)

    treemix_files = {
        "IN_FILE": input_file,
        "TEMP_OUT_PREFIX": os.path.basename(output_prefix),
        "OUT_FILE_COV": output_prefix + ".cov.gz",
        "OUT_FILE_COVSE": output_prefix + ".covse.gz",
        "OUT_FILE_EDGES": output_prefix + ".edges.gz",
        "OUT_FILE_LLIK": output_prefix + ".llik",
        "OUT_FILE_MODELCOV": output_prefix + ".modelcov.gz",
        "OUT_FILE_TREEOUT": output_prefix + ".treeout.gz",
        "OUT_FILE_VERTICES": output_prefix + ".vertices.gz",
        "OUT_FILE_PARAMS": self._params_file,
        "OUT_FILE_PARAMS_HASH": self._parameters_hash,
        "CHECK_VERSION": TREEMIX_VERSION,
    }
    treemix_cmd = AtomicCmd(treemix_call, set_cwd=True, **treemix_files)

    CommandNode.__init__(self,
                         description="<Treemix -> '%s.*'>" % (output_prefix,),
                         command=treemix_cmd,
                         dependencies=dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Stores the input alignment, input partition, output template, number
    of bootstraps and starting index from `parameters` on the instance.
    The description reports the inclusive index range
    start .. start + bootstraps - 1.
    """
    self._input_alignment = parameters.input_alignment
    self._input_partition = parameters.input_partition
    self._output_template = parameters.template
    self._bootstrap_num = parameters.bootstraps
    self._bootstrap_start = parameters.start

    finalized = parameters.command.finalize()

    # The range is written as "(first .. last)" inside the '<...>' wrapper.
    last_index = parameters.start + parameters.bootstraps - 1
    description = "<RAxMLBootstrap: '%s' -> '%s' (%i .. %i)>" \
        % (parameters.input_alignment, parameters.template,
           parameters.start, last_index)

    CommandNode.__init__(self,
                         command=finalized,
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Every command in `parameters.commands` is finalized in the order given
    by `parameters.order` and the results are combined with ParallelCmds.
    The description is produced by _get_node_description from the FASTQ
    input file and the prefix.
    """
    finalized = []
    for key in parameters.order:
        finalized.append(parameters.commands[key].finalize())
    command = ParallelCmds(finalized)

    description = _get_node_description(
        name="BWA Samse",
        input_files_1=parameters.input_file_fq,
        prefix=parameters.prefix)

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Every command in `parameters.commands` is finalized in the order given
    by `parameters.order` and the results are combined with ParallelCmds.
    The description is produced by _get_node_description; the thread count
    and dependencies are taken from `parameters`.
    """
    finalized = []
    for key in parameters.order:
        finalized.append(parameters.commands[key].finalize())
    command = ParallelCmds(finalized)

    description = _get_node_description(
        name="BWA",
        algorithm='Backtrack',
        input_files_1=parameters.input_file,
        prefix=parameters.prefix,
        threads=parameters.threads)

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Initialise the node from a parameters object.

    Every command in `parameters.commands` is finalized in the order given
    by `parameters.order` and the results are combined with ParallelCmds.
    The algorithm label in the description is "PE" when
    `parameters.input_file_2` is truthy and "SE" otherwise.
    """
    finalized = []
    for key in parameters.order:
        finalized.append(parameters.commands[key].finalize())
    command = ParallelCmds(finalized)

    if parameters.input_file_2:
        algorithm = "PE"
    else:
        algorithm = "SE"

    description = _get_node_description(
        name="Bowtie2",
        algorithm=algorithm,
        input_files_1=parameters.input_file_1,
        input_files_2=parameters.input_file_2,
        prefix=parameters.prefix,
        threads=parameters.threads)

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)