def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """Set up GATK IndelRealigner together with a 'samtools calmd' command.

    The GATK command is given 'infiles', 'reference' and 'intervals' and
    declares 'outfile' as its output.  The calmd command reads a temporary
    file named after the basename of 'outfile' and writes its stdout to a
    temporary '<basename>.calmd' file.  Both are wrapped in ParallelCmds.
    """
    self._basename = os.path.basename(outfile)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    gatk = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    for option, value in (
        ("-T", "IndelRealigner"),
        ("-R", "%(IN_REFERENCE)s"),
        ("-targetIntervals", "%(IN_INTERVALS)s"),
        ("-o", "%(OUT_BAMFILE)s"),
        ("--bam_compression", 0),
    ):
        gatk.set_option(option, value)
    gatk.set_option("--disable_bam_indexing")

    _set_input_files(gatk, infiles)
    gatk.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        IN_INTERVALS=intervals,
        OUT_BAMFILE=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
    calmd = AtomicCmd(
        calmd_call,
        TEMP_IN_BAM=self._basename,
        IN_REF=reference,
        TEMP_OUT_STDOUT=self._basename + ".calmd",
        CHECK_VERSION=SAMTOOLS_VERSION,
    )

    description = "<GATK Indel Realigner (aligning): %s -> %r>" % (
        describe_files(infiles),
        outfile,
    )
    commands = ParallelCmds([gatk.finalize(), calmd])

    CommandNode.__init__(
        self,
        description=description,
        command=commands,
        dependencies=dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Create the TreeMix plotting commands and run them sequentially.

    Three commands are built with self._plot_command: a tree plot and a
    residuals plot (each declaring a PDF and a PNG output), followed by a
    command declaring a '_variance.txt' output.
    """
    abs_prefix = os.path.abspath(prefix)
    basename = os.path.basename(output_prefix)

    # First entry: TreeMix plots with migration edges
    # Second entry: Heatmap showing TreeMix residuals
    plots = (
        ("plot_tree", "_tree", "_tree.pdf", "_tree.png"),
        ("plot_residuals", "_residuals", "_residuals.pdf", "_residuals.png"),
    )

    commands = []
    for plot_name, temp_suffix, pdf_suffix, png_suffix in plots:
        commands.append(
            self._plot_command(
                prefix,
                plot_name,
                abs_prefix,
                "%(IN_SAMPLES)s",
                "%(TEMP_OUT_PREFIX)s",
                IN_SAMPLES=samples,
                TEMP_OUT_PREFIX=basename + temp_suffix,
                OUT_PDF=output_prefix + pdf_suffix,
                OUT_PNG=output_prefix + png_suffix,
            )
        )

    # Text file containing % of variance explained by model
    commands.append(
        self._plot_command(
            prefix,
            "variance",
            abs_prefix,
            "%(OUT_TXT)s",
            OUT_TXT=output_prefix + "_variance.txt",
        )
    )

    CommandNode.__init__(
        self,
        description="<PlotTreemix -> '%s.*'>" % (output_prefix,),
        command=SequentialCmds(tuple(commands)),
        dependencies=dependencies,
    )
def __init__(self, config, reference, infiles, outfile, threads=1, dependencies=()):
    """Build the GATK RealignerTargetCreator command writing to 'outfile'.

    The thread count is first passed through _get_max_threads together
    with 'reference'; the result is used for '-nt' and for the node itself.
    """
    threads = _get_max_threads(reference, threads)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    gatk = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    for option, value in (
        ("-T", "RealignerTargetCreator"),
        ("-R", "%(IN_REFERENCE)s"),
        ("-o", "%(OUT_INTERVALS)s"),
        ("-nt", threads),
    ):
        gatk.set_option(option, value)

    _set_input_files(gatk, infiles)
    gatk.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        OUT_INTERVALS=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    description = "<GATK Indel Realigner (training): %s -> %r>" % (
        describe_files(infiles),
        outfile,
    )

    CommandNode.__init__(
        self,
        threads=threads,
        description=description,
        command=gatk.finalize(),
        dependencies=dependencies,
    )
def __init__(self, infile, bedfile, outfile, padding, options=None, dependencies=()):
    """Build the 'vcf_to_fasta' command for the given VCF and intervals.

    Args:
        infile: VCF file passed via '--genotype'; '<infile>.tbi' is also
            declared as an input.
        bedfile: Intervals file passed via '--intervals'.
        outfile: File receiving the command's stdout.
        padding: Value passed via '--padding'.
        options: Optional mapping of extra options handed to apply_options;
            None (the default) is treated as an empty dict.
        dependencies: Dependencies forwarded to CommandNode.
    """
    # A fresh dict per call avoids sharing one mutable default between calls
    if options is None:
        options = {}

    params = factory.new("vcf_to_fasta")
    params.set_option("--padding", padding)
    params.set_option("--genotype", "%(IN_VCFFILE)s")
    params.set_option("--intervals", "%(IN_INTERVALS)s")
    params.set_kwargs(
        IN_VCFFILE=infile,
        IN_TABIX=infile + ".tbi",
        IN_INTERVALS=bedfile,
        OUT_STDOUT=outfile,
    )

    apply_options(params, options)

    description = "<BuildRegions: '%s' -> '%s'>" % (infile, outfile)

    CommandNode.__init__(
        self,
        description=description,
        command=params.finalize(),
        dependencies=dependencies,
    )
def __init__(self, output_prefix, tfam, tped, indep_filter=None, indep_parameters=None, plink_parameters=None, dependencies=()):
    """Set up a 'plink --make-bed' call converting TPED/TFAM input.

    The command writes to a temporary prefix (the basename of
    'output_prefix') and declares the .bed/.bim/.fam/.log outputs; extra
    arguments come from self._parse_parameters(plink_parameters).
    """
    temp_prefix = os.path.basename(output_prefix)

    call = [
        "plink",
        "--make-bed",
        "--noweb",
        "--tped", "%(IN_TPED)s",
        "--tfam", "%(IN_TFAM)s",
        "--out", "%(TEMP_OUT_PREFIX)s",
    ]
    call += self._parse_parameters(plink_parameters)

    plink = AtomicCmd(
        call,
        IN_TPED=tped,
        IN_TFAM=tfam,
        TEMP_OUT_PREFIX=temp_prefix,
        OUT_BED=output_prefix + ".bed",
        OUT_BIM=output_prefix + ".bim",
        OUT_FAM=output_prefix + ".fam",
        OUT_LOG=output_prefix + ".log",
        TEMP_OUT_NOSEX=temp_prefix + ".nosex",
        TEMP_OUT_NOF=temp_prefix + ".nof",
        CHECK_VERSION=PLINK_VERSION,
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<BuildBEDFiles -> '%s.*'>" % (output_prefix,),
        command=plink,
        dependencies=dependencies,
    )
def __init__(self, samples, treefile, bootstraps, output_prefix, dependencies=()):
    """Set up the Rscript call to the zonkey 'tinytree.r' script.

    The script receives a temporary 'rerooted.newick' file, the samples
    file and a temporary output prefix; '<output_prefix>.pdf' and '.png'
    are declared as outputs.  'treefile' and 'bootstraps' are declared as
    inputs and stored on the node.
    """
    tree_script = rtools.rscript("zonkey", "tinytree.r")
    call = (
        "Rscript",
        tree_script,
        "%(TEMP_OUT_FILE)s",
        "%(IN_SAMPLES)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    rscript_cmd = AtomicCmd(
        call,
        AUX_RSCRIPT=tree_script,
        IN_SAMPLES=samples,
        IN_FILE=treefile,
        IN_BOOTSTRAPS=bootstraps,
        TEMP_OUT_FILE="rerooted.newick",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_TREE_PDF=output_prefix + ".pdf",
        OUT_TREE_PNG=output_prefix + ".png",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_RSCRIPT_APE=rtools.requirement("ape"),
        CHECK_RSCRIPT_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_RSCRIPT_GRID=rtools.requirement("grid"),
    )

    self._treefile = treefile
    self._bootstraps = bootstraps

    CommandNode.__init__(
        self,
        description="<DrawPhylogeny -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, infile, outfile, genome, from_start=0, from_end=0, strand_relative=False, dependencies=()):
    """Set up a 'bedtools slop' call writing its stdout to 'outfile'.

    'from_start' and 'from_end' are passed as '-l' and '-r'; they must be
    of exactly the same type, and '-pct' is added when that type is float.
    '-s' is added when 'strand_relative' is true.

    Raises ValueError if the two offsets differ in type.
    """
    offset_type = type(from_start)
    if offset_type is not type(from_end):
        raise ValueError("Parameters 'from_start' and 'from_end' should "
                         "be of same type!")

    call = [
        "bedtools", "slop",
        "-i", "%(IN_FILE)s",
        "-g", "%(IN_GENOME)s",
        "-l", str(from_start),
        "-r", str(from_end),
    ]

    if strand_relative:
        call.append("-s")

    if offset_type is float:
        call.append("-pct")

    slop = AtomicCmd(
        call,
        IN_FILE=infile,
        IN_GENOME=genome,
        OUT_STDOUT=outfile,
        CHECK_VERSION=BEDTOOLS_VERSION,
    )

    CommandNode.__init__(
        self,
        description="<SlopBed: '%s' -> '%s'>" % (infile, outfile),
        command=slop,
        dependencies=dependencies,
    )
def __init__(self, target_name, input_file, output_file, prefix, regions_file=None, dependencies=()):
    """Build the 'depths' command for 'input_file', writing 'output_file'.

    When 'regions_file' is given it is passed via '--regions-file', and the
    BAM index (input_file + prefix["IndexFormat"]) is declared as an
    additional input.
    """
    # The index format is only looked up when a regions file is in use
    index_format = prefix["IndexFormat"] if regions_file else regions_file

    depths = factory.new("depths")
    depths.add_value("%(IN_BAM)s")
    depths.add_value("%(OUT_FILE)s")
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(OUT_FILE=output_file, IN_BAM=input_file)

    if regions_file:
        depths.set_option("--regions-file", "%(IN_REGIONS)s")
        depths.set_kwargs(
            IN_REGIONS=regions_file,
            TEMP_IN_INDEX=input_file + index_format,
        )

    description = "<DepthHistogram: %s -> '%s'>" % (input_file, output_file)

    CommandNode.__init__(
        self,
        command=depths.finalize(),
        description=description,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """See CommandNode._setup.

    Symlinks the absolute path of self._infile into 'temp' (at the path
    given by reroot_path) before delegating to the base class.
    """
    os.symlink(
        os.path.abspath(self._infile),
        reroot_path(temp, self._infile),
    )

    CommandNode._setup(self, config, temp)
def _teardown(self, config, temp):
    """Remove the pipe file (and its index, if any) before base teardown."""
    pipe_path = os.path.join(temp, self.PIPE_FILE)
    os.remove(pipe_path)

    index_format = self._index_format
    if index_format:
        index_name = swap_ext(self.PIPE_FILE, index_format)
        os.remove(os.path.join(temp, index_name))

    CommandNode._teardown(self, config, temp)
def __init__(self, config, input_bams, command, index_format=None, description=None, threads=1, dependencies=()):
    """Wrap 'command' so that it can be fed one or more input BAM files.

    When more than one BAM is given, a Picard MergeSamFiles command writing
    to self.PIPE_FILE is built and run in parallel with 'command'.

    Args:
        config: Passed to picard_command when a merge is needed.
        input_bams: One or more BAM files; coerced with
            safe_coerce_to_tuple and stored as self._input_bams.
        command: The command to run (alone, or alongside the merge).
        index_format: None, ".bai" or ".csi"; stored as self._index_format.
        description, threads, dependencies: Forwarded to CommandNode.

    Raises:
        ValueError: if no BAMs are given, if an index is requested for
            more than one BAM, or if 'index_format' is not recognized.
    """
    self._input_bams = safe_coerce_to_tuple(input_bams)
    self._index_format = index_format

    if not self._input_bams:
        raise ValueError("No input BAM files specified!")
    elif len(self._input_bams) > 1 and index_format:
        raise ValueError("BAM index cannot be required for > 1 file")
    elif index_format not in (None, ".bai", ".csi"):
        raise ValueError("Unknown index format %r" % (index_format,))

    if len(self._input_bams) > 1:
        merge = picard_command(config, "MergeSamFiles")
        merge.set_option("SO", "coordinate", sep="=")
        merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
        merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        # Validation is mostly left to manual ValidateSamFile runs; this
        # is because .csi indexed BAM records can have "invalid" bins.
        merge.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        # Use the coerced tuple rather than the raw argument, so that the
        # files merged are exactly the files validated above; the raw
        # argument may be a single-pass iterable that is already consumed.
        merge.add_multiple_options("I", self._input_bams, sep="=")
        merge.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)

        command = ParallelCmds([merge.finalize(), command])

    CommandNode.__init__(
        self,
        command=command,
        description=description,
        threads=threads,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_file, algorithm="auto", options=None, dependencies=()):
    """Build a MAFFT command aligning 'input_file' into 'output_file'.

    Args:
        input_file: FASTA file appended to the preset command line.
        output_file: File receiving the command's stdout; also stored as
            self._output_file.
        algorithm: Key into _PRESETS (looked up lower-cased).
        options: Optional mapping of extra options handed to apply_options;
            None (the default) is treated as an empty dict.
        dependencies: Dependencies forwarded to CommandNode.

    Raises:
        KeyError: if the lower-cased 'algorithm' is not a key of _PRESETS
            (assuming _PRESETS is a plain dict).
    """
    # A fresh dict per call avoids sharing one mutable default between calls
    if options is None:
        options = {}

    command = AtomicCmdBuilder(
        _PRESETS[algorithm.lower()] + ["%(IN_FASTA)s"],
        IN_FASTA=input_file,
        OUT_STDOUT=output_file,
        CHECK_VERSION=MAFFT_VERSION,
    )

    apply_options(command, options)

    self._output_file = output_file

    description = "<MAFFTNode (%s): '%s' -> '%s'>" % (
        algorithm,
        input_file,
        output_file,
    )

    CommandNode.__init__(
        self,
        command=command.finalize(),
        description=description,
        dependencies=dependencies,
    )
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Index 'infile' with samtools via a symlink in the temp folder.

    Three commands run sequentially: 'ln -s' links the BAM into the
    working directory, 'samtools index' (with '-c' for '.csi') indexes the
    link, and 'mv' moves the index to swap_ext(infile, index_format).

    Raises ValueError if 'index_format' is neither '.bai' nor '.csi'.
    """
    if index_format not in ('.bai', '.csi'):
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))

    link_name = os.path.basename(infile)

    index_call = ["samtools", "index"]
    if index_format == '.bai':
        version_check = SAMTOOLS_VERSION
    else:
        version_check = SAMTOOLS_VERSION_1x
        index_call.append("-c")
    index_call.append("%(TEMP_IN_BAM)s")

    link_step = AtomicCmd(
        ["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
        IN_BAM=infile,
        TEMP_OUT_BAM=link_name,
        set_cwd=True,
    )
    index_step = AtomicCmd(
        index_call,
        TEMP_IN_BAM=link_name,
        CHECK_SAM=version_check,
    )
    rename_step = AtomicCmd(
        ["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
        TEMP_IN_BAM=link_name + index_format,
        OUT_BAM=swap_ext(infile, index_format),
    )

    steps = SequentialCmds((link_step, index_step, rename_step))
    description = "<BAMIndex (%s): '%s'>" % (index_format[1:].upper(), infile)

    CommandNode.__init__(
        self,
        description=description,
        command=steps,
        dependencies=dependencies,
    )
def _teardown(self, config, temp):
    """Remove per-user folders left in 'temp' before base teardown."""
    # Picard creates a folder named after the user in the temp-root
    picard_dir = os.path.join(temp, getpass.getuser())
    try_rmtree(picard_dir)

    # Some JREs may create a folder for temporary performance counters
    perf_dir = os.path.join(temp, "hsperfdata_" + getpass.getuser())
    try_rmtree(perf_dir)

    CommandNode._teardown(self, config, temp)
def __init__(self, samples, treefile, bootstraps, output_prefix, dependencies=()):
    """Set up the Rscript call to the zonkey 'tinytree.r' script.

    The script receives a temporary 'rerooted.newick' file, the samples
    file and a temporary output prefix; '<output_prefix>.pdf' and '.png'
    are declared as outputs.  'treefile' and 'bootstraps' are declared as
    inputs and stored on the node.
    """
    tree_script = rtools.rscript("zonkey", "tinytree.r")
    call = (
        "Rscript",
        tree_script,
        "%(TEMP_OUT_FILE)s",
        "%(IN_SAMPLES)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    rscript_cmd = AtomicCmd(
        call,
        AUX_RSCRIPT=tree_script,
        IN_SAMPLES=samples,
        IN_FILE=treefile,
        IN_BOOTSTRAPS=bootstraps,
        TEMP_OUT_FILE="rerooted.newick",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_TREE_PDF=output_prefix + ".pdf",
        OUT_TREE_PNG=output_prefix + ".png",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_RSCRIPT_APE=rtools.requirement("ape"),
        CHECK_RSCRIPT_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_RSCRIPT_GRID=rtools.requirement("grid"),
    )

    self._treefile = treefile
    self._bootstraps = bootstraps

    CommandNode.__init__(
        self,
        description="<DrawPhylogeny -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, output_prefix, tfam, tped, indep_filter=None, indep_parameters=None, plink_parameters=None, dependencies=()):
    """Set up a 'plink --make-bed' call converting TPED/TFAM input.

    The command writes to a temporary prefix (the basename of
    'output_prefix') and declares the .bed/.bim/.fam/.log outputs; extra
    arguments come from self._parse_parameters(plink_parameters).
    """
    temp_prefix = os.path.basename(output_prefix)

    call = [
        "plink",
        "--make-bed",
        "--noweb",
        "--tped", "%(IN_TPED)s",
        "--tfam", "%(IN_TFAM)s",
        "--out", "%(TEMP_OUT_PREFIX)s",
    ]
    call += self._parse_parameters(plink_parameters)

    plink = AtomicCmd(
        call,
        IN_TPED=tped,
        IN_TFAM=tfam,
        TEMP_OUT_PREFIX=temp_prefix,
        OUT_BED=output_prefix + ".bed",
        OUT_BIM=output_prefix + ".bim",
        OUT_FAM=output_prefix + ".fam",
        OUT_LOG=output_prefix + ".log",
        TEMP_OUT_NOSEX=temp_prefix + ".nosex",
        TEMP_OUT_NOF=temp_prefix + ".nof",
        CHECK_VERSION=PLINK_VERSION,
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<BuildBEDFiles -> '%s.*'>" % (output_prefix,),
        command=plink,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """Write 'contigs.table' into 'temp', then run the base class setup.

    One row is written per idxstats line whose translated name is numeric
    or 'X' and is present in self._contigs.  Raises NodeError if the size
    reported by idxstats differs from the size stored in self._contigs.
    """
    table_path = os.path.join(temp, "contigs.table")
    with open(table_path, "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats = "".join(pysam.idxstats(self._input_file))
        for line in idxstats.split('\n'):
            line = line.strip()
            if not line:
                continue

            name, size, hits, _ = line.split('\t')
            name = contig_name_to_plink_name(name)
            if name is None:
                continue
            if not name.isdigit() and name != 'X':
                continue
            if name not in self._contigs:
                # Excluding contigs is allowed
                continue

            # NOTE(review): the first value reported as "expected" is the
            # size from the BAM and the second is the database size --
            # confirm that this is the intended order.
            if int(size) != self._contigs[name]['Size']:
                raise NodeError(
                    "Size mismatch between database and BAM; "
                    "expected size %i, found %i for contig %r"
                    % (int(size), self._contigs[name]['Size'], name))

            handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(
                ID=name,
                Size=self._contigs[name]['Size'],
                Ns=self._contigs[name]['Ns'],
                Hits=hits,
            ))

    CommandNode._setup(self, config, temp)
def _setup(self, config, temp):
    """Clean stale files in 'temp' and resume from a checkpoint if usable.

    After the base class setup, old pipes and the old info file are
    removed.  If checkpoint files exist and are not outdated relative to
    self.input_files, '-R <highest numbered checkpoint>' is appended to
    the command; outdated checkpoints are removed instead.
    """
    CommandNode._setup(self, config, temp)

    # The temp folder may contain old files:
    # Remove old pipes to prevent failure at _teardown
    for stale_pipe in glob.glob(os.path.join(temp, "pipe*")):
        fileutils.try_remove(stale_pipe)

    # ExaML refuses to overwrite old info files
    fileutils.try_remove(os.path.join(temp, "ExaML_info.Pypeline"))

    # Resume from last checkpoint, if one such was generated
    pattern = os.path.join(temp, "ExaML_binaryCheckpoint.Pypeline_*")
    checkpoints = glob.glob(pattern)
    if checkpoints:
        cache = FileStatusCache()
        if cache.are_files_outdated(self.input_files, checkpoints):
            for fpath in checkpoints:
                fileutils.try_remove(fpath)
        else:
            def _checkpoint_number(fname):
                return int(fname.rsplit("_", 1)[-1])

            latest = sorted(checkpoints, key=_checkpoint_number)[-1]

            # FIXME: Less hacky solution to modifying AtomicCmds needed
            self._command._command.append("-R")
            self._command._command.append(latest)
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Set up the Rscript call to the zonkey 'pca.r' script.

    The script is given the absolute 'prefix', the samples file and a
    temporary output prefix.  '<prefix>.eval' and '<prefix>.evec' are
    declared as inputs, '<output_prefix>.pdf' and '.png' as outputs.
    """
    abs_prefix = os.path.abspath(prefix)
    pca_script = rtools.rscript("zonkey", "pca.r")

    rscript_cmd = AtomicCmd(
        ["Rscript", pca_script, abs_prefix,
         "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s"],
        AUX_SCRIPT=pca_script,
        IN_FILE_EVAL=prefix + ".eval",
        IN_FILE_EVEC=prefix + ".evec",
        IN_SAMPLES=samples,
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_LABELS=rtools.requirement("ggrepel"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<PlotPCA -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Index 'infile' with samtools via a symlink in the temp folder.

    Three commands run sequentially: 'ln -s' links the BAM into the
    working directory, 'samtools index' (with '-c' for '.csi') indexes the
    link, and 'mv' moves the index to swap_ext(infile, index_format).

    Raises ValueError if 'index_format' is neither '.bai' nor '.csi'.
    """
    if index_format not in ('.bai', '.csi'):
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))

    link_name = os.path.basename(infile)

    index_call = ["samtools", "index"]
    if index_format == '.bai':
        version_check = SAMTOOLS_VERSION
    else:
        version_check = SAMTOOLS_VERSION_1x
        index_call.append("-c")
    index_call.append("%(TEMP_IN_BAM)s")

    link_step = AtomicCmd(
        ["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
        IN_BAM=infile,
        TEMP_OUT_BAM=link_name,
        set_cwd=True,
    )
    index_step = AtomicCmd(
        index_call,
        TEMP_IN_BAM=link_name,
        CHECK_SAM=version_check,
    )
    rename_step = AtomicCmd(
        ["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
        TEMP_IN_BAM=link_name + index_format,
        OUT_BAM=swap_ext(infile, index_format),
    )

    steps = SequentialCmds((link_step, index_step, rename_step))
    description = "<BAMIndex (%s): '%s'>" % (index_format[1:].upper(), infile)

    CommandNode.__init__(
        self,
        description=description,
        command=steps,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """Run base setup, then symlink four files from self._directory into 'temp'."""
    CommandNode._setup(self, config, temp)

    required_files = (
        "3pGtoA_freq.txt",
        "5pCtoT_freq.txt",
        "dnacomp.txt",
        "misincorporation.txt",
    )

    for fname in required_files:
        source = os.path.abspath(os.path.join(self._directory, fname))
        os.symlink(source, os.path.join(temp, fname))
def _setup(self, config, temp):
    """Symlink the input files into 'temp' and, if supervised, write a .pop file.

    The input file and its '.bim' / '.fam' siblings are linked by basename.
    When self._supervised is set, one line per '.fam' record is written to
    the '.pop' file in 'temp': the sample's "Group(<k>)" value from
    self._samples, or "-" when there is none.
    """
    CommandNode._setup(self, config, temp)

    linked_files = [self._input_file]
    for extension in (".bim", ".fam"):
        linked_files.append(fileutils.swap_ext(self._input_file, extension))

    for filename in linked_files:
        destination = os.path.join(temp, os.path.basename(filename))
        os.symlink(os.path.abspath(filename), destination)

    if not self._supervised:
        return

    fam_filename = fileutils.swap_ext(self._input_file, ".fam")
    pop_filename = fileutils.reroot_path(
        temp, fileutils.swap_ext(fam_filename, ".pop"))

    key = "Group(%i)" % (self._k_groups,)
    with open(fam_filename) as fam_handle, \
            open(pop_filename, "w") as pop_handle:
        for line in fam_handle:
            # The first whitespace-separated field is the sample name
            sample, _ = line.split(None, 1)
            group = self._samples.get(sample, {}).get(key, "-")

            pop_handle.write("%s\n" % (group,))
def __init__(self, input_file_1, output_file, reference, prefix, input_file_2=None, threads=1, algorithm="mem", mapping_options=None, cleanup_options=None, dependencies=()):
    """Build a 'bwa mem' / 'bwa bwasw' command piped into a cleanup command.

    Args:
        input_file_1: First input file, passed as a positional value.
        output_file: Output passed to _new_cleanup_command.
        reference: Reference passed to _get_max_threads and to the cleanup
            command.
        prefix: Prefix passed to bwa and to _new_bwa_command.
        input_file_2: Optional second input file; when given it is added
            as a further positional value and the node is labelled "_PE".
        threads: Requested thread count; the value returned by
            _get_max_threads is used for '-t' and for the node.
        algorithm: Either "mem" or "bwasw".
        mapping_options: Optional mapping of extra options applied to the
            bwa command; None (the default) is treated as an empty dict.
        cleanup_options: Optional mapping of extra options applied to the
            cleanup command; None (the default) is treated as an empty dict.
        dependencies: Dependencies forwarded to CommandNode.

    Raises:
        NotImplementedError: if 'algorithm' is not "mem" or "bwasw".
    """
    if algorithm not in ("mem", "bwasw"):
        raise NotImplementedError("BWA algorithm %r not implemented" % (algorithm,))

    # Fresh dicts per call avoid sharing mutable defaults between calls
    if mapping_options is None:
        mapping_options = {}
    if cleanup_options is None:
        cleanup_options = {}

    threads = _get_max_threads(reference, threads)

    aln = _new_bwa_command(
        ("bwa", algorithm, prefix, "%(IN_FILE_1)s"),
        prefix,
        IN_FILE_1=input_file_1,
        OUT_STDOUT=AtomicCmd.PIPE,
    )

    if input_file_2:
        aln.add_value("%(IN_FILE_2)s")
        aln.set_kwargs(IN_FILE_2=input_file_2)

    aln.set_option("-t", threads)
    # Mark alternative hits as secondary; required by e.g. Picard
    aln.set_option("-M")

    cleanup = _new_cleanup_command(
        aln,
        output_file,
        reference,
        paired_end=input_file_1 and input_file_2,
    )

    apply_options(aln, mapping_options)
    apply_options(cleanup, cleanup_options)

    description = _get_node_description(
        name="BWA",
        algorithm="%s%s" % (algorithm.upper(), "_PE" if input_file_2 else "_SE"),
        input_files_1=input_file_1,
        input_files_2=input_file_2,
        prefix=prefix,
    )

    CommandNode.__init__(
        self,
        command=ParallelCmds([aln.finalize(), cleanup.finalize()]),
        description=description,
        threads=threads,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """Write 'contigs.table' into 'temp', then run the base class setup.

    One row (ID, Size, Ns, Hits) is written per idxstats line whose
    translated contig name is numeric or 'X'.

    Args:
        config: Forwarded to CommandNode._setup.
        temp: Folder in which 'contigs.table' is created.

    Raises:
        NodeError: if the size reported by idxstats for a contig differs
            from the size stored for it in self._contigs.
    """
    with open(os.path.join(temp, "contigs.table"), "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        for line in "".join(pysam.idxstats(self._input_file)).split('\n'):
            line = line.strip()
            if not line:
                continue

            name, size, hits, _ = line.split('\t')
            name = contig_name_to_plink_name(name)
            if name is None or not (name.isdigit() or name == 'X'):
                continue

            # NOTE(review): a contig absent from self._contigs fails on
            # this lookup; confirm whether such contigs should be skipped.
            contig = self._contigs[name]

            if int(size) != contig['Size']:
                # Name the contig and both sizes so the mismatch can be
                # diagnosed from the error alone
                raise NodeError(
                    "Size mismatch between database and BAM for contig %r; "
                    "database size is %s, BAM size is %i"
                    % (name, contig['Size'], int(size)))

            row = {
                'ID': name,
                'Size': contig['Size'],
                'Ns': contig['Ns'],
                'Hits': hits,
            }

            handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(**row))

    CommandNode._setup(self, config, temp)
def __init__(self, input_prefix, output_prefix, nchroms, dependencies=()):
    """Set up a 'smartpca' call reading a temporary 'parameters.txt' file.

    Args:
        input_prefix: Prefix of the .bed/.bim/.fam input files; stored as
            self._input_prefix.
        output_prefix: Prefix of the .log (stdout), .evec, .eval and
            .deleted_snps outputs; stored as self._output_prefix.
        nchroms: Stored as self._nchroms; not used in this method.
        dependencies: Dependencies forwarded to CommandNode.
    """
    self._input_prefix = input_prefix
    self._output_prefix = output_prefix
    self._nchroms = nchroms

    cmd = AtomicCmd(
        ("smartpca", "-p", "%(TEMP_OUT_PARAMS)s"),
        TEMP_OUT_PARAMS="parameters.txt",
        IN_FILE_BED=input_prefix + ".bed",
        IN_FILE_BIM=input_prefix + ".bim",
        IN_FILE_FAM=input_prefix + ".fam",
        OUT_STDOUT=output_prefix + ".log",
        OUT_EVEC=output_prefix + ".evec",
        OUT_EVAL=output_prefix + ".eval",
        OUT_SNPS=output_prefix + ".deleted_snps",
        CHECK_VERSION=SMARTPCA_VERSION,
        set_cwd=True,
    )

    # The closing quote after '.*' was missing from this description
    description = "<SmartPCA -> '%s.*'>" % (output_prefix,)

    CommandNode.__init__(
        self,
        description=description,
        command=cmd,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_prefix, order, samples, dependencies=()):
    """Set up the Rscript call to the zonkey 'admixture.r' script.

    The script receives 'input_file', a temporary 'samples.txt' file and a
    temporary output prefix; '<output_prefix>.pdf' and '.png' are declared
    as outputs.  'samples' and 'order' (with "Sample" appended) are stored
    on the node.
    """
    self._samples = samples
    self._order = tuple(order) + ("Sample",)

    plot_script = rtools.rscript("zonkey", "admixture.r")
    call = (
        "Rscript",
        plot_script,
        "%(IN_FILE)s",
        "%(TEMP_OUT_NAMES)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    rscript_cmd = AtomicCmd(
        call,
        AUX_RSCRIPT=plot_script,
        IN_FILE=input_file,
        IN_SAMPLES=samples,
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        TEMP_OUT_NAMES="samples.txt",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_RESHAPE2=rtools.requirement("reshape2"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<AdmixturePlot -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Build the 'build_tped' command writing its files into 'output_root'.

    The command is named "Sample", receives the '--downsample' value, and
    takes the temp dir, 'table' and 'bamfile' as positional values.  The
    '.bai' index is declared as an input only when 'downsample' is falsy.
    """
    builder = factory.new("build_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    for value in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        builder.add_value(value)

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    builder.set_kwargs(
        OUT_TFAM=os.path.join(output_root, "common.tfam"),
        OUT_SUMMARY=os.path.join(output_root, "common.summary"),
        OUT_TPED_INCL_TS=os.path.join(output_root, "incl_ts.tped"),
        OUT_TPED_EXCL_TS=os.path.join(output_root, "excl_ts.tped"),
        IN_TABLE=table,
        IN_BAM=bamfile,
    )

    description = "<BuildTPEDFiles -> %r>" % (os.path.join(output_root, '*'),)

    CommandNode.__init__(
        self,
        description=description,
        command=builder.finalize(),
        dependencies=dependencies,
    )
def __init__(self, infile, index_format=".bai", dependencies=()):
    """Index 'infile' with 'samtools index', writing infile + index_format.

    '-c' is passed to samtools for the '.csi' format.
    Raises ValueError if 'index_format' is neither '.bai' nor '.csi'.
    """
    if index_format == ".bai":
        format_flags = []
    elif index_format == ".csi":
        format_flags = ["-c"]
    else:
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))

    samtools_call = ["samtools", "index"]
    samtools_call += format_flags
    samtools_call += ["%(IN_BAM)s", "%(OUT_IDX)s"]

    index_cmd = AtomicCmd(
        samtools_call,
        IN_BAM=infile,
        OUT_IDX=infile + index_format,
        CHECK_SAM=SAMTOOLS_VERSION,
    )

    description = "<BAMIndex (%s): '%s'>" % (index_format[1:].upper(), infile)

    CommandNode.__init__(
        self,
        description=description,
        command=index_cmd,
        dependencies=dependencies,
    )
def __init__(self, input_prefix, output_prefix, tfam, parameters=None, dependencies=()):
    """Set up 'plink --freq --missing' followed by gzip of the .frq.strat file.

    Args:
        input_prefix: Prefix of the .bed/.bim/.fam inputs, passed to
            plink via '--bfile'.
        output_prefix: Prefix of the '.frq.strat.nosex', '.frq.strat.log'
            and '.frq.strat.gz' outputs; its basename is used as the
            temporary output prefix and stored as self._basename.
        tfam: Stored as self._tfam; not used in this method.
        parameters: Optional string of extra plink arguments, split on
            whitespace.
        dependencies: Dependencies forwarded to CommandNode.
    """
    basename = os.path.basename(output_prefix)

    plink_cmd = ["plink", "--freq", "--missing", "--noweb",
                 "--bfile", input_prefix,
                 "--within", "%(TEMP_OUT_CLUST)s",
                 "--out", "%(TEMP_OUT_PREFIX)s"]

    if parameters:
        plink_cmd.extend(parameters.split())

    plink = AtomicCmd(plink_cmd,
                      IN_BED=input_prefix + ".bed",
                      IN_BIM=input_prefix + ".bim",
                      IN_FAM=input_prefix + ".fam",
                      TEMP_OUT_CLUST="samples.clust",
                      OUT_NOSEX=output_prefix + ".frq.strat.nosex",
                      OUT_LOG=output_prefix + ".frq.strat.log",
                      TEMP_OUT_PREFIX=basename,
                      CHECK_VERSION=PLINK_VERSION)

    gzip = AtomicCmd(["gzip", "%(TEMP_IN_FREQ)s"],
                     TEMP_IN_FREQ=basename + ".frq.strat",
                     OUT_FREQ=output_prefix + ".frq.strat.gz")

    # FIXME: the original note at this point was truncated ("Can be ...")
    self._tfam = tfam
    self._basename = basename

    # The closing '>' was missing from this description
    description = "<BuildFreqFiles -> '%s.*'>" % (output_prefix,)

    CommandNode.__init__(self,
                         description=description,
                         command=SequentialCmds((plink, gzip)),
                         dependencies=dependencies)
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Build a 'vcf_filter' command whose output is piped into bgzip.

    One '--homozygous-chromosome' option is added per contig listed in
    regions["HomozygousContigs"]; bgzip writes its stdout to 'outfile'.
    """
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_value("%(IN_VCF)s")

    homozygous_contigs = regions["HomozygousContigs"]
    for contig in homozygous_contigs:
        filter_cmd.add_option("--homozygous-chromosome", contig)

    filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
    apply_options(filter_cmd, options)

    bgzip_cmd = AtomicCmdBuilder(
        ["bgzip"],
        IN_STDIN=filter_cmd,
        OUT_STDOUT=outfile,
    )

    description = "<VCFFilter: '%s' -> '%s'>" % (infile, outfile)
    pipeline = ParallelCmds([filter_cmd.finalize(), bgzip_cmd.finalize()])

    CommandNode.__init__(
        self,
        description=description,
        command=pipeline,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """Clean stale files in 'temp' and resume from a checkpoint if usable.

    After the base class setup, old pipes and the old info file are
    removed.  If checkpoint files exist and are not outdated relative to
    self.input_files, '-R <highest numbered checkpoint>' is appended to
    the command; outdated checkpoints are removed instead.
    """
    CommandNode._setup(self, config, temp)

    # The temp folder may contain old files:
    # Remove old pipes to prevent failure at _teardown
    for stale_pipe in glob.glob(os.path.join(temp, "pipe*")):
        fileutils.try_remove(stale_pipe)

    # ExaML refuses to overwrite old info files
    fileutils.try_remove(os.path.join(temp, "ExaML_info.Pypeline"))

    # Resume from last checkpoint, if one such was generated
    pattern = os.path.join(temp, "ExaML_binaryCheckpoint.Pypeline_*")
    checkpoints = glob.glob(pattern)
    if checkpoints:
        cache = FileStatusCache()
        if cache.are_files_outdated(self.input_files, checkpoints):
            for fpath in checkpoints:
                fileutils.try_remove(fpath)
        else:
            def _checkpoint_number(fname):
                return int(fname.rsplit("_", 1)[-1])

            latest = sorted(checkpoints, key=_checkpoint_number)[-1]

            # FIXME: Less hacky solution to modifying AtomicCmds needed
            self._command._command.append("-R")
            self._command._command.append(latest)
def _setup(self, config, temp):
    """Write 'contigs.table' into 'temp', then run the base class setup.

    Each idxstats contig name is translated through self._mapping (falling
    back to the name itself); a row is written only for names present in
    self._contigs.
    """
    table_path = os.path.join(temp, "contigs.table")
    with open(table_path, "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats = "".join(pysam.idxstats(self._input_file))
        for line in idxstats.split("\n"):
            line = line.strip()
            if line:
                name, size, hits, _ = line.split("\t")
                name = self._mapping.get(name, name)

                # Excluding contigs is allowed
                if name in self._contigs:
                    handle.write("{ID}\t{Size}\t{Ns}\t{Hits}\n".format(
                        ID=name,
                        Size=self._contigs[name]["Size"],
                        Ns=self._contigs[name]["Ns"],
                        Hits=hits,
                    ))

    CommandNode._setup(self, config, temp)
def __init__(self, contigs, mapping, input_file, output_prefix, dependencies=()):
    """Set up the Rscript call to the zonkey 'coverage.r' script.

    'contigs' and 'input_file' are stored on the node, together with the
    inverse of 'mapping' (values mapped back to keys).  The script receives
    a temporary 'contigs.table' file and a temporary output prefix;
    '<output_prefix>.pdf' and '.png' are declared as outputs.
    """
    self._contigs = contigs
    # Inverted lookup: value -> key
    self._mapping = dict(zip(mapping.values(), mapping))
    self._input_file = input_file

    coverage_script = rtools.rscript("zonkey", "coverage.r")
    call = (
        "Rscript",
        coverage_script,
        "%(TEMP_OUT_TABLE)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    rscript_cmd = AtomicCmd(
        call,
        AUX_RSCRIPT=coverage_script,
        IN_FILE=input_file,
        TEMP_OUT_TABLE="contigs.table",
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        set_cwd=True,
    )

    description = "<CoveragePlot -> '%s.*'>" % (output_prefix,)

    CommandNode.__init__(
        self,
        description=description,
        command=rscript_cmd,
        dependencies=dependencies,
    )
def _setup(self, config, temp):
    """Symlink the input files into 'temp' and, if supervised, write a .pop file.

    The input file and its '.bim' / '.fam' siblings are linked by basename.
    When self._supervised is set, one line per '.fam' record is written to
    the '.pop' file in 'temp': the sample's "Group(<k>)" value from
    self._samples, or "-" when there is none.
    """
    CommandNode._setup(self, config, temp)

    linked_files = [self._input_file]
    for extension in (".bim", ".fam"):
        linked_files.append(fileutils.swap_ext(self._input_file, extension))

    for filename in linked_files:
        destination = os.path.join(temp, os.path.basename(filename))
        os.symlink(os.path.abspath(filename), destination)

    if not self._supervised:
        return

    fam_filename = fileutils.swap_ext(self._input_file, ".fam")
    pop_filename = fileutils.reroot_path(
        temp, fileutils.swap_ext(fam_filename, ".pop"))

    key = "Group(%i)" % (self._k_groups,)
    with open(fam_filename) as fam_handle, \
            open(pop_filename, "w") as pop_handle:
        for line in fam_handle:
            # The first whitespace-separated field is the sample name
            sample, _ = line.split(None, 1)
            group = self._samples.get(sample, {}).get(key, "-")

            pop_handle.write("%s\n" % (group,))
def test_commandnode_run__exception_on_error():
    """_run raises CmdNodeError when the mocked command returns code 1."""
    failing_cmd = _build_cmd_mock(return_codes=(1,))
    node = CommandNode(failing_cmd)

    with pytest.raises(CmdNodeError):
        node._run(None, "xTMPx")

    # The command must have been started and joined exactly once
    expected_calls = [call.run("xTMPx"), call.join()]
    assert failing_cmd.mock_calls == expected_calls
def __init__(self, input_file, output_prefix, order, samples, dependencies=()):
    """Set up the Rscript call to the zonkey 'admixture.r' script.

    The script receives 'input_file', a temporary 'samples.txt' file and a
    temporary output prefix; '<output_prefix>.pdf' and '.png' are declared
    as outputs.  'samples' and 'order' (with "Sample" appended) are stored
    on the node.
    """
    self._samples = samples
    self._order = tuple(order) + ("Sample",)

    plot_script = rtools.rscript("zonkey", "admixture.r")
    call = (
        "Rscript",
        plot_script,
        "%(IN_FILE)s",
        "%(TEMP_OUT_NAMES)s",
        "%(TEMP_OUT_PREFIX)s",
    )

    rscript_cmd = AtomicCmd(
        call,
        AUX_RSCRIPT=plot_script,
        IN_FILE=input_file,
        IN_SAMPLES=samples,
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        TEMP_OUT_NAMES="samples.txt",
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_RESHAPE2=rtools.requirement("reshape2"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<AdmixturePlot -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, control_file, sequence_file, trees_file, output_tar, exclude_groups=(), dependencies=()):
    """Set up a 'codeml' run followed by a 'tar' of its output files.

    codeml is run on 'template.ctl' in the working directory with stdin
    set to /dev/null; tar then packs the files returned by
    CodemlNode._get_codeml_files into 'output_tar'.  The input file names
    and the frozen set of 'exclude_groups' are stored on the node.
    """
    self._exclude_groups = safe_coerce_to_frozenset(exclude_groups)
    self._control_file = control_file
    self._sequence_file = sequence_file
    self._trees_file = trees_file

    paml_cmd = AtomicCmd(
        ["codeml", "template.ctl"],
        IN_CONTROL_FILE=control_file,
        IN_SEQUENCE_FILE=sequence_file,
        IN_TREES_FILE=trees_file,
        TEMP_OUT_CTL="template.ctl",
        TEMP_OUT_SEQS="template.seqs",
        TEMP_OUT_TREES="template.trees",
        TEMP_OUT_STDOUT="template.stdout",
        TEMP_OUT_STDERR="template.stderr",
        TEMP_OUT_4FOLD="4fold.nuc",
        IN_STDIN="/dev/null",  # Prevent prompts from blocking
        set_cwd=True,
        **CodemlNode._get_codeml_files("TEMP_OUT_CODEML")
    )

    tar_pairs = CodemlNode._get_codeml_files("TEMP_IN_CODEML")

    # One '%(KEY)s' placeholder per file to be archived
    tar_call = ["tar", "cvzf", "%(OUT_FILE)s"]
    for key in tar_pairs:
        tar_call.append("%%(%s)s" % (key,))

    tar_cmd = AtomicCmd(
        tar_call,
        OUT_FILE=output_tar,
        set_cwd=True,
        **tar_pairs
    )

    description = "<CodemlNode: %r -> %r>" % (sequence_file, output_tar)

    CommandNode.__init__(
        self,
        description=description,
        command=SequentialCmds([paml_cmd, tar_cmd]),
        dependencies=dependencies,
    )
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Build the 'zonkey_tped' command writing its files into 'output_root'.

    The command is named "Sample", receives the '--downsample' value, and
    takes the temp dir, 'table' and 'bamfile' as positional values.  The
    '.bai' index is declared as an input only when 'downsample' is falsy.
    """
    builder = factory.new("zonkey_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    for value in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        builder.add_value(value)

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    builder.set_kwargs(
        OUT_TFAM=os.path.join(output_root, "common.tfam"),
        OUT_SUMMARY=os.path.join(output_root, "common.summary"),
        OUT_TPED_INCL_TS=os.path.join(output_root, "incl_ts.tped"),
        OUT_TPED_EXCL_TS=os.path.join(output_root, "excl_ts.tped"),
        IN_TABLE=table,
        IN_BAM=bamfile,
    )

    description = "<BuildTPEDFiles -> %r>" % (os.path.join(output_root, '*'),)

    CommandNode.__init__(
        self,
        description=description,
        command=builder.finalize(),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Create the node from 'parameters', keeping the command's kwargs.

    The kwargs of the (not yet finalized) command are stored as
    self._kwargs; the finalized command, a description naming the input
    and output alignments, and the dependencies go to CommandNode.
    """
    self._kwargs = parameters.command.kwargs

    finalized = parameters.command.finalize()
    description = "<RAxMLReduce: '%s' -> '%s'>" % (
        parameters.input_alignment,
        parameters.output_alignment,
    )

    CommandNode.__init__(
        self,
        command=finalized,
        description=description,
        dependencies=parameters.dependencies,
    )
def __init__(self, samples, prefix, output_prefix, dependencies=()):
    """Set up the Rscript call to the zonkey 'pca.r' script.

    The script is given the absolute 'prefix', the samples file and a
    temporary output prefix.  '<prefix>.eval' and '<prefix>.evec' are
    declared as inputs, '<output_prefix>.pdf' and '.png' as outputs.
    """
    abs_prefix = os.path.abspath(prefix)
    pca_script = rtools.rscript("zonkey", "pca.r")

    rscript_cmd = AtomicCmd(
        ["Rscript", pca_script, abs_prefix,
         "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s"],
        AUX_SCRIPT=pca_script,
        IN_FILE_EVAL=prefix + ".eval",
        IN_FILE_EVEC=prefix + ".evec",
        IN_SAMPLES=samples,
        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
        OUT_PDF=output_prefix + ".pdf",
        OUT_PNG=output_prefix + ".png",
        CHECK_R=RSCRIPT_VERSION,
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_LABELS=rtools.requirement("directlabels"),
        set_cwd=True,
    )

    CommandNode.__init__(
        self,
        description="<PlotPCA -> '%s.*'>" % (output_prefix,),
        command=rscript_cmd,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_prefix, threads=1, options=None, dependencies=()):
    """Build the single-end AdapterRM command for 'input_file'.

    Args:
        input_file: Reads passed via '--file1'.
        output_prefix: Prefix of the '.settings', '.truncated.gz' and
            '.discarded.gz' outputs; its basename is used for '--basename'.
        threads: Passed to _get_common_parameters and to CommandNode.
        options: Optional mapping of extra options, handed to
            _get_common_parameters and to apply_options; None (the
            default) is treated as an empty dict.
        dependencies: Dependencies forwarded to CommandNode.
    """
    # A fresh dict per call: the same object is handed to two helpers, so
    # a shared mutable default could leak changes between calls.
    if options is None:
        options = {}

    # See below for parameters in common between SE/PE
    cmd = _get_common_parameters(threads=threads, options=options)

    # Prefix for output files, ensure that all end up in temp folder
    cmd.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

    output_tmpl = output_prefix + ".%s.gz"
    cmd.set_kwargs(
        TEMP_OUT_BASENAME=os.path.basename(output_prefix),
        OUT_SETTINGS=output_prefix + ".settings",
        OUT_MATE_1=output_tmpl % ("truncated",),
        OUT_DISCARDED=output_tmpl % ("discarded",),
    )

    cmd.set_option("--file1", "%(IN_READS_1)s")
    cmd.set_kwargs(IN_READS_1=input_file)

    apply_options(cmd, options)

    description = "<AdapterRM (SE): %s -> '%s.*'>" % (
        fileutils.describe_files(input_file),
        output_prefix,
    )

    CommandNode.__init__(
        self,
        command=cmd.finalize(),
        threads=threads,
        description=description,
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Create the node from 'parameters' (command, input_file, prefix)."""
    finalized = parameters.command.finalize()
    label = "<BWA Index '%s' -> '%s.*'>" % (
        parameters.input_file,
        parameters.prefix,
    )

    CommandNode.__init__(
        self,
        command=finalized,
        description=label,
        dependencies=parameters.dependencies,
    )
def _teardown(self, config, temp):
    """Drop the info file and rename the tree file before base teardown.

    'RAxML_info.output' is removed from 'temp', and
    'RAxML_parsimonyTree.output.0' is moved to the rerooted path of
    self._output_tree.
    """
    info_path = os.path.join(temp, "RAxML_info.output")
    os.remove(info_path)

    fileutils.move_file(
        os.path.join(temp, "RAxML_parsimonyTree.output.0"),
        fileutils.reroot_path(temp, self._output_tree),
    )

    CommandNode._teardown(self, config, temp)
def _teardown(self, config, temp):
    """Remove the pipe file (and its index, if any) before base teardown."""
    pipe_path = os.path.join(temp, self.PIPE_FILE)
    os.remove(pipe_path)

    index_format = self._index_format
    if index_format:
        index_name = swap_ext(self.PIPE_FILE, index_format)
        os.remove(os.path.join(temp, index_name))

    CommandNode._teardown(self, config, temp)
def __init__(self, parameters):
    """Create the node from 'parameters', remembering its directory."""
    directory = parameters.directory
    self._directory = directory

    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<mapDamage (model): %r>" % (directory,),
        dependencies=parameters.dependencies,
    )
def _setup(self, config, temp):
    """Symlink the alignment and partition files into 'temp', then run base setup.

    For each of IN_ALIGNMENT and IN_PARTITION, the file named by that key
    in self._kwargs is linked at the name given by the matching
    "TEMP_"-prefixed key.
    """
    kwargs = self._kwargs
    for key in ("IN_ALIGNMENT", "IN_PARTITION"):
        os.symlink(
            os.path.abspath(kwargs[key]),
            os.path.join(temp, kwargs["TEMP_" + key]),
        )

    CommandNode._setup(self, config, temp)
def __init__(self, parameters):
    """Create the node from 'parameters', remembering its directory."""
    directory = parameters.directory
    self._directory = directory

    CommandNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<mapDamage (model): %r>" % (directory,),
        dependencies=parameters.dependencies,
    )
def _teardown(self, config, temp):
    """Move each bootstrap file to its rerooted destination, then run base teardown.

    The (source, destination) pairs come from self._bootstraps, called with
    the output template, the number of bootstraps and the start value.
    """
    pairs = self._bootstraps(
        self._output_template,
        self._bootstrap_num,
        self._bootstrap_start,
    )

    for src_name, dst_name in pairs:
        source = os.path.join(temp, src_name)
        destination = fileutils.reroot_path(temp, dst_name)
        fileutils.move_file(source, destination)

    CommandNode._teardown(self, config, temp)
def _setup(self, config, temp):
    """Run base setup, then write 'samples.clust' into 'temp'.

    The first whitespace-separated field of every line in self._tfam is
    taken as a sample name and written three times on its own line.
    """
    CommandNode._setup(self, config, temp)

    samples = []
    with open(self._tfam) as in_handle:
        for line in in_handle:
            samples.append(line.split(None, 1)[0])

    clust_path = os.path.join(temp, "samples.clust")
    with open(clust_path, "w") as handle:
        for sample in samples:
            handle.write("{0} {0} {0}\n".format(sample))
def _setup(self, config, temp):
    """Run base setup, then symlink every file in self._symlinks into 'temp'."""
    CommandNode._setup(self, config, temp)

    # Required to avoid the creation of files outside the temp folder
    for filename in self._symlinks:
        os.symlink(
            os.path.abspath(filename),
            os.path.join(temp, os.path.basename(filename)),
        )
def _teardown(self, config, temp):
    """Write the parameters file and an empty hash file, then run base teardown.

    The parameters file records k, m and the outgroup list; the file at
    self._parameters_hash is created (or truncated) and left empty.
    """
    params_path = fileutils.reroot_path(temp, self._params_file)
    with open(params_path, "w") as out:
        out.write("k: %i\n" % (self._param_k,))
        out.write("m: %i\n" % (self._param_m,))
        out.write("outgroup: %r\n" % (list(self._param_outgroup),))

    hash_path = fileutils.reroot_path(temp, self._parameters_hash)
    with open(hash_path, "w"):
        # Only the (empty) file itself is needed
        pass

    CommandNode._teardown(self, config, temp)
def __init__(self, parameters):
    """Create the node from 'parameters'.

    The absolute path of the input alignment is stored in self._symlinks
    and the basename of the output tree in self._output_tree.
    """
    self._symlinks = [os.path.abspath(parameters.input_alignment)]
    self._output_tree = os.path.basename(parameters.output_tree)

    finalized = parameters.command.finalize()
    description = "<Parsimonator: '%s' -> '%s'>" % (
        parameters.input_alignment,
        parameters.output_tree,
    )

    CommandNode.__init__(
        self,
        command=finalized,
        description=description,
        dependencies=parameters.dependencies,
    )