def _build_cat_command(input_file, output_file):
    """Return an unfinalized 'paleomix cat' command builder that copies
    the contents of `input_file` into `output_file` (written via a
    temporary output file)."""
    builder = factory.new("cat")
    builder.set_option("--output", "%(TEMP_OUT_CAT)s")
    builder.add_value("%(IN_ARCHIVE)s")
    builder.set_kwargs(IN_ARCHIVE=input_file,
                       TEMP_OUT_CAT=output_file)

    return builder
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Node that generates TPED/TFAM files from a BAM using the
    'zonkey_tped' tool; outputs are written under `output_root`."""
    tped = factory.new("zonkey_tped")
    tped.set_option("--name", "Sample")
    tped.set_option("--downsample", downsample)
    for template in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        tped.add_value(template)

    if not downsample:
        # The BAM index is needed for random access (chromosomes are read
        # 1 ... 31); when downsampling, the BAM is read sequentially.
        tped.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    tped.set_kwargs(IN_TABLE=table,
                    IN_BAM=bamfile,
                    OUT_TFAM=os.path.join(output_root, "common.tfam"),
                    OUT_SUMMARY=os.path.join(output_root, "common.summary"),
                    OUT_TPED_INCL_TS=os.path.join(output_root, "incl_ts.tped"),
                    OUT_TPED_EXCL_TS=os.path.join(output_root, "excl_ts.tped"))

    description = "<BuildTPEDFiles -> %r>" % (os.path.join(output_root, "*"),)
    CommandNode.__init__(self,
                         description=description,
                         command=tped.finalize(),
                         dependencies=dependencies)
def __init__(self, infile, bedfile, outfile, padding, options=None,
             dependencies=()):
    """Node that builds FASTA sequences from genotyped intervals using
    'paleomix vcf_to_fasta'.

    infile  -- bgzipped VCF; a matching `infile + ".tbi"` index must exist.
    bedfile -- BED file listing the intervals to extract.
    outfile -- destination file; the tool's stdout is redirected here.
    padding -- value passed to --padding.
    options -- optional mapping of extra command-line options applied via
               apply_options; None (the default) means no extra options.
    """
    params = factory.new("vcf_to_fasta")
    params.set_option("--padding", padding)
    params.set_option("--genotype", "%(IN_VCFFILE)s")
    params.set_option("--intervals", "%(IN_INTERVALS)s")
    params.set_kwargs(
        IN_VCFFILE=infile,
        IN_TABIX=infile + ".tbi",
        IN_INTERVALS=bedfile,
        OUT_STDOUT=outfile,
    )

    # FIX: the previous default `options={}` was a shared mutable default
    # argument; a None sentinel avoids cross-call state leaks if a callee
    # ever mutates the mapping.
    apply_options(params, {} if options is None else options)

    description = "<BuildRegions: '%s' -> '%s'>" % (infile, outfile)
    CommandNode.__init__(
        self,
        description=description,
        command=params.finalize(),
        dependencies=dependencies,
    )
def _read_sequences(file_type, filename, stats):
    # Streams `filename` through 'paleomix cat' (which presumably handles
    # decompression -- confirm against the cat tool) and reservoir-samples
    # up to 100,000 quality values collected by _collect_qualities.
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename, ))
    cat_call = cat_call.finalized_call

    cat = None
    try:
        cat = procs.open_proc(cat_call,
                              bufsize=io.DEFAULT_BUFFER_SIZE,
                              stderr=procs.PIPE,
                              stdout=procs.PIPE)
        qualities = _collect_qualities(cat.stdout, file_type, filename, stats)

        return sampling.reservoir_sampling(qualities, 100000)
    except StandardError as error:
        if cat:
            try:
                cat.kill()
            except OSError:
                # The process may already have terminated; do not let this
                # mask the original exception.
                pass
            cat.wait()
            # Set to None so the finally-block does not wait() again or
            # report the (expected) non-zero exit-code of a killed process.
            cat = None
        raise error
    finally:
        # On the success path the process is reaped here; a non-zero
        # return-code is converted into a NodeError with its stderr.
        rc_cat = cat.wait() if cat else 0
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      " Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)
def _read_sequences(filename):
    """Stream `filename` through 'paleomix cat' and return a reservoir
    sample (at most 100,000 values) of the qualities collected by
    _collect_qualities.

    Raises NodeError if the cat process exits with a non-zero return-code.
    """
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename,))
    cat_call = cat_call.finalized_call

    cat = None
    try:
        cat = procs.open_proc(cat_call,
                              bufsize=io.DEFAULT_BUFFER_SIZE,
                              stderr=procs.PIPE,
                              stdout=procs.PIPE)
        qualities = _collect_qualities(cat.stdout, filename)

        return sampling.reservoir_sampling(qualities, 100000)
    except:
        if cat:
            # FIX: kill() may raise OSError if the process has already
            # terminated, which would mask the original exception; guard
            # it as done by the sibling _read_sequences implementation.
            try:
                cat.kill()
            except OSError:
                pass
            cat.wait()
            # Prevent the finally-block from reporting the exit-code of a
            # deliberately killed process.
            cat = None
        raise
    finally:
        rc_cat = cat.wait() if cat else 0
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      " Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)
def __init__(
    self,
    target_name,
    input_file,
    output_file,
    prefix,
    regions_file=None,
    dependencies=(),
):
    """Node that computes a depth histogram for a BAM via 'paleomix depths'.

    When `regions_file` is given, the analysis is restricted to those
    regions and a BAM index (extension from prefix["IndexFormat"]) is
    required as temporary input.
    """
    # Falsy when no regions file is given; otherwise the index extension.
    index_format = regions_file and prefix["IndexFormat"]

    depths = factory.new("depths")
    depths.add_value("%(IN_BAM)s")
    depths.add_value("%(OUT_FILE)s")
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(IN_BAM=input_file, OUT_FILE=output_file)

    if regions_file:
        depths.set_option("--regions-file", "%(IN_REGIONS)s")
        depths.set_kwargs(IN_REGIONS=regions_file,
                          TEMP_IN_INDEX=input_file + index_format)

    CommandNode.__init__(
        self,
        command=depths.finalize(),
        description="<DepthHistogram: %s -> '%s'>" % (input_file, output_file),
        dependencies=dependencies,
    )
def _build_cat_command(input_files, output_file):
    """Return a finalized 'paleomix cat' command that concatenates
    `input_files` into `output_file` (written via a temporary file)."""
    builder = factory.new("cat")
    builder.set_option("--output", "%(TEMP_OUT_CAT)s")
    builder.set_kwargs(TEMP_OUT_CAT=output_file)
    builder.add_multiple_values(input_files)

    return builder.finalize()
def customize(cls, reference, infile_bam, infile_vcf, outfile, dependencies=()):
    """Return a customizable 'paleomix genotype' command that produces a
    pileup (not a VCF) restricted to 'heterozygous_snps.bed'."""
    pileup = factory.new("genotype")
    pileup.add_value("%(IN_BAMFILE)s")
    pileup.add_value("%(OUT_PILEUP)s")
    pileup.set_option("--bedfile", "%(TEMP_IN_INTERVALS)s")
    pileup.set_option("--pileup-only")
    # Read-groups are ignored when generating the pileup
    pileup.add_option("--mpileup-argument", "-R", sep="=")
    # Reference sequence (FASTA) passed through to mpileup
    pileup.add_option("--mpileup-argument", "-f=%s" % (reference, ), sep="=")
    pileup.set_kwargs(
        IN_BAMFILE=infile_bam,
        # Registered both as temporary input and output, so that the BED
        # file is removed automatically when the node completes.
        TEMP_IN_INTERVALS="heterozygous_snps.bed",
        TEMP_OUT_INTERVALS="heterozygous_snps.bed",
        OUT_PILEUP=outfile,
        CHECK_SAMTOOLS=SAMTOOLS_VERSION)

    return {"command": pileup}
def customize(cls, reference, infile, bedfile, outfile,
              pileup_only=False, nbatches=1, dependencies=()):
    """Return a customizable 'paleomix genotype' command producing either
    a VCF or (with `pileup_only`) just a pileup, split over `nbatches`."""
    genotype = factory.new("genotype")
    genotype.add_value("%(IN_BAMFILE)s")
    genotype.add_value("%(OUT_VCFFILE)s")
    genotype.set_option("--nbatches", nbatches)
    if bedfile:
        genotype.set_option("--bedfile", "%(IN_INTERVALS)s")
    if pileup_only:
        genotype.set_option("--pileup-only")
        # Read-groups are ignored when only a pileup is produced
        genotype.add_option("--mpileup-argument", "-R", sep="=")
    # Reference sequence (FASTA) passed through to mpileup
    genotype.add_option("--mpileup-argument", "-f=%s" % (reference, ), sep="=")
    genotype.set_kwargs(IN_BAMFILE=infile,
                        IN_INTERVALS=bedfile,
                        OUT_VCFFILE=outfile,
                        CHECK_SAMTOOLS=SAMTOOLS_VERSION_0119,
                        CHECK_BCFTOOLS=BCFTOOLS_VERSION_0119)

    return {"command": genotype}
def __init__(self, config, input_bams, output_bam, keep_dupes=True,
             dependencies=()):
    """Node filtering PCR duplicates among collapsed reads using
    'paleomix rmdup_collapsed'; input BAMs are merged via the
    MultiBAMInputNode pipe."""
    input_bams = safe_coerce_to_tuple(input_bams)

    rmdup = factory.new("rmdup_collapsed")
    rmdup.add_value("%(TEMP_IN_BAM)s")
    rmdup.set_kwargs(OUT_STDOUT=output_bam,
                     TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    rmdup.add_multiple_kwargs(input_bams)
    if not keep_dupes:
        rmdup.set_option("--remove-duplicates")

    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_bams,
                               command=rmdup.finalize(),
                               description="<FilterCollapsedBAM: %s>"
                               % (describe_files(input_bams),),
                               dependencies=dependencies)
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Node that generates TPED/TFAM files from a BAM using the
    'build_tped' tool; outputs are written under `output_root`."""
    builder = factory.new("build_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    builder.add_value("%(TEMP_DIR)s")
    builder.add_value("%(IN_TABLE)s")
    builder.add_value("%(IN_BAM)s")

    if not downsample:
        # The BAM index is needed for random access
        # (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    builder.set_kwargs(IN_TABLE=table,
                       IN_BAM=bamfile,
                       OUT_TFAM=os.path.join(output_root, "common.tfam"),
                       OUT_SUMMARY=os.path.join(output_root,
                                                "common.summary"),
                       OUT_TPED_INCL_TS=os.path.join(output_root,
                                                     "incl_ts.tped"),
                       OUT_TPED_EXCL_TS=os.path.join(output_root,
                                                     "excl_ts.tped"))

    CommandNode.__init__(self,
                         description="<BuildTPEDFiles -> %r>"
                         % (os.path.join(output_root, '*'),),
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Node that filters a VCF with 'paleomix vcf_filter' and pipes the
    result through bgzip into `outfile`."""
    filt = factory.new("vcf_filter")
    filt.add_value("%(IN_VCF)s")
    # Contigs expected to be homozygous (e.g. haploid chromosomes)
    for chrom in regions["HomozygousContigs"]:
        filt.add_option("--homozygous-chromosome", chrom)
    filt.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
    apply_options(filt, options)

    zipper = AtomicCmdBuilder(["bgzip"], IN_STDIN=filt, OUT_STDOUT=outfile)

    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
        command=ParallelCmds([filt.finalize(), zipper.finalize()]),
        dependencies=dependencies,
    )
def customize(cls, infile, bedfile, outfile, dependencies=()):
    """Return a customizable 'paleomix sample_pileup' command that
    genotypes `infile` over the intervals in `bedfile`, writing to
    stdout (`outfile`)."""
    builder = factory.new("sample_pileup")
    builder.set_option("--genotype", "%(IN_PILEUP)s")
    builder.set_option("--intervals", "%(IN_INTERVALS)s")
    builder.set_kwargs(IN_PILEUP=infile,
                       IN_INTERVALS=bedfile,
                       OUT_STDOUT=outfile)

    return {"command": builder}
def customize(cls, pileup, infile, outfile, regions, dependencies=()):
    """Return customizable commands forming the pipeline
    cat -> vcf_filter -> bgzip, filtering `infile` against `pileup`."""
    reader = factory.new("cat")
    reader.add_value("%(IN_VCF)s")
    reader.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)

    filt = factory.new("vcf_filter")
    filt.add_option("--pileup", "%(IN_PILEUP)s")
    # Contigs expected to be homozygous (e.g. haploid chromosomes)
    for chrom in regions["HomozygousContigs"]:
        filt.add_option("--homozygous-chromosome", chrom)
    filt.set_kwargs(IN_PILEUP=pileup,
                    IN_STDIN=reader,
                    OUT_STDOUT=AtomicCmd.PIPE)

    zipper = AtomicCmdBuilder(["bgzip"], IN_STDIN=filt, OUT_STDOUT=outfile)

    return {"commands": {"cat": reader, "filter": filt, "bgzip": zipper}}
def _do_test_factory__commands(command, expected):
    """Check that running `command` with --help prints `expected` as the
    first line of stdout and nothing on stderr."""
    call = factory.new(command).finalized_call
    # The pipelines require an explicit sub-command before --help
    if command in ("bam_pipeline", "trim_pipeline"):
        call.append("run")

    stdout, stderr = check_run(call + ["--help"])
    assert_equal(expected, stdout.split("\n")[0])
    assert_equal("", stderr)
def customize(cls, infile, bedfile, outfile, padding, dependencies=()):
    """Return a customizable 'paleomix vcf_to_fasta' command building
    FASTA sequences for the intervals in `bedfile` from the indexed VCF
    `infile`, writing to stdout (`outfile`)."""
    builder = factory.new("vcf_to_fasta")
    builder.set_option("--padding", padding)
    builder.set_option("--genotype", "%(IN_VCFFILE)s")
    builder.set_option("--intervals", "%(IN_INTERVALS)s")
    builder.set_kwargs(IN_VCFFILE=infile,
                       IN_TABIX=infile + ".tbi",
                       IN_INTERVALS=bedfile,
                       OUT_STDOUT=outfile)

    return {"command": builder}
def __init__(self, input_files, destination, dependencies=()):
    """Node that concatenates `input_files` ('paleomix cat') and pipes
    the result through gzip into `destination`."""
    reader = factory.new("cat")
    reader.add_multiple_values(input_files)
    reader.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
    reader = reader.finalize()

    zipper = AtomicCmd("gzip", IN_STDIN=reader, OUT_STDOUT=destination)

    CommandNode.__init__(self,
                         description="<Cat %s -> %s>"
                         % (fileutils.describe_files(input_files),
                            destination),
                         command=ParallelCmds((reader, zipper)),
                         dependencies=dependencies)
def _new_cleanup_command(stdin, output_file, reference, paired_end=False):
    """Return an unfinalized 'paleomix cleanup' command reading SAM/BAM
    records from `stdin` and writing the cleaned result to `output_file`,
    using `reference` as the FASTA reference."""
    cleanup = factory.new("cleanup")
    cleanup.set_option("--fasta", "%(IN_FASTA_REF)s")
    cleanup.set_option("--temp-prefix", "%(TEMP_OUT_PREFIX)s")
    if paired_end:
        cleanup.set_option("--paired-end")

    cleanup.set_kwargs(IN_STDIN=stdin,
                       IN_FASTA_REF=reference,
                       OUT_STDOUT=output_file,
                       TEMP_OUT_PREFIX="bam_cleanup",
                       CHECK_SAMTOOLS=SAMTOOLS_VERSION)

    return cleanup
def __init__(self, config, input_files, output_file, dependencies=()):
    """Node that writes a duplicate histogram for merged BAMs via
    'paleomix duphist' (input supplied through the MultiBAMInputNode
    pipe)."""
    input_files = safe_coerce_to_tuple(input_files)

    duphist = factory.new("duphist")
    duphist.add_value('%(TEMP_IN_BAM)s')
    duphist.set_kwargs(OUT_STDOUT=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    duphist.add_multiple_kwargs(input_files)

    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_files,
                               command=duphist.finalize(),
                               description="<DuplicateHistogram: %s -> %r>"
                               % (describe_files(input_files), output_file),
                               dependencies=dependencies)
def __init__(self, database, bamfile, output_prefix, dependencies=()):
    """Node that builds a mitochondrial consensus ('zonkey_mito'),
    producing `output_prefix` + .phy/.fasta/.summary."""
    mito = factory.new("zonkey_mito")
    for template in ("%(IN_DATABASE)s",
                     "%(IN_BAMFILE)s",
                     "%(TEMP_OUT_PREFIX)s"):
        mito.add_value(template)

    mito.set_kwargs(IN_DATABASE=database,
                    IN_BAMFILE=bamfile,
                    TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                    OUT_PHYLIP=output_prefix + ".phy",
                    OUT_FASTA=output_prefix + ".fasta",
                    OUT_SUMMARY=output_prefix + ".summary")

    CommandNode.__init__(self,
                         description="<MitoConsensus -> '%s.*'>"
                         % (output_prefix, ),
                         command=mito.finalize(),
                         dependencies=dependencies)
def __init__(self, database, bamfile, output_prefix, dependencies=()):
    """Node that builds a mitochondrial consensus ('build_mito'),
    producing `output_prefix` + .phy/.fasta/.summary."""
    consensus = factory.new("build_mito")
    consensus.add_value("%(IN_DATABASE)s")
    consensus.add_value("%(IN_BAMFILE)s")
    consensus.add_value("%(TEMP_OUT_PREFIX)s")
    consensus.set_kwargs(IN_DATABASE=database,
                         IN_BAMFILE=bamfile,
                         TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                         OUT_PHYLIP=output_prefix + ".phy",
                         OUT_FASTA=output_prefix + ".fasta",
                         OUT_SUMMARY=output_prefix + ".summary")

    description = "<MitoConsensus -> '%s.*'>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=description,
                         command=consensus.finalize(),
                         dependencies=dependencies)
def __init__(self, input_files, output_file, offset, collapsed=False,
             dependencies=()):
    """Node validating FASTQ files via ':validate_fastq'; the tool's
    stdout is captured in `output_file`."""
    validator = factory.new(":validate_fastq")
    validator.set_option("--offset", offset)
    if collapsed:
        validator.set_option("--collapsed")
    validator.add_multiple_values(input_files)
    validator.set_kwargs(OUT_STDOUT=output_file)

    description = "<Validate FASTQ Files: %s>" % (describe_files(input_files))
    CommandNode.__init__(
        self,
        description=description,
        command=validator.finalize(),
        dependencies=dependencies,
    )
def __init__(self, target_name, input_file, output_file, regions_file=None,
             dependencies=()):
    """Node that computes coverage statistics for a BAM via
    'paleomix coverage', optionally restricted to `regions_file`."""
    coverage = factory.new("coverage")
    coverage.add_value("%(IN_BAM)s")
    coverage.add_value("%(OUT_FILE)s")
    coverage.set_option("--target-name", target_name)
    coverage.set_kwargs(IN_BAM=input_file, OUT_FILE=output_file)

    if regions_file:
        coverage.set_option('--regions-file', '%(IN_REGIONS)s')
        coverage.set_kwargs(IN_REGIONS=regions_file)

    CommandNode.__init__(self,
                         command=coverage.finalize(),
                         description="<Coverage: %s -> '%s'>"
                         % (input_file, output_file),
                         dependencies=dependencies)
def __init__(self, config, input_bams, output_bam, keep_dupes=True,
             dependencies=()):
    """Node filtering PCR duplicates among collapsed reads; input BAMs
    are merged and piped into 'paleomix rmdup_collapsed'."""
    merge = merge_bam_files_command(input_bams)

    rmdup = factory.new("rmdup_collapsed")
    rmdup.set_kwargs(IN_STDIN=merge, OUT_STDOUT=output_bam)
    if not keep_dupes:
        rmdup.set_option("--remove-duplicates")

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, rmdup.finalize()]),
        description="<FilterCollapsedBAM: %s>"
        % (describe_files(merge.input_files),),
        dependencies=dependencies,
    )
def _process_output(stdin, output_file, reference, run_fixmate=False):
    """Return (keys, commands) for a 'paleomix cleanup' step that reads
    SAM/BAM records from `stdin` and writes cleaned output to
    `output_file`.

    reference   -- FASTA reference, or None to skip the --fasta option.
    run_fixmate -- when True, pass --paired-end to run mate fixing.
    """
    convert = factory.new("cleanup")
    # FIX: guard against reference=None, consistent with the sibling
    # _process_output implementation; the kwarg is still always set.
    if reference is not None:
        convert.set_option("--fasta", "%(IN_FASTA_REF)s")
    convert.set_option("--temp-prefix", "%(TEMP_OUT_PREFIX)s")
    convert.set_kwargs(IN_STDIN=stdin,
                       IN_FASTA_REF=reference,
                       OUT_STDOUT=output_file,
                       TEMP_OUT_PREFIX="bam_cleanup",
                       CHECK_SAMTOOLS=SAMTOOLS_VERSION)

    if run_fixmate:
        convert.set_option('--paired-end')

    try:
        # Select behavior matching the installed samtools generation;
        # if the version cannot be determined, let the tool decide.
        if SAMTOOLS_VERSION.version >= (1,):
            convert.set_option('--samtools1x', 'yes')
        else:
            convert.set_option('--samtools1x', 'no')
    except versions.VersionRequirementError:
        pass

    return ["convert"], {"convert": convert}
def setup_basic_batch(args, regions, prefix, func, first_batch=True):
    # Builds the process pipeline for one genotyping batch:
    #   genotype --filter-only -> func(...) -> [grep -v '^#'] -> bgzip -> prefix
    # Returns a 'setup' dict tracking files, temp-files, processes, and
    # open handles so that cleanup_batch can tear everything down; on any
    # failure the partial setup is cleaned up and the exception re-raised.
    setup = {"files": {},
             "temp_files": {},
             "procs": {},
             "handles": {}}
    try:
        # The BED file is both a product and a temporary file to be removed
        setup["files"]["bed"] = write_bed_file(prefix, regions)
        setup["temp_files"]["bed"] = setup["files"]["bed"]

        filter_builder = factory.new("genotype")
        filter_builder.set_option("--filter-only")
        filter_builder.set_option("--bedfile", setup["files"]["bed"])
        filter_builder.add_option(args.bamfile)
        filter_builder.add_option(args.destination)

        setup["procs"]["filter"] \
            = processes.open_proc(filter_builder.call,
                                  stdout=processes.PIPE)

        # `func` wires up the batch-specific command and returns its stdout
        call_stdout = func(setup)
        if not first_batch:
            # Strip header lines from every batch but the first, so the
            # concatenated output contains a single header
            setup["procs"]["grep"] = processes.open_proc(('grep', '-v', '^#'),
                                                         stdin=call_stdout,
                                                         stdout=processes.PIPE)
            call_stdout = setup["procs"]["grep"].stdout

        setup["handles"]["outfile"] = open(prefix, "w")
        zip_proc = processes.open_proc(["bgzip"],
                                       stdin=call_stdout,
                                       stdout=setup["handles"]["outfile"])

        setup["procs"]["gzip"] = zip_proc

        return setup
    except:
        # Best-effort teardown of whatever was started, then re-raise
        sys.stderr.write(traceback.format_exc() + "\n")
        cleanup_batch(setup)
        raise
def _process_output(stdin, output_file, reference, run_fixmate=False):
    """Return (keys, commands) for a 'paleomix cleanup' step that reads
    SAM/BAM records from `stdin` and writes cleaned output to
    `output_file`; `reference` may be None, in which case --fasta is
    not passed."""
    cleanup = factory.new("cleanup")
    if reference is not None:
        cleanup.set_option("--fasta", "%(IN_FASTA_REF)s")
    cleanup.set_option("--temp-prefix", "%(TEMP_OUT_PREFIX)s")
    cleanup.set_kwargs(IN_STDIN=stdin,
                       IN_FASTA_REF=reference,
                       OUT_STDOUT=output_file,
                       TEMP_OUT_PREFIX="bam_cleanup",
                       CHECK_SAMTOOLS=SAMTOOLS_VERSION)

    if run_fixmate:
        cleanup.set_option('--paired-end')

    try:
        # Select behavior matching the installed samtools generation;
        # if the version cannot be determined, let the tool decide.
        flag = 'yes' if SAMTOOLS_VERSION.version >= (1,) else 'no'
        cleanup.set_option('--samtools1x', flag)
    except versions.VersionRequirementError:
        pass

    return ["convert"], {"convert": cleanup}
def __init__(self, config, target_name, input_files, output_file, prefix,
             regions_file=None, dependencies=()):
    """Node that computes a depth histogram over merged BAMs via
    'paleomix depths' (input supplied through the MultiBAMInputNode
    pipe); optionally restricted to `regions_file`."""
    input_files = safe_coerce_to_tuple(input_files)
    # Falsy when no regions file is given; otherwise the index extension
    # taken from the prefix dict (presumably e.g. ".bai" -- TODO confirm)
    index_format = regions_file and prefix['IndexFormat']

    depths = factory.new("depths")
    depths.add_value("%(TEMP_IN_BAM)s")
    depths.add_value("%(OUT_FILE)s")
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(OUT_FILE=output_file,
                      TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    depths.add_multiple_kwargs(input_files)

    if regions_file:
        depths.set_option('--regions-file', '%(IN_REGIONS)s')
        depths.set_kwargs(IN_REGIONS=regions_file,
                          TEMP_IN_INDEX=swap_ext(MultiBAMInputNode.PIPE_FILE,
                                                 index_format))

    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_files,
                               index_format=index_format,
                               command=depths.finalize(),
                               description="<DepthHistogram: %s -> '%s'>"
                               % (describe_files(input_files), output_file),
                               dependencies=dependencies)