def __init__(self, input_files, output_file, dependencies=()):
    """Node that checks *input_files* for duplicated input, writing results to *output_file*."""
    label = describe_files(input_files)
    Node.__init__(
        self,
        description="<Detect Input Duplication: %s>" % (label),
        input_files=input_files,
        output_files=output_file,
        dependencies=dependencies,
    )
def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """Realigns reads around indels with GATK IndelRealigner, then recalculates
    MD/NM tags with 'samtools calmd'; the two commands run as a parallel set."""
    self._basename = os.path.basename(outfile)

    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    gatk = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    for option, value in (("-T", "IndelRealigner"),
                          ("-R", "%(IN_REFERENCE)s"),
                          ("-targetIntervals", "%(IN_INTERVALS)s"),
                          ("-o", "%(OUT_BAMFILE)s"),
                          ("--bam_compression", 0)):
        gatk.set_option(option, value)
    # Indexing is skipped here; calmd rewrites the BAM afterwards anyway.
    gatk.set_option("--disable_bam_indexing")
    _set_input_files(gatk, infiles)
    gatk.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        IN_INTERVALS=intervals,
        OUT_BAMFILE=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    calmd_cmd = AtomicCmd(
        ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
        TEMP_IN_BAM=self._basename,
        IN_REF=reference,
        TEMP_OUT_STDOUT=self._basename + ".calmd",
        CHECK_VERSION=SAMTOOLS_VERSION,
    )

    CommandNode.__init__(
        self,
        description="<Indel Realigner (aligning): %s -> %r>"
        % (describe_files(infiles), outfile),
        command=ParallelCmds([gatk.finalize(), calmd_cmd]),
        dependencies=dependencies,
    )
def __init__(self, config, target_name, input_files, output_file, intervals_file=None, print_stats=False, max_contigs=_MAX_CONTIGS, dependencies=()):
    """Node computing a depth histogram from one or more BAMs, optionally
    restricted to *intervals_file*; state is kept for later processing."""
    self._target_name = target_name
    self._input_files = safe_coerce_to_tuple(input_files)
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    # Inputs are the BAMs, their .bai indexes, and (if set) the intervals file.
    all_inputs = list(self._input_files)
    all_inputs.extend(swap_ext(fpath, ".bai") for fpath in self._input_files)
    if intervals_file:
        all_inputs.append(intervals_file)

    # coverageBed is used with explicit intervals; genomeCoverageBed otherwise.
    executables = ["coverageBed" if intervals_file else "genomeCoverageBed"]
    auxiliary_files = []
    for cmd in concatenate_input_bams(config, self._input_files)[0]:
        executables.extend(cmd.executables)
        auxiliary_files.extend(cmd.auxiliary_files)

    Node.__init__(
        self,
        description="<DepthHistogram: %s -> '%s'>"
        % (describe_files(self._input_files), self._output_file),
        input_files=all_inputs,
        output_files=self._output_file,
        dependencies=dependencies,
        executables=executables,
        auxiliary_files=auxiliary_files,
    )
def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """GATK IndelRealigner followed by 'samtools calmd' to refresh MD/NM tags;
    both commands execute as one parallel command-set."""
    self._basename = os.path.basename(outfile)

    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    builder = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    builder.set_option("-T", "IndelRealigner")
    builder.set_option("-R", "%(IN_REFERENCE)s")
    builder.set_option("-targetIntervals", "%(IN_INTERVALS)s")
    builder.set_option("-o", "%(OUT_BAMFILE)s")
    builder.set_option("--bam_compression", 0)
    # calmd rewrites the BAM, so GATK's own indexing would be wasted work.
    builder.set_option("--disable_bam_indexing")
    _set_input_files(builder, infiles)
    builder.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       IN_INTERVALS=intervals,
                       OUT_BAMFILE=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    calmd = AtomicCmd(["samtools", "calmd", "-b",
                       "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd",
                      CHECK_VERSION=SAMTOOLS_VERSION)

    CommandNode.__init__(
        self,
        description="<Indel Realigner (aligning): %s -> %r>"
        % (describe_files(infiles), outfile),
        command=ParallelCmds([builder.finalize(), calmd]),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Picard MarkDuplicates node built from a pre-configured parameter object."""
    PicardNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<MarkDuplicates: %s>"
        % (describe_files(parameters.input_bams),),
        dependencies=parameters.dependencies,
    )
def __init__(self, config, target_name, input_files, output_file, regions_file=None, dependencies=()):
    """Depth-histogram node using the 'depths' factory tool over multi-BAM input."""
    bam_input = MultiBAMInput(config, input_files)
    # Region-restricted runs need random access, hence a single indexed BAM.
    if regions_file and len(bam_input.files) > 1:
        raise ValueError("DepthHistogram for regions require single, "
                         "indexed input BAM file.")

    builder = factory.new("depths")
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.add_value("%(OUT_FILE)s")
    builder.set_option("--target-name", target_name)
    builder.set_kwargs(OUT_FILE=output_file)
    bam_input.setup(builder)

    if regions_file:
        builder.set_option('--regions-file', '%(IN_REGIONS)s')
        builder.set_kwargs(IN_REGIONS=regions_file)

    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [builder.finalize()]),
        description="<DepthHistogram: %s -> '%s'>"
        % (describe_files(bam_input.files), output_file),
        dependencies=dependencies,
    )
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Node validating FASTQ files; results are written to *output_file*.

    The *offset* is stored for later use; presumably the quality-score
    offset (e.g. 33/64) -- confirm against the validation routine.
    """
    self._offset = offset
    Node.__init__(
        self,
        description="<Validate FASTQ Files: %s>"
        % (describe_files(input_files)),
        input_files=input_files,
        output_files=output_file,
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Paired-end AdapterRemoval node: decompress both mate sets, run the
    trimming command, and recompress each named output in parallel."""
    self._version = parameters.version
    self._basename = parameters.basename

    n_mate_1 = len(parameters.input_files_1)
    n_mate_2 = len(parameters.input_files_2)
    if n_mate_1 != n_mate_2:
        raise CmdError("Number of mate 1 files differ from mate 2 files: %i != %i"
                       % (n_mate_1, n_mate_2))

    uncat_1 = _build_unicat_command(parameters.input_files_1, "uncompressed_input_1")
    uncat_2 = _build_unicat_command(parameters.input_files_2, "uncompressed_input_2")

    def _zip(postfix):
        # Build the compressor for one named AdapterRemoval output file.
        return _build_zip_command(parameters.output_format,
                                  parameters.output_prefix, postfix)

    zip_pair_1 = _zip(".pair1.truncated")
    zip_pair_2 = _zip(".pair2.truncated")
    zip_discarded = _zip(".discarded")
    adapterrm = parameters.command.finalize()

    commands = [adapterrm, zip_pair_1, zip_pair_2]
    # The set of outputs differs between AdapterRemoval versions.
    if parameters.version == VERSION_15:
        commands.append(_zip(".collapsed"))
        commands.append(_zip(".collapsed.truncated"))
        commands.append(_zip(".singleton.truncated"))
    else:
        commands.append(_zip(".singleton.aln.truncated"))
        commands.append(_zip(".singleton.unaln.truncated"))
    commands.extend([zip_discarded, uncat_1, uncat_2])

    # Opening of pipes block, so the order of these commands is dependent upon
    # the order of file-opens in atomiccmd and the programs themselves.
    description = "<PE_AdapterRM: %s -> '%s.*'>" \
        % (fileutils.describe_files(parameters.input_files_1).replace("file", "pair"),
           parameters.output_prefix)
    CommandNode.__init__(self,
                         command=ParallelCmds(commands),
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    """Node validating FASTA files; the outcome is recorded in *output_file*."""
    Node.__init__(
        self,
        description="<Validate FASTA Files: %s>"
        % (describe_files(input_files)),
        input_files=input_files,
        output_files=output_file,
        dependencies=dependencies,
    )

    # Internal invariant: exactly one output file must have been registered.
    assert len(self.output_files) == 1, self.output_files
def __init__(self, input_files, output_file, dependencies=()):
    """Node merging per-file coverage tables into a single *output_file*."""
    self._output_file = output_file
    summary = describe_files(input_files)
    Node.__init__(
        self,
        description="<MergeCoverage: '%s' -> '%s'>" % (summary, self._output_file),
        input_files=input_files,
        output_files=self._output_file,
        dependencies=dependencies,
    )
def __init__(self, input_files, output_file, dependencies=()):
    """Merges coverage tables from *input_files* into one *output_file*."""
    self._output_file = output_file
    Node.__init__(
        self,
        description="<MergeCoverage: '%s' -> '%s'>"
        % (describe_files(input_files), self._output_file),
        input_files=input_files,
        output_files=self._output_file,
        dependencies=dependencies,
    )
def __init__(self, config, input_bams, output_bam, dependencies=()):
    """Pipes concatenated BAM input through 'bam_rmdup_collapsed' to drop
    PCR duplicates of collapsed reads, writing the result to *output_bam*."""
    cat_cmds, cat_obj = concatenate_input_bams(config, input_bams)
    rmdup = AtomicCmd(["bam_rmdup_collapsed", "--remove-duplicates"],
                      IN_STDIN=cat_obj,
                      OUT_STDOUT=output_bam)

    CommandNode.__init__(
        self,
        command=ParallelCmds(cat_cmds + [rmdup]),
        description="<FilterCollapsedBAM: %s>" % (describe_files(input_bams),),
        dependencies=dependencies,
    )
def __init__(self, config, input_bams, output_bam, dependencies=()):
    """Removes duplicates of collapsed reads by streaming the concatenated
    BAMs into 'bam_rmdup_collapsed --remove-duplicates'."""
    cat_cmds, cat_obj = concatenate_input_bams(config, input_bams)
    filter_cmd = AtomicCmd(
        ["bam_rmdup_collapsed", "--remove-duplicates"],
        IN_STDIN=cat_obj,
        OUT_STDOUT=output_bam,
    )

    description = "<FilterCollapsedBAM: %s>" % (describe_files(input_bams),)
    CommandNode.__init__(self,
                         command=ParallelCmds(cat_cmds + [filter_cmd]),
                         description=description,
                         dependencies=dependencies)
def __init__(self, parameters):
    """mapDamage plotting node, fed by a multi-BAM input stream."""
    bam_input = MultiBAMInput(parameters.config, parameters.input_files)
    bam_input.setup(parameters.command)
    plot_cmd = parameters.command.finalize()

    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [plot_cmd]),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(parameters.input_files),
           parameters.output_directory),
        dependencies=parameters.dependencies,
    )
def __init__(self, config, input_files, output_file, dependencies=()):
    """Builds a duplicate-read histogram via the 'duphist' factory tool,
    reading from a multi-BAM input stream and writing to *output_file*."""
    bam_input = MultiBAMInput(config, input_files)

    builder = factory.new("duphist")
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.set_kwargs(OUT_STDOUT=output_file)
    bam_input.setup(builder)

    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [builder.finalize()]),
        description="<DuplicateHistogram: %s -> %r>"
        % (describe_files(input_files), output_file),
        dependencies=dependencies,
    )
def __init__(self, main_tree_files, support_tree_files, output_file, dependencies=()):
    """Node annotating main trees with support values from support trees."""
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    Node.__init__(
        self,
        description="<NewickSupport: %s>" % (describe_files(main_tree_files),),
        # Both tree sets are required inputs.
        input_files=self._main_tree_files + self._support_tree_files,
        output_files=output_file,
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """mapDamage rescaling node; remembers the working directory for later use."""
    self._directory = parameters.directory

    bam_input = MultiBAMInput(parameters.config, parameters.input_files)
    bam_input.setup(parameters.command)
    rescale_cmd = parameters.command.finalize()

    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [rescale_cmd]),
        description="<mapDamage (rescale): %s -> %r>"
        % (describe_files(parameters.input_files), parameters.output_file),
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Single-end adapter-removal node: decompress input, trim, recompress
    the truncated and discarded outputs in parallel."""
    self._quality_offset = parameters.quality_offset
    self._basename = parameters.basename

    zcat = _build_cat_command(parameters.input_files, "uncompressed_input")
    zip_truncated = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".truncated")
    zip_discarded = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".discarded")
    adapterrm = parameters.command.finalize()

    description = "<AdapterRM (SE): %s -> '%s.*'>" \
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix)
    CommandNode.__init__(
        self,
        command=ParallelCmds([adapterrm, zip_discarded, zip_truncated, zcat]),
        description=description,
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """Single-end AdapterRemoval node: uncompress input, trim adapters, and
    recompress the resulting files in one parallel command-set."""
    self._basename = parameters.basename

    zcat = _build_unicat_command(parameters.input_files, "uncompressed_input")
    zip_truncated = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".truncated")
    zip_discarded = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".discarded")
    adapterrm = parameters.command.finalize()

    # Opening of pipes block, so the order of these commands is dependent upon
    # the order of file-opens in atomiccmd and the programs themselves.
    commands = ParallelCmds([adapterrm, zip_discarded, zip_truncated, zcat])
    CommandNode.__init__(
        self,
        command=commands,
        description="<SE_AdapterRM: %s -> '%s.*'>"
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix),
        dependencies=parameters.dependencies,
    )
def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
    """Filters collapsed reads via the 'rmdup_collapsed' factory tool;
    duplicates are removed only when *keep_dupes* is False."""
    bam_input = MultiBAMInput(config, input_bams)

    builder = factory.new("rmdup_collapsed")
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.set_kwargs(OUT_STDOUT=output_bam)
    bam_input.setup(builder)

    if not keep_dupes:
        builder.set_option("--remove-duplicates")

    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [builder.finalize()]),
        description="<FilterCollapsedBAM: %s>"
        % (describe_files(bam_input.files),),
        dependencies=dependencies,
    )
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Node rerooting Newick trees, either at the midpoint (default) or on
    the outgroup formed by *taxa*."""
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        # Render the taxa as a quoted, sorted list for the description.
        reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))
    else:
        reroot_on = "midpoint"

    Node.__init__(
        self,
        description="<NewickReroot (on %s): %s>"
        % (reroot_on, describe_files(tree_files),),
        input_files=self._tree_files,
        output_files=self._output_file,
        dependencies=dependencies,
    )
def __init__(self, config, input_files, output_file, dependencies=()):
    """Duplicate-histogram node: streams multi-BAM input through the
    'duphist' tool, capturing stdout into *output_file*."""
    bam_input = MultiBAMInput(config, input_files)

    duphist = factory.new("duphist")
    duphist.add_value('%(TEMP_IN_BAM)s')
    duphist.set_kwargs(OUT_STDOUT=output_file)
    bam_input.setup(duphist)
    duphist_cmd = duphist.finalize()

    description = "<DuplicateHistogram: %s -> %r>" \
        % (describe_files(input_files), output_file)
    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [duphist_cmd]),
        description=description,
        dependencies=dependencies,
    )
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """GATK RealignerTargetCreator node: identifies intervals that need
    indel realignment and writes them to *outfile*."""
    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    gatk = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    gatk.set_option("-T", "RealignerTargetCreator")
    gatk.set_option("-R", "%(IN_REFERENCE)s")
    gatk.set_option("-o", "%(OUT_INTERVALS)s")
    _set_input_files(gatk, infiles)
    gatk.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        OUT_INTERVALS=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    CommandNode.__init__(
        self,
        description="<Indel Realigner (training): %s -> %r>"
        % (describe_files(infiles), outfile),
        command=gatk.finalize(),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Single-end adapter-removal node; trimming, decompression and
    recompression all run as one parallel command-set."""
    self._quality_offset = parameters.quality_offset
    self._basename = parameters.basename

    uncompress = _build_cat_command(parameters.input_files, "uncompressed_input")
    compress_kept = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".truncated")
    compress_dropped = _build_zip_command(parameters.output_format,
                                          parameters.output_prefix, ".discarded")
    trimmer = parameters.command.finalize()

    CommandNode.__init__(
        self,
        command=ParallelCmds([trimmer, compress_dropped, compress_kept, uncompress]),
        description="<AdapterRM (SE): %s -> '%s.*'>"
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix),
        dependencies=parameters.dependencies,
    )
def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
    """Filters collapsed reads from BAMs using 'rmdup_collapsed'; passing
    keep_dupes=False additionally removes the duplicates."""
    bam_input = MultiBAMInput(config, input_bams)

    rmdup = factory.new("rmdup_collapsed")
    rmdup.add_value("%(TEMP_IN_BAM)s")
    rmdup.set_kwargs(OUT_STDOUT=output_bam)
    bam_input.setup(rmdup)

    if not keep_dupes:
        rmdup.set_option("--remove-duplicates")

    filter_cmd = rmdup.finalize()
    description = "<FilterCollapsedBAM: %s>" % (describe_files(bam_input.files),)
    MultiBAMInputNode.__init__(self,
                               bam_input=bam_input,
                               command=ParallelCmds(bam_input.commands + [filter_cmd]),
                               description=description,
                               dependencies=dependencies)
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """Builds and wraps a GATK RealignerTargetCreator invocation that writes
    realignment-target intervals to *outfile*."""
    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    builder = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    for option, value in (("-T", "RealignerTargetCreator"),
                          ("-R", "%(IN_REFERENCE)s"),
                          ("-o", "%(OUT_INTERVALS)s")):
        builder.set_option(option, value)
    _set_input_files(builder, infiles)
    builder.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       OUT_INTERVALS=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    description = "<Indel Realigner (training): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         description=description,
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file, regions_file=None, dependencies=()):
    """Depth-histogram node over multi-BAM input via the 'depths' tool;
    restricting to regions requires a single indexed BAM."""
    bam_input = MultiBAMInput(config, input_files)
    if regions_file and len(bam_input.files) > 1:
        raise ValueError("DepthHistogram for regions require single, "
                         "indexed input BAM file.")

    depths = factory.new("depths")
    depths.add_value("%(TEMP_IN_BAM)s")
    depths.add_value("%(OUT_FILE)s")
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(OUT_FILE=output_file)
    bam_input.setup(depths)

    if regions_file:
        depths.set_option("--regions-file", "%(IN_REGIONS)s")
        depths.set_kwargs(IN_REGIONS=regions_file)

    description = "<DepthHistogram: %s -> '%s'>" \
        % (describe_files(bam_input.files), output_file)
    MultiBAMInputNode.__init__(
        self,
        bam_input=bam_input,
        command=ParallelCmds(bam_input.commands + [depths.finalize()]),
        description=description,
        dependencies=dependencies,
    )
def __init__(self, config, target_name, input_files, output_file, intervals_file=None, print_stats=False, max_contigs=_MAX_CONTIGS, dependencies=()):
    """Legacy depth-histogram node using coverageBed/genomeCoverageBed over
    concatenated BAM input; configuration is stored for later stages."""
    self._target_name = target_name
    self._input_files = safe_coerce_to_tuple(input_files)
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    # Register each BAM together with its .bai index as inputs.
    indexes = [swap_ext(fpath, ".bai") for fpath in self._input_files]
    node_inputs = list(self._input_files) + indexes
    if intervals_file:
        node_inputs.append(intervals_file)

    if intervals_file:
        executables = ["coverageBed"]
    else:
        executables = ["genomeCoverageBed"]

    auxiliary_files = []
    for cmd in concatenate_input_bams(config, self._input_files)[0]:
        executables.extend(cmd.executables)
        auxiliary_files.extend(cmd.auxiliary_files)

    Node.__init__(
        self,
        description="<DepthHistogram: %s -> '%s'>"
        % (describe_files(self._input_files), self._output_file),
        input_files=node_inputs,
        output_files=self._output_file,
        dependencies=dependencies,
        executables=executables,
        auxiliary_files=auxiliary_files,
    )
def __init__(self, parameters):
    """Paired-end AdapterRemoval node: mates are decompressed, trimmed, and
    the version-specific set of outputs is recompressed in parallel."""
    self._version = parameters.version
    self._basename = parameters.basename

    if len(parameters.input_files_1) != len(parameters.input_files_2):
        raise CmdError("Number of mate 1 files differ from mate 2 files: %i != %i"
                       % (len(parameters.input_files_1),
                          len(parameters.input_files_2)))

    zcat_pair_1 = _build_unicat_command(parameters.input_files_1,
                                        "uncompressed_input_1")
    zcat_pair_2 = _build_unicat_command(parameters.input_files_2,
                                        "uncompressed_input_2")

    def _compressor(postfix):
        # One recompression command per named AdapterRemoval output.
        return _build_zip_command(parameters.output_format,
                                  parameters.output_prefix, postfix)

    zip_pair_1 = _compressor(".pair1.truncated")
    zip_pair_2 = _compressor(".pair2.truncated")
    zip_discarded = _compressor(".discarded")
    adapterrm = parameters.command.finalize()

    commands = [adapterrm, zip_pair_1, zip_pair_2]
    if parameters.version == VERSION_15:
        # v1.5 produces collapsed read outputs.
        commands += [_compressor(".collapsed"),
                     _compressor(".collapsed.truncated"),
                     _compressor(".singleton.truncated")]
    else:
        commands += [_compressor(".singleton.aln.truncated"),
                     _compressor(".singleton.unaln.truncated")]
    commands += [zip_discarded, zcat_pair_1, zcat_pair_2]

    # Opening of pipes block, so the order of these commands is dependent upon
    # the order of file-opens in atomiccmd and the programs themselves.
    description = "<PE_AdapterRM: %s -> '%s.*'>" \
        % (fileutils.describe_files(parameters.input_files_1).replace("file", "pair"),
           parameters.output_prefix)
    CommandNode.__init__(self,
                         command=ParallelCmds(commands),
                         description=description,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Single-end AdapterRemoval node; decompression, trimming and
    recompression are combined into one parallel command-set."""
    self._basename = parameters.basename

    uncompress = _build_unicat_command(parameters.input_files,
                                       "uncompressed_input")
    compress_kept = _build_zip_command(parameters.output_format,
                                       parameters.output_prefix, ".truncated")
    compress_dropped = _build_zip_command(parameters.output_format,
                                          parameters.output_prefix, ".discarded")
    trimmer = parameters.command.finalize()

    # Opening of pipes block, so the order of these commands is dependent upon
    # the order of file-opens in atomiccmd and the programs themselves.
    command_set = ParallelCmds([trimmer, compress_dropped,
                                compress_kept, uncompress])
    CommandNode.__init__(
        self,
        command=command_set,
        description="<SE_AdapterRM: %s -> '%s.*'>"
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix),
        dependencies=parameters.dependencies,
    )
def test_describe_files__iterable():
    # Iterators (not just sequences) must be accepted.
    paths = iter(("/var/foo/bar", "/var/foo/foo"))
    assert_equal(describe_files(paths), "2 files in '/var/foo'")
def test_describe_files__no_files():
    # An empty collection yields the fixed "No files" label.
    result = describe_files(())
    assert_equal(result, "No files")
def test_describe_files__same_path_abs__1_differences():
    # Filenames differing in a single character are merged with a '?' glob.
    paths = ("/var/foo/faz", "/var/foo/fao")
    assert_equal(describe_files(paths), "'/var/foo/fa?'")
def test_describe_files__same_path_rel():
    # Relative paths sharing a directory are summarized by count and dir.
    paths = ("var/foo/bar", "var/foo/foo")
    assert_equal(describe_files(paths), "2 files in 'var/foo'")
def test_describe_files__single_file():
    # A single file is described by its repr()'d path.
    path = "/var/foo/bar"
    assert_equal(describe_files((path,)), repr(path))
def test_describe_files__same_path_abs__3_differences():
    # Too many character differences: fall back to count + directory.
    paths = ("/var/foo/bar", "/var/foo/foo")
    assert_equal(describe_files(paths), "2 files in '/var/foo'")
def test_describe_files__different_paths_rel():
    # No shared directory: only the file count is reported.
    paths = ("var/foo/bar", "var/bar/foo")
    assert_equal(describe_files(paths), "2 files")
def test_describe_files__single_file_2():
    # Renamed from test_describe_files__single_file: that name is already
    # defined earlier in this module, so this second definition shadowed the
    # first and only one of the two tests was ever collected/run.
    fpath = "/var/foo/bar"
    assert_equal(describe_files((fpath, )), repr(fpath))