def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
    """Build the BWA backtrack nodes for a pair of mate FASTQ files.

    "input_file" and "output_file" are popped from `parameters`; the input
    value is used as a template that is formatted with `Pair=1` and
    `Pair=2`. One 'aln' node is built per mate via
    `_build_bwa_backtrack_aln`, and the returned BWASampe node depends on
    both of them. `config`, `prefix` and `record` are not used here.
    """
    fq_template = parameters.pop("input_file")
    bam_path = parameters.pop("output_file")

    sai_paths = {}
    fastq_paths = {}
    aln_nodes = []
    for mate in (1, 2):
        sai_paths[mate] = swap_ext(bam_path, "%i.sai" % (mate,))
        fastq_paths[mate] = fq_template.format(Pair=mate)
        aln_nodes.append(
            self._build_bwa_backtrack_aln(
                parameters=parameters,
                input_file=fastq_paths[mate],
                output_file=sai_paths[mate],
            )
        )

    return BWASampe(
        input_file_sai_1=sai_paths[1],
        input_file_sai_2=sai_paths[2],
        input_file_fq_1=fastq_paths[1],
        input_file_fq_2=fastq_paths[2],
        output_file=bam_path,
        prefix=parameters["prefix"],
        reference=parameters["reference"],
        mapping_options=self.options["Aligners"]["BWA"],
        cleanup_options=self._cleanup_options("BWA"),
        dependencies=tuple(aln_nodes),
    )
def _build_examl_bootstraps(options, phylo, destination, input_alignment,
                            input_partition, dependencies):
    """Build the node chain for every ExaML bootstrap replicate.

    The number of replicates is read from phylo["ExaML"]["Bootstraps"].
    Each replicate gets a PHYLIPBootstrapNode, an ExaMLParserNode and the
    nodes returned by `_examl_nodes`, written below
    "<destination>/bootstraps". Returns the result of
    `_build_rerooted_trees` for the collected nodes, or None when no
    bootstraps were requested.
    """
    replicate_count = phylo["ExaML"]["Bootstraps"]
    output_dir = os.path.join(destination, "bootstraps")
    alignment_template = os.path.join(output_dir, "bootstrap.%04i.phy")

    examl_nodes = []
    for replicate in xrange(replicate_count):
        alignment = alignment_template % (replicate,)

        # A fresh random seed is drawn for every replicate
        seed = random.randint(1, 2**32 - 1)
        resample_node = PHYLIPBootstrapNode(input_alignment=input_alignment,
                                            input_partition=input_partition,
                                            output_alignment=alignment,
                                            seed=seed,
                                            dependencies=dependencies)

        binary = swap_ext(alignment, ".binary")
        # The literal "%s" is kept in the name; it is passed on as a template
        final_template = swap_ext(alignment, ".%s")

        parser_node = ExaMLParserNode(input_alignment=alignment,
                                      input_partition=input_partition,
                                      output_file=binary,
                                      dependencies=resample_node)

        examl_nodes.append(_examl_nodes(options=options,
                                        settings=phylo,
                                        input_alignment=alignment,
                                        input_partitions=input_partition,
                                        input_binary=binary,
                                        output_template=final_template,
                                        dependencies=parser_node))

    if not examl_nodes:
        return None

    return _build_rerooted_trees(examl_nodes, phylo["RootTreesOn"])
def _setup(self, config, temp):
    """Prepare the temporary folder before the command is run.

    Symlinks the input file and its sibling ".bim" and ".fam" files into
    `temp`. When the node is supervised, a ".pop" file is also written to
    `temp`, containing one group label per line of the ".fam" file ("-"
    for samples without a label for the current number of groups).
    """
    CommandNode._setup(self, config, temp)

    bed_path = self._input_file
    sources = (
        bed_path,
        fileutils.swap_ext(bed_path, ".bim"),
        fileutils.swap_ext(bed_path, ".fam"),
    )
    for source in sources:
        link_path = os.path.join(temp, os.path.basename(source))
        os.symlink(os.path.abspath(source), link_path)

    if not self._supervised:
        return

    fam_path = fileutils.swap_ext(bed_path, ".fam")
    pop_path = fileutils.reroot_path(temp,
                                     fileutils.swap_ext(fam_path, ".pop"))
    group_key = "Group(%i)" % (self._k_groups,)

    with open(fam_path) as fam_handle, open(pop_path, "w") as pop_handle:
        for row in fam_handle:
            # Only the first whitespace-separated column is used
            sample, _ = row.split(None, 1)
            label = self._samples.get(sample, {}).get(group_key, "-")
            pop_handle.write("%s\n" % (label,))
def _setup(self, config, temp):
    """Link the input files into `temp` and, if supervised, write a .pop file.

    The input file plus its ".bim" and ".fam" siblings are symlinked into
    the temporary folder under their base names. For supervised runs, the
    ".fam" file is read line by line and the matching group label (or "-"
    when none is recorded) is written to a ".pop" file inside `temp`.
    """
    CommandNode._setup(self, config, temp)

    main_input = self._input_file
    for extension in (None, ".bim", ".fam"):
        if extension is None:
            source = main_input
        else:
            source = fileutils.swap_ext(main_input, extension)

        target = os.path.join(temp, os.path.basename(source))
        os.symlink(os.path.abspath(source), target)

    if not self._supervised:
        return

    fam_path = fileutils.swap_ext(main_input, ".fam")
    pop_path = fileutils.swap_ext(fam_path, ".pop")
    pop_path = fileutils.reroot_path(temp, pop_path)
    group_key = "Group(%i)" % (self._k_groups,)

    with open(fam_path) as fam_handle, open(pop_path, "w") as pop_handle:
        for row in fam_handle:
            # Only the first whitespace-separated column is used
            sample, _ = row.split(None, 1)
            sample_groups = self._samples.get(sample, {})
            pop_handle.write("%s\n" % (sample_groups.get(group_key, "-"),))
def _setup(self, config, temp):
    """Create the input pipe or symlink inside the temporary folder.

    With more than one input BAM a named pipe is created at
    `temp/PIPE_FILE`. Otherwise the single input BAM is symlinked to that
    location and, when an index format is set, its index file is
    symlinked next to it.
    """
    CommandNode._setup(self, config, temp)

    pipe_path = os.path.join(temp, self.PIPE_FILE)
    if len(self._input_bams) > 1:
        os.mkfifo(pipe_path)
        return

    bam_path = os.path.abspath(self._input_bams[0])
    os.symlink(bam_path, pipe_path)

    if self._index_format:
        index_source = swap_ext(bam_path, self._index_format)
        index_link = swap_ext(pipe_path, self._index_format)
        os.symlink(index_source, index_link)
def _setup(self, config, temp):
    """Set up `temp/PIPE_FILE` as either a FIFO or a symlink to the input.

    Multiple input BAMs: a named pipe is created. Single input BAM: the
    file is symlinked directly, together with its index when
    `self._index_format` is set.
    """
    CommandNode._setup(self, config, temp)

    destination = os.path.join(temp, self.PIPE_FILE)
    single_input = not (len(self._input_bams) > 1)

    if single_input:
        source = os.path.abspath(self._input_bams[0])
        os.symlink(source, destination)

        if self._index_format:
            os.symlink(swap_ext(source, self._index_format),
                       swap_ext(destination, self._index_format))
    else:
        os.mkfifo(destination)
def customize(cls, config, input_bams, output_bam, output_metrics=None,
              keep_dupes=False, dependencies=()):
    """Assemble the Picard MarkDuplicates command for `input_bams`.

    Returns a dict with the (unfinalized) command builder under "command"
    and `dependencies` under "dependencies". When `output_metrics` is not
    given, the metrics file is `output_bam` with the extension swapped to
    ".metrics". Unless `keep_dupes` is set, REMOVE_DUPLICATES is enabled.
    """
    picard = picard_command(config, "MarkDuplicates")
    _set_max_open_files(picard, "MAX_FILE_HANDLES")

    picard.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    picard.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
    # Validation is mostly left to manual ValidateSamFile runs; required
    # because .csi indexed BAM records can have "invalid" bins.
    picard.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
    picard.add_multiple_options("I", input_bams, sep="=")

    if not keep_dupes:
        # Remove duplicates from output by default to save disk-space
        picard.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    if not output_metrics:
        output_metrics = swap_ext(output_bam, ".metrics")

    picard.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

    return {"command": picard, "dependencies": dependencies}
def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
    """Describe how one or more BAM files are fed to a command via a pipe.

    With several inputs, a Picard MergeSamFiles command writing to the
    pipe is stored in `self.commands`. With a single input, no command is
    needed; the file (and its ".bai" index when `indexed` is true) is
    instead registered in `self.kwargs`.
    """
    self.pipe = pipename
    self.indexed = indexed
    self.files = safe_coerce_to_tuple(input_bams)
    self.commands = []
    self.kwargs = {"TEMP_IN_BAM": self.pipe}

    if not (len(self.files) > 1):
        # Ensure that the actual command depends on the input
        only_file = self.files[0]
        self.kwargs["IN_FILE_00"] = only_file
        if indexed:
            self.kwargs["IN_FILE_01"] = swap_ext(only_file, ".bai")
        return

    merge = picard_command(config, "MergeSamFiles")
    merge.set_option("SO", "coordinate", sep="=", fixed=False)
    merge.set_option("CREATE_INDEX", "False", sep="=")
    merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
    merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
    merge.add_multiple_options("I", input_bams, sep="=")
    merge.set_kwargs(TEMP_OUT_BAM=self.pipe)

    self.commands = [merge.finalize()]
def __init__(
    self,
    config,
    input_bam,
    input_index=None,
    output_log=None,
    ignored_checks=(),
    big_genome_mode=False,
    dependencies=(),
):
    """Create a node running Picard ValidateSamFile on `input_bam`.

    Each entry in `ignored_checks` is passed as an IGNORE option. The
    log is written to `output_log`, defaulting to `input_bam` with the
    extension swapped to ".validated".
    """
    picard = picard_command(config, "ValidateSamFile")
    _set_max_open_files(picard, "MAX_OPEN_TEMP_FILES")

    # NOTE(review): the guard here was written as
    # `if True or big_genome_mode:`, i.e. always true, so the big-genome
    # configuration is applied no matter what `big_genome_mode` is.
    # Confirm whether the argument is meant to have an effect.
    self._configure_for_big_genome(config, picard)

    picard.set_option("I", "%(IN_BAM)s", sep="=")
    for ignored in ignored_checks:
        picard.add_option("IGNORE", ignored, sep="=")

    if not output_log:
        output_log = swap_ext(input_bam, ".validated")

    picard.set_kwargs(IN_BAM=input_bam,
                      IN_INDEX=input_index,
                      OUT_STDOUT=output_log)

    PicardNode.__init__(
        self,
        command=picard.finalize(),
        description="<Validate BAM: '%s'>" % (input_bam,),
        dependencies=dependencies,
    )
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Create a node running the "build_tped" command on `bamfile`.

    The outputs (common.tfam, common.summary, incl_ts.tped and
    excl_ts.tped) are placed in `output_root`. When `downsample` is
    falsy, the ".bai" index of `bamfile` is registered as an extra input.
    """
    builder = factory.new("build_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    for value in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        builder.add_value(value)

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    file_kwargs = {
        "OUT_TFAM": os.path.join(output_root, "common.tfam"),
        "OUT_SUMMARY": os.path.join(output_root, "common.summary"),
        "OUT_TPED_INCL_TS": os.path.join(output_root, "incl_ts.tped"),
        "OUT_TPED_EXCL_TS": os.path.join(output_root, "excl_ts.tped"),
        "IN_TABLE": table,
        "IN_BAM": bamfile,
    }
    builder.set_kwargs(**file_kwargs)

    output_glob = os.path.join(output_root, '*')
    CommandNode.__init__(self,
                         description="<BuildTPEDFiles -> %r>" % (output_glob,),
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, config, reference, infiles, outfile, threads=1,
             dependencies=()):
    """Create a node running GATK RealignerTargetCreator.

    `infiles` may be a single file or several; the intervals are written
    to `outfile`. The thread count is first passed through
    `_get_max_threads`, and the ".dict" file next to `reference` is
    registered as an input.
    """
    threads = _get_max_threads(reference, threads)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    gatk = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)

    gatk.set_option("-T", "RealignerTargetCreator")
    gatk.set_option("-R", "%(IN_REFERENCE)s")
    gatk.set_option("-o", "%(OUT_INTERVALS)s")
    gatk.set_option("-nt", threads)

    _set_input_files(gatk, infiles)

    reference_dict = fileutils.swap_ext(reference, ".dict")
    gatk.set_kwargs(IN_REFERENCE=reference,
                    IN_REF_DICT=reference_dict,
                    OUT_INTERVALS=outfile,
                    CHECK_GATK=_get_gatk_version_check(config))

    summary = (describe_files(infiles), outfile)
    description = "<GATK Indel Realigner (training): %s -> %r>" % summary

    CommandNode.__init__(self,
                         threads=threads,
                         description=description,
                         command=gatk.finalize(),
                         dependencies=dependencies)
def _teardown(self, config, temp):
    """Remove the pipe/symlink (and index link) before normal teardown.

    Deletes `temp/PIPE_FILE` and, when an index format is set, the
    matching index file next to it, then defers to
    `CommandNode._teardown`.
    """
    pipe_path = os.path.join(temp, self.PIPE_FILE)
    os.remove(pipe_path)

    if self._index_format:
        index_name = swap_ext(self.PIPE_FILE, self._index_format)
        os.remove(os.path.join(temp, index_name))

    CommandNode._teardown(self, config, temp)
def __init__(self, output_root, table, bamfile, downsample, dependencies=()):
    """Create a node running the "zonkey_tped" command on `bamfile`.

    The outputs (common.tfam, common.summary, incl_ts.tped and
    excl_ts.tped) are placed in `output_root`. When `downsample` is
    falsy, the ".bai" index of `bamfile` is registered as an extra input.
    """
    builder = factory.new("zonkey_tped")
    builder.set_option("--name", "Sample")
    builder.set_option("--downsample", downsample)
    for value in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
        builder.add_value(value)

    if not downsample:
        # Needed for random access (chromosomes are read 1 ... 31)
        builder.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

    file_kwargs = {
        "OUT_TFAM": os.path.join(output_root, "common.tfam"),
        "OUT_SUMMARY": os.path.join(output_root, "common.summary"),
        "OUT_TPED_INCL_TS": os.path.join(output_root, "incl_ts.tped"),
        "OUT_TPED_EXCL_TS": os.path.join(output_root, "excl_ts.tped"),
        "IN_TABLE": table,
        "IN_BAM": bamfile,
    }
    builder.set_kwargs(**file_kwargs)

    output_glob = os.path.join(output_root, '*')
    CommandNode.__init__(self,
                         description="<BuildTPEDFiles -> %r>" % (output_glob,),
                         command=builder.finalize(),
                         dependencies=dependencies)
def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
    """Build the BWA backtrack nodes for paired-end reads.

    "input_file" (a template formatted with `Pair=1` / `Pair=2`) and
    "output_file" are popped from `parameters`. One 'aln' node is built
    per mate, a BWASampe command is customized to depend on both, and the
    result is handed to `_finalize_nodes`, whose return value is returned.
    `record` is not used here.
    """
    fq_template = parameters.pop("input_file")
    bam_path = parameters.pop("output_file")

    fastq_by_mate = {}
    sai_by_mate = {}
    aln_nodes = []
    for mate in (1, 2):
        fastq_by_mate[mate] = fq_template.format(Pair=mate)
        sai_by_mate[mate] = swap_ext(bam_path, "%i.sai" % (mate,))

        aln_nodes.append(
            self._build_bwa_backtrack_aln(parameters=parameters,
                                          input_file=fastq_by_mate[mate],
                                          output_file=sai_by_mate[mate]))

    sampe = BWASampe.customize(input_file_sai_1=sai_by_mate[1],
                               input_file_sai_2=sai_by_mate[2],
                               input_file_fq_1=fastq_by_mate[1],
                               input_file_fq_2=fastq_by_mate[2],
                               output_file=bam_path,
                               prefix=parameters['prefix'],
                               reference=parameters["reference"],
                               dependencies=aln_nodes)

    return self._finalize_nodes(config, prefix, parameters, sampe)
def _collect_subsets(roi, subset, path):
    """Load and validate the sequence names of one subset of a region.

    The subset file is the region's BED file with its extension swapped
    to "<subset>.names"; blank lines and lines starting with "#" are
    skipped. On success the file and the names are recorded under
    "SubsetFiles" and "Sequences" in `subsets_by_regions[roi]`.

    Raises MakefileError if the region is unknown, the subset file is
    missing, or it names sequences that are not known for the region.
    """
    if roi not in subsets_by_regions:
        raise MakefileError("Subset of unknown region (%r) requested at %r"
                            % (roi, path))

    region = subsets_by_regions[roi]
    roi_fname = swap_ext(region["BED"], subset + ".names")
    if not os.path.isfile(roi_fname):
        raise MakefileError("Subset file does not exist for Regions Of "
                            "Interest:\n Region = %r\n Subset = %r\n"
                            " Path = %r"
                            % (roi, subset, roi_fname))

    with open(roi_fname) as handle:
        stripped = (raw.strip() for raw in handle)
        sequences = set(name for name in stripped
                        if name and not name.startswith("#"))

    unexpected = sequences - subsets_by_regions[roi]["Sequences"][None]
    if unexpected:
        header = ("Unknown sequences in subset file:\n"
                  " File = %r\n Region = %r\n Subset = %r\n"
                  " Unknown sequence names =") \
            % (roi_fname, roi, subset)

        # Report at most five names, followed by an ellipsis
        listed = sorted(unexpected)
        if len(listed) > 5:
            listed = listed[:5] + ["..."]

        raise MakefileError("\n - ".join([header] + listed))

    subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname,)
    subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Create a node that indexes `infile` using "samtools index".

    The BAM is symlinked into the working folder, indexed there, and the
    resulting index is moved next to the original file. `index_format`
    must be '.bai' or '.csi' (the latter adds "-c" to the samtools call);
    any other value raises ValueError.
    """
    bam_name = os.path.basename(infile)

    if index_format == '.bai':
        version_check = SAMTOOLS_VERSION
        extra_flags = []
    elif index_format == '.csi':
        version_check = SAMTOOLS_VERSION_1x
        extra_flags = ["-c"]
    else:
        raise ValueError("Unknown format type %r; expected .bai or .csi"
                         % (index_format,))

    index_call = ["samtools", "index"] + extra_flags + ["%(TEMP_IN_BAM)s"]

    link_step = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                          IN_BAM=infile,
                          TEMP_OUT_BAM=bam_name,
                          set_cwd=True)

    index_step = AtomicCmd(index_call,
                           TEMP_IN_BAM=bam_name,
                           CHECK_SAM=version_check)

    move_step = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                          TEMP_IN_BAM=bam_name + index_format,
                          OUT_BAM=swap_ext(infile, index_format))

    steps = SequentialCmds((link_step, index_step, move_step))

    format_label = index_format[1:].upper()
    CommandNode.__init__(self,
                         description="<BAMIndex (%s): '%s'>"
                         % (format_label, infile),
                         command=steps,
                         dependencies=dependencies)
def __init__(self, infile, index_format='.bai', dependencies=()):
    """Create a node building a '.bai' or '.csi' index for `infile`.

    Three commands are run in sequence: symlink the BAM into the working
    folder, run "samtools index" on the link (with "-c" for '.csi'), and
    move the index next to the original BAM. An unrecognised
    `index_format` raises ValueError.
    """
    link_name = os.path.basename(infile)

    if index_format == '.bai':
        required_version = SAMTOOLS_VERSION
        samtools_args = ["samtools", "index", "%(TEMP_IN_BAM)s"]
    elif index_format == '.csi':
        required_version = SAMTOOLS_VERSION_1x
        samtools_args = ["samtools", "index", "-c", "%(TEMP_IN_BAM)s"]
    else:
        message = "Unknown format type %r; expected .bai or .csi"
        raise ValueError(message % (index_format,))

    pipeline = (
        AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                  IN_BAM=infile,
                  TEMP_OUT_BAM=link_name,
                  set_cwd=True),
        AtomicCmd(samtools_args,
                  TEMP_IN_BAM=link_name,
                  CHECK_SAM=required_version),
        AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                  TEMP_IN_BAM=link_name + index_format,
                  OUT_BAM=swap_ext(infile, index_format)),
    )

    description = "<BAMIndex (%s): '%s'>" % (index_format[1:].upper(),
                                             infile)
    CommandNode.__init__(self,
                         description=description,
                         command=SequentialCmds(pipeline),
                         dependencies=dependencies)
def index_and_validate_bam(config, prefix, node, log_file=None,
                           create_index=True):
    """Return a validation node for the BAM produced by `node`.

    If the input has no index and `create_index` is true, a BAMIndexNode
    is inserted before validation. The ".bai" file is registered as an
    input of the validation command whenever an index exists or is being
    created. `prefix` is currently unused (see the FIXME below).
    """
    input_file, has_index = _get_input_file(node)

    if create_index and not has_index:
        node = BAMIndexNode(infile=input_file, dependencies=node)

    validation = ValidateBAMNode.customize(config=config,
                                           input_bam=input_file,
                                           output_log=log_file,
                                           dependencies=node)

    # Ensure that the validation node is re-run if the index changes
    if has_index or create_index:
        validation.command.set_kwargs(IN_BAI=swap_ext(input_file, ".bai"))

    # FIXME: Checking MD tags against the reference sequence (passing
    #        prefix["Reference"] via the "R" option) is disabled due to
    #        issues with Picard/Samtools disagreeing, and for backwards
    #        compatibility. See the discussion at
    #        http://sourceforge.net/mailarchive/message.php?msg_id=31348639

    # Ignored since we may filter out misses and low-quality hits during
    # mapping, which leads to a large proportion of missing PE mates.
    validation.command.add_option("IGNORE", "MATE_NOT_FOUND", sep="=")

    # Ignored due to high rate of false positives for lanes with few hits,
    # where high-quality reads may cause mis-identification of qualities
    validation.command.add_option("IGNORE", "INVALID_QUALITY_FORMAT",
                                  sep="=")

    return validation.build_node()
def _collect_subsets(roi, subset, path):
    """Read the "<subset>.names" file of a region and register its names.

    The file name is derived from the region's BED file. Empty lines and
    "#" comments are ignored. The collected names must all be known
    sequences of the region; they are then stored (as a frozenset) along
    with the file name in `subsets_by_regions[roi]`.

    Raises MakefileError for an unknown region, a missing subset file, or
    unknown sequence names (at most five are listed in the message).
    """
    if roi not in subsets_by_regions:
        raise MakefileError("Subset of unknown region (%r) requested at %r"
                            % (roi, path))

    roi_fname = swap_ext(subsets_by_regions[roi]["BED"], subset + ".names")
    if not os.path.isfile(roi_fname):
        missing_msg = ("Subset file does not exist for Regions Of "
                       "Interest:\n Region = %r\n Subset = %r\n"
                       " Path = %r")
        raise MakefileError(missing_msg % (roi, subset, roi_fname))

    sequences = set()
    with open(roi_fname) as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if not name or name.startswith("#"):
                continue
            sequences.add(name)

    known_seqs = subsets_by_regions[roi]["Sequences"][None]
    unknown_seqs = sequences - known_seqs
    if unknown_seqs:
        lines = [("Unknown sequences in subset file:\n"
                  " File = %r\n Region = %r\n Subset = %r\n"
                  " Unknown sequence names =")
                 % (roi_fname, roi, subset)]

        ordered = sorted(unknown_seqs)
        if len(ordered) > 5:
            lines.extend(ordered[:5])
            lines.append("...")
        else:
            lines.extend(ordered)

        raise MakefileError("\n - ".join(lines))

    subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname,)
    subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)
def __init__(self, config, reference, intervals, infiles, outfile,
             dependencies=()):
    """Create a node running GATK IndelRealigner alongside samtools calmd.

    The GATK command reads `infiles` and `intervals` and writes an
    uncompressed, unindexed BAM to `outfile`. A "samtools calmd -b"
    command is set up on a temporary file named after `outfile`, and
    both commands are wrapped in ParallelCmds.
    """
    self._basename = os.path.basename(outfile)

    infiles = safe_coerce_to_tuple(infiles)
    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    realigner = AtomicJavaCmdBuilder(gatk_jar,
                                     jre_options=config.jre_options)

    realigner.set_option("-T", "IndelRealigner")
    realigner.set_option("-R", "%(IN_REFERENCE)s")
    realigner.set_option("-targetIntervals", "%(IN_INTERVALS)s")
    realigner.set_option("-o", "%(OUT_BAMFILE)s")
    realigner.set_option("--bam_compression", 0)
    realigner.set_option("--disable_bam_indexing")

    _set_input_files(realigner, infiles)

    reference_dict = fileutils.swap_ext(reference, ".dict")
    realigner.set_kwargs(IN_REFERENCE=reference,
                         IN_REF_DICT=reference_dict,
                         IN_INTERVALS=intervals,
                         OUT_BAMFILE=outfile,
                         CHECK_GATK=_get_gatk_version_check(config))

    calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
    calmd = AtomicCmd(calmd_call,
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd",
                      CHECK_VERSION=SAMTOOLS_VERSION)

    summary = (describe_files(infiles), outfile)
    description = "<GATK Indel Realigner (aligning): %s -> %r>" % summary

    CommandNode.__init__(self,
                         description=description,
                         command=ParallelCmds([realigner.finalize(), calmd]),
                         dependencies=dependencies)
def __init__(self, input_file, k_groups, output_root, samples=None,
             dependencies=()):
    """Create a node running "admixture" on `input_file` with K groups.

    `k_groups` must be 2 or 3. The run is supervised ("--supervised")
    when `samples` is given and at least one of its rows has a value
    other than '-' for the "Group(<k_groups>)" key. The .P, .Q and .log
    outputs are written to `output_root`, and a random seed is passed
    via "-s".
    """
    self._samples = samples
    self._input_file = input_file
    self._k_groups = k_groups

    group_key = "Group(%i)" % (self._k_groups,)
    # Deliberately not coerced to bool; `samples` itself is stored when it
    # is falsy, exactly as the short-circuiting `and` yields it.
    self._supervised = samples and any(
        (row[group_key] != '-') for row in samples.itervalues())

    assert k_groups in (2, 3), k_groups

    prefix = os.path.splitext(os.path.basename(input_file))[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    file_kwargs = {
        "IN_FILE_BED": input_file,
        "IN_FILE_BIM": fileutils.swap_ext(input_file, ".bim"),
        "IN_FILE_FAM": fileutils.swap_ext(input_file, ".fam"),
        "TEMP_OUT_FILE_BED": prefix + ".bed",
        "TEMP_OUT_FILE_BIM": prefix + ".bim",
        "TEMP_OUT_FILE_FAM": prefix + ".fam",
        "TEMP_OUT_FILE_POP": prefix + ".pop",
        "OUT_P": output_prefix + ".P",
        "OUT_Q": output_prefix + ".Q",
        "OUT_STDOUT": output_prefix + ".log",
        "CHECK_VERSION": ADMIXTURE_VERSION,
    }
    builder = AtomicCmdBuilder("admixture", set_cwd=True, **file_kwargs)

    builder.set_option("-s", random.randint(0, 2 ** 16 - 1))
    if self._supervised:
        builder.set_option("--supervised")

    builder.add_value("%(TEMP_OUT_FILE_BED)s")
    builder.add_value(int(k_groups))

    CommandNode.__init__(self,
                         description="<Admixture -> '%s.*''>"
                         % (output_prefix,),
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, input_file, k_groups, output_root, samples=None,
             dependencies=()):
    """Set up an "admixture" run for `input_file` and `k_groups` groups.

    Only 2 or 3 groups are accepted. "--supervised" is added when any row
    in `samples` carries a group other than '-' under the
    "Group(<k_groups>)" key. Outputs are named
    "<output_root>/<input basename>.<k_groups>" plus ".P", ".Q" and
    ".log".
    """
    self._samples = samples
    self._input_file = input_file
    self._k_groups = k_groups

    group_key = "Group(%i)" % (self._k_groups,)
    # The result of the `and` is kept as-is (it is `samples` itself when
    # that is falsy), since only its truthiness is tested below.
    self._supervised = samples and any(
        (row[group_key] != '-') for row in samples.itervalues())

    assert k_groups in (2, 3), k_groups

    input_name = os.path.basename(input_file)
    prefix = os.path.splitext(input_name)[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    admixture = AtomicCmdBuilder(
        "admixture",
        IN_FILE_BED=input_file,
        IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
        IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
        TEMP_OUT_FILE_BED=prefix + ".bed",
        TEMP_OUT_FILE_BIM=prefix + ".bim",
        TEMP_OUT_FILE_FAM=prefix + ".fam",
        TEMP_OUT_FILE_POP=prefix + ".pop",
        OUT_P=output_prefix + ".P",
        OUT_Q=output_prefix + ".Q",
        OUT_STDOUT=output_prefix + ".log",
        CHECK_VERSION=ADMIXTURE_VERSION,
        set_cwd=True)

    seed = random.randint(0, 2 ** 16 - 1)
    admixture.set_option("-s", seed)
    if self._supervised:
        admixture.set_option("--supervised")

    admixture.add_value("%(TEMP_OUT_FILE_BED)s")
    admixture.add_value(int(k_groups))

    description = "<Admixture -> '%s.*''>" % (output_prefix,)
    CommandNode.__init__(self,
                         description=description,
                         command=admixture.finalize(),
                         dependencies=dependencies)
def _teardown(self, config, temp):
    """Delete the input pipe/link and its index, then run base teardown.

    `temp/PIPE_FILE` is always removed; the index file derived from it
    is removed only when `self._index_format` is set.
    """
    leftovers = [self.PIPE_FILE]
    if self._index_format:
        leftovers.append(swap_ext(self.PIPE_FILE, self._index_format))

    for filename in leftovers:
        os.remove(os.path.join(temp, filename))

    CommandNode._teardown(self, config, temp)
def _set_input_files(command, input_files):
    """Add one "-I" option per input BAM and register the files.

    For the file at position N, the option value refers to the key
    IN_BAMFILE_NN; the ".bai" file next to it is registered as
    IN_BAIFILE_NN. All keys are passed to `command.set_kwargs` in a
    single call.
    """
    file_kwargs = {}
    for index, bam_path in enumerate(input_files):
        bam_key = "IN_BAMFILE_%02i" % index
        bai_key = "IN_BAIFILE_%02i" % index

        command.add_option("-I", "%%(%s)s" % (bam_key,))
        file_kwargs[bam_key] = bam_path
        file_kwargs[bai_key] = swap_ext(bam_path, ".bai")

    command.set_kwargs(**file_kwargs)
def _build_examl_bootstraps(options, phylo, destination, input_alignment,
                            input_partition, dependencies):
    """Build the ExaML node chain for each requested bootstrap.

    phylo["ExaML"]["Bootstraps"] gives the number of replicates. For each
    one, a bootstrap alignment is generated with a random seed, parsed
    to a ".binary" file, and passed to `_examl_nodes`. Files are placed
    in "<destination>/bootstraps". Returns the value of
    `_build_rerooted_trees` for the collected nodes, or None if there
    are none.
    """
    replicate_count = phylo["ExaML"]["Bootstraps"]
    alignment_template = os.path.join(destination, "bootstraps",
                                      "bootstrap.%04i.phy")

    def _build_replicate(alignment):
        """Build the resample -> parse -> ExaML chain for one alignment."""
        resample_node = PHYLIPBootstrapNode(
            input_alignment=input_alignment,
            input_partition=input_partition,
            output_alignment=alignment,
            seed=random.randint(1, 2**32 - 1),
            dependencies=dependencies,
        )

        binary = swap_ext(alignment, ".binary")
        # The literal "%s" is kept in the name; it is passed on as a template
        final_template = swap_ext(alignment, ".%s")

        parser_node = ExaMLParserNode(
            input_alignment=alignment,
            input_partition=input_partition,
            output_file=binary,
            dependencies=resample_node,
        )

        return _examl_nodes(
            options=options,
            settings=phylo,
            input_alignment=alignment,
            input_partitions=input_partition,
            input_binary=binary,
            output_template=final_template,
            dependencies=parser_node,
        )

    examl_nodes = [
        _build_replicate(alignment_template % (replicate,))
        for replicate in range(replicate_count)
    ]

    if not examl_nodes:
        return None

    return _build_rerooted_trees(examl_nodes, phylo["RootTreesOn"])
def customize(cls, config, input_bam, output_log=None, dependencies=()):
    """Assemble the Picard ValidateSamFile command for `input_bam`.

    The log goes to `output_log`, or to `input_bam` with its extension
    swapped to ".validated" when no log file is given. Returns a dict
    holding the command builder ("command") and the "dependencies".
    """
    if not output_log:
        output_log = swap_ext(input_bam, ".validated")

    picard = picard_command(config, "ValidateSamFile")
    picard.set_option("I", "%(IN_BAM)s", sep="=")
    picard.set_kwargs(IN_BAM=input_bam, OUT_STDOUT=output_log)

    return {"command": picard, "dependencies": dependencies}
def customize(cls, config, reference, dependencies=()):
    """Assemble the Picard CreateSequenceDictionary command.

    The command reads a temporary file named after the base name of
    `reference` and writes `reference` with its extension swapped to
    ".dict". Returns a dict holding the command builder ("command") and
    the "dependencies".
    """
    picard = picard_command(config, "CreateSequenceDictionary")
    picard.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
    picard.set_option("O", "%(OUT_DICT)s", sep="=")

    file_kwargs = {
        "IN_REF": reference,
        "TEMP_OUT_REF": os.path.basename(reference),
        "OUT_DICT": swap_ext(reference, ".dict"),
    }
    picard.set_kwargs(**file_kwargs)

    return {"command": picard, "dependencies": dependencies}
def _setup(self, config, temp_root):
    """Create the BAM input pipe or symlink inside `temp_root`.

    With more than one input file a named pipe is created. With exactly
    one, that file is symlinked (resolved against the current working
    directory) and, if the input is marked as indexed, so is its ".bai"
    file.
    """
    CommandNode._setup(self, config, temp_root)

    bam_input = self._bam_input
    link_path = os.path.join(temp_root, bam_input.pipe)

    if len(bam_input.files) > 1:
        os.mkfifo(link_path)
        return

    # Fails unless there is exactly one input file
    (bam_path,) = bam_input.files
    os.symlink(os.path.join(os.getcwd(), bam_path), link_path)

    if bam_input.indexed:
        index_path = os.path.join(os.getcwd(), swap_ext(bam_path, ".bai"))
        os.symlink(index_path, link_path + ".bai")
def __init__(self, input_file, k_groups, output_root, groups,
             dependencies=()):
    """Create a node running supervised "admixture" on `input_file`.

    `groups` and `input_file` are stored on the node. The .P, .Q and .log
    outputs are named "<output_root>/<input basename>.<k_groups>", a
    random seed is passed via "-s", and "--supervised" is always set.
    """
    self._groups = groups
    self._input_file = input_file

    prefix = os.path.splitext(os.path.basename(input_file))[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    file_kwargs = {
        "IN_FILE_BED": input_file,
        "IN_FILE_BIM": fileutils.swap_ext(input_file, ".bim"),
        "IN_FILE_FAM": fileutils.swap_ext(input_file, ".fam"),
        "TEMP_OUT_FILE_BED": prefix + ".bed",
        "TEMP_OUT_FILE_BIM": prefix + ".bim",
        "TEMP_OUT_FILE_FAM": prefix + ".fam",
        "TEMP_OUT_FILE_POP": prefix + ".pop",
        "OUT_P": output_prefix + ".P",
        "OUT_Q": output_prefix + ".Q",
        "OUT_STDOUT": output_prefix + ".log",
        "CHECK_VERSION": ADMIXTURE_VERSION,
    }
    builder = AtomicCmdBuilder("admixture", set_cwd=True, **file_kwargs)

    builder.set_option("-s", random.randint(0, 2**16 - 1))
    builder.set_option("--supervised")

    builder.add_value("%(TEMP_OUT_FILE_BED)s")
    builder.add_value(int(k_groups))

    CommandNode.__init__(
        self,
        description="<Admixture -> '%s.*''>" % (output_prefix,),
        command=builder.finalize(),
        dependencies=dependencies,
    )
def customize(cls, config, input_bams, output_bam, dependencies=()):
    """Assemble the Picard MergeSamFiles command for `input_bams`.

    The merged BAM is written to `output_bam` with CREATE_INDEX enabled;
    the ".bai" file next to it is registered as an output. Returns a
    dict holding the command builder ("command") and the "dependencies".
    """
    picard = picard_command(config, "MergeSamFiles")

    picard.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    picard.set_option("CREATE_INDEX", "True", sep="=")
    picard.set_option("SO", "coordinate", sep="=", fixed=False)
    picard.add_multiple_options("I", input_bams, sep="=")

    output_index = swap_ext(output_bam, ".bai")
    picard.set_kwargs(OUT_BAM=output_bam, OUT_BAI=output_index)

    return {"command": picard, "dependencies": dependencies}
def customize(cls, config, input_bams, output_bam, output_metrics=None,
              keep_dupes=False, dependencies=()):
    """Assemble the Picard MarkDuplicates command, with index creation.

    Writes `output_bam` plus a ".bai" index, and a metrics file that
    defaults to `output_bam` with its extension swapped to ".metrics".
    Unless `keep_dupes` is set, REMOVE_DUPLICATES is enabled. Returns a
    dict holding the command builder ("command") and the "dependencies".
    """
    picard = picard_command(config, "MarkDuplicates")

    # Create .bai index, since it is required by a lot of other programs
    picard.set_option("CREATE_INDEX", "True", sep="=")

    picard.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    picard.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
    picard.add_multiple_options("I", input_bams, sep="=")

    if not keep_dupes:
        # Remove duplicates from output by default to save disk-space
        picard.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    if not output_metrics:
        output_metrics = swap_ext(output_bam, ".metrics")

    output_index = swap_ext(output_bam, ".bai")
    picard.set_kwargs(OUT_BAM=output_bam,
                      OUT_BAI=output_index,
                      OUT_METRICS=output_metrics)

    return {"command": picard, "dependencies": dependencies}
def _validate_mito_bam(data, handle, info):
    """Check a BAM against the mitochondrial database and fill in `info`.

    Only the first BAM contig that is also present in `data.mitochondria`
    is considered. Returns False if its length differs from the database
    sequence (gaps excluded). Otherwise returns True; `info.mt_contig`,
    `info.mt_length` and `info.mt_padding` are set unless idxstats reports
    zero hits for that contig. The BAM is indexed via pysam if no ".bai"
    file is found.
    """
    mito_db = data.mitochondria
    if mito_db is None:
        # No mitochondrial data .. skip phylogeny
        return True

    contig_names = handle.references
    shortest = min(len(record.sequence) for record in mito_db.itervalues())

    for bam_contig, bam_length in zip(contig_names, handle.lengths):
        if bam_contig not in mito_db:
            continue

        db_sequence = mito_db[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (bam_contig, bam_length, db_length))
            return False

        # The index may be named either "x.bam.bai" or "x.bai"
        indexed = os.path.exists(handle.filename + '.bai') \
            or os.path.exists(swap_ext(handle.filename, '.bai'))
        if not indexed:
            print_info(' - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats = "".join(pysam.idxstats(handle.filename))
        for row in idxstats.split('\n'):
            row = row.strip()
            if not row:
                continue

            name, _, hits, _ = row.split('\t')
            if (name == bam_contig) and not int(hits):
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring an "
                          "phylogeny is not possible."
                          % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - shortest

        return True

    return True
def _validate_mito_bam(data, handle, info):
    """Validate the mitochondrial contig of a BAM and record it in `info`.

    The first BAM contig found in `data.mitochondria` is compared to the
    database sequence; a length mismatch (ignoring "-" characters in the
    database sequence) yields False. In all other cases True is returned,
    and the `mt_contig`, `mt_length` and `mt_padding` attributes of
    `info` are set unless the contig has zero hits according to idxstats.
    A missing ".bai" index is created with pysam first.
    """
    if data.mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    references = handle.references
    sequence_lengths = (len(record.sequence)
                        for record in data.mitochondria.itervalues())
    min_length = min(sequence_lengths)

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in data.mitochondria:
            continue

        db_sequence = data.mitochondria[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (bam_contig, bam_length, db_length))
            return False

        # The index may be named either "x.bam.bai" or "x.bai"
        has_index = (os.path.exists(handle.filename + '.bai')
                     or os.path.exists(swap_ext(handle.filename, '.bai')))
        if not has_index:
            print_info(' - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        for stats_line in "".join(pysam.idxstats(handle.filename)).split('\n'):
            stats_line = stats_line.strip()
            if stats_line:
                name, _, hits, _ = stats_line.split('\t')
                if (name == bam_contig) and not int(hits):
                    print_err("WARNING: Mitochondrial BAM (%r) does not "
                              "contain "
                              "any reads aligned to contig %r; inferring an "
                              "phylogeny is not possible."
                              % (handle.filename, name))
                    return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True

    return True
def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
    """Build the BWA backtrack nodes for single-end reads.

    "input_file" and "output_file" are popped from `parameters`. An 'aln'
    node writing a ".sai" file next to the output BAM is built, a
    BWASamse command is customized to depend on it, and the result is
    handed to `_finalize_nodes`, whose return value is returned. `record`
    is not used here.
    """
    fastq_path = parameters.pop("input_file")
    bam_path = parameters.pop("output_file")
    sai_path = swap_ext(bam_path, ".sai")

    aln_node = self._build_bwa_backtrack_aln(
        parameters=parameters,
        input_file=fastq_path,
        output_file=sai_path,
    )

    samse = BWASamse.customize(
        input_file_fq=fastq_path,
        input_file_sai=sai_path,
        output_file=bam_path,
        prefix=parameters["prefix"],
        reference=parameters["reference"],
        dependencies=aln_node,
    )

    return self._finalize_nodes(config, prefix, parameters, samse)
def __init__(self, config, reference, dependencies=()):
    """Create a node running Picard CreateSequenceDictionary.

    The absolute path of `reference` is stored on the node. The command
    reads a temporary file named after the base name of `reference` and
    writes `reference` with its extension swapped to ".dict".
    """
    self._in_reference = os.path.abspath(reference)

    picard = picard_command(config, "CreateSequenceDictionary")
    picard.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
    picard.set_option("O", "%(OUT_DICT)s", sep="=")

    file_kwargs = {
        "IN_REFERENCE": reference,
        "TEMP_OUT_REF": os.path.basename(reference),
        "OUT_DICT": swap_ext(reference, ".dict"),
    }
    picard.set_kwargs(**file_kwargs)

    PicardNode.__init__(self,
                        command=picard.finalize(),
                        description="<SequenceDictionary: '%s'>"
                        % (reference,),
                        dependencies=dependencies)
def __init__(self, config, reference, dependencies=()):
    """Set up a Picard CreateSequenceDictionary run for `reference`.

    Stores the absolute reference path in `self._in_reference`. The
    dictionary is written next to the reference, with the extension
    swapped to ".dict"; the command itself reads a temporary file named
    after the reference's base name.
    """
    self._in_reference = os.path.abspath(reference)

    temp_reference = os.path.basename(reference)
    dictionary = swap_ext(reference, ".dict")

    picard = picard_command(config, "CreateSequenceDictionary")
    picard.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
    picard.set_option("O", "%(OUT_DICT)s", sep="=")
    picard.set_kwargs(IN_REFERENCE=reference,
                      TEMP_OUT_REF=temp_reference,
                      OUT_DICT=dictionary)

    label = "<SequenceDictionary: '%s'>" % (reference,)
    PicardNode.__init__(self,
                        command=picard.finalize(),
                        description=label,
                        dependencies=dependencies)
def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
    """Build the BWA backtrack nodes for single-end reads.

    "input_file" and "output_file" are popped from `parameters`. An 'aln'
    node writing a ".sai" file next to the output BAM is built, and the
    returned BWASamse node depends on it. `config`, `prefix` and `record`
    are not used here.
    """
    fastq_path = parameters.pop("input_file")
    bam_path = parameters.pop("output_file")
    sai_path = swap_ext(bam_path, ".sai")

    aln_node = self._build_bwa_backtrack_aln(
        parameters=parameters,
        input_file=fastq_path,
        output_file=sai_path,
    )

    samse_node = BWASamse(
        input_file_fq=fastq_path,
        input_file_sai=sai_path,
        output_file=bam_path,
        prefix=parameters["prefix"],
        reference=parameters["reference"],
        mapping_options=self.options["Aligners"]["BWA"],
        cleanup_options=self._cleanup_options("BWA"),
        dependencies=aln_node,
    )

    return samse_node
def __init__(self, config, input_bam, input_index=None, output_log=None,
             ignored_checks=(), dependencies=()):
    """Create a node running Picard ValidateSamFile on `input_bam`.

    Each entry in `ignored_checks` is passed as an IGNORE option. The
    log is written to `output_log`, defaulting to `input_bam` with the
    extension swapped to ".validated".
    """
    picard = picard_command(config, "ValidateSamFile")
    _set_max_open_files(picard, "MAX_OPEN_TEMP_FILES")

    picard.set_option("I", "%(IN_BAM)s", sep="=")
    for ignored in ignored_checks:
        picard.add_option("IGNORE", ignored, sep="=")

    if not output_log:
        output_log = swap_ext(input_bam, ".validated")

    picard.set_kwargs(IN_BAM=input_bam,
                      IN_INDEX=input_index,
                      OUT_STDOUT=output_log)

    PicardNode.__init__(self,
                        command=picard.finalize(),
                        description="<Validate BAM: '%s'>" % (input_bam,),
                        dependencies=dependencies)
def _build_coverage_nodes_cached(files_and_nodes, target_name, roi_name,
                                 roi_filename, cache):
    """Return coverage nodes for each input file, reusing cached nodes.

    `files_and_nodes` maps input filenames to the nodes they depend on.
    A CoverageNode is created once per (roi_filename, input filename)
    pair and stored in `cache`. The result maps each output filename
    (input with the extension swapped to ".coverage", or
    ".<roi_name>.coverage" when `roi_name` is set) to its node.
    """
    extension = (".%s.coverage" % roi_name) if roi_name else ".coverage"

    nodes_by_output = {}
    for (source, dependency) in files_and_nodes.iteritems():
        destination = swap_ext(source, extension)

        key = (roi_filename, source)
        if key not in cache:
            cache[key] = CoverageNode(input_file=source,
                                      output_file=destination,
                                      target_name=target_name,
                                      regions_file=roi_filename,
                                      dependencies=dependency)

        nodes_by_output[destination] = cache[key]

    return nodes_by_output
def _build_coverage_nodes_cached(files_and_nodes, target_name, roi_name,
                                 roi_filename, cache):
    """Build (or fetch from `cache`) one CoverageNode per input file.

    Nodes are cached under the key (roi_filename, input filename), so
    repeated requests for the same pair return the same node. Returns a
    dict mapping output filenames to nodes; the output extension is
    ".coverage", or ".<roi_name>.coverage" when `roi_name` is truthy.
    """
    if roi_name:
        coverage_ext = ".%s.coverage" % roi_name
    else:
        coverage_ext = ".coverage"

    result = {}
    for (bam_path, upstream) in files_and_nodes.iteritems():
        coverage_path = swap_ext(bam_path, coverage_ext)
        lookup = (roi_filename, bam_path)

        if lookup not in cache:
            node_kwargs = {
                "input_file": bam_path,
                "output_file": coverage_path,
                "target_name": target_name,
                "regions_file": roi_filename,
                "dependencies": upstream,
            }
            cache[lookup] = CoverageNode(**node_kwargs)

        result[coverage_path] = cache[lookup]

    return result
def __init__(self, infile, dependencies=()):
    """Set up the commands needed to create a .bai index for `infile`.

    Three commands are run in sequence: the BAM is symlinked under its
    basename, 'samtools index' is run on that link, and the resulting
    '<basename>.bai' is moved to the name of `infile` with a '.bai'
    extension.
    """
    link_name = os.path.basename(infile)

    steps = (
        AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                  IN_BAM=infile,
                  TEMP_OUT_BAM=link_name,
                  set_cwd=True),
        AtomicCmd(["samtools", "index", "%(TEMP_IN_BAM)s"],
                  TEMP_IN_BAM=link_name,
                  CHECK_SAM=SAMTOOLS_VERSION),
        AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                  TEMP_IN_BAM=link_name + ".bai",
                  OUT_BAM=swap_ext(infile, ".bai")),
    )
    sequence = SequentialCmds(steps)

    CommandNode.__init__(
        self,
        description="<BAMIndex: '%s'>" % (infile,),
        command=sequence,
        dependencies=dependencies,
    )
def customize(cls, config, input_bams, output_bam, output_metrics=None,
              keep_dupes=False, dependencies=()):
    """Prepare a Picard 'MarkDuplicates' command for the given BAM files.

    Parameters:
        config: Passed on to `picard_command` when creating the command.
        input_bams: Input BAM files, each added as an 'I' option.
        output_bam: Destination BAM file.
        output_metrics: Destination of the metrics file; when falsy, the
            name of `output_bam` with a '.metrics' extension is used.
        keep_dupes: If false (the default), REMOVE_DUPLICATES is enabled.
        dependencies: Returned unchanged alongside the command.

    Returns a dict with the keys "command" and "dependencies".
    """
    command = picard_command(config, "MarkDuplicates")
    _set_max_open_files(command, "MAX_FILE_HANDLES")

    fixed_options = (
        ("OUTPUT", "%(OUT_BAM)s"),
        ("METRICS_FILE", "%(OUT_METRICS)s"),
        # Validation is mostly left to manual ValidateSamFile runs; required
        # because .csi indexed BAM records can have "invalid" bins.
        ("VALIDATION_STRINGENCY", "LENIENT"),
    )
    for key, value in fixed_options:
        command.set_option(key, value, sep="=")

    command.add_multiple_options("I", input_bams, sep="=")

    if not keep_dupes:
        # Duplicates are dropped from the output by default to save disk-space
        command.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    if not output_metrics:
        output_metrics = swap_ext(output_bam, ".metrics")
    command.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

    return {"command": command, "dependencies": dependencies}
def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    """Build the nodes that produce an indexed FASTA for a sample by sampling.

    The chain is: pileup-only genotyping of the sample's BAM, tabix indexing
    of the pileup, SampleRegionsNode to produce the FASTA listed under
    regions["Genotypes"][sample], and finally a FASTA index.

    Returns a one-element tuple containing the FASTA index node.
    """
    fasta_path = regions["Genotypes"][sample]
    pileup_path = swap_ext(fasta_path, ".pileup.bgz")

    slop, node = build_regions_nodes(regions, genotyping["Padding"],
                                     dependencies)

    bam_name = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_path = os.path.join(options.samples_root, bam_name)
    if regions["Realigned"]:
        bam_path = add_postfix(bam_path, ".realigned")

    bai_node = build_bam_index_node(bam_path)

    # Pileup-only genotyping; samtools options from the makefile are applied
    # to the command before the node itself is built.
    pileup_builder = GenotypeRegionsNode.customize(
        pileup_only=True,
        reference=regions["FASTA"],
        bedfile=slop,
        infile=bam_path,
        outfile=pileup_path,
        nbatches=options.samtools_max_threads,
        dependencies=node + (bai_node,),
    )
    apply_samtools_options(pileup_builder.command,
                           genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    tabix_node = TabixIndexNode(infile=pileup_path,
                                preset="pileup",
                                dependencies=pileup_node)

    sampling_node = SampleRegionsNode(infile=pileup_path,
                                      bedfile=regions["BED"],
                                      outfile=fasta_path,
                                      dependencies=tabix_node)

    faidx_node = FastaIndexNode(infile=fasta_path,
                                dependencies=sampling_node)

    return (faidx_node,)
def __init__(self, config, target_name, input_files, output_file, prefix,
             regions_file=None, dependencies=()):
    """Set up the 'depths' command for one or more input BAM files.

    Parameters:
        config: Forwarded to MultiBAMInputNode.
        target_name: Value of the '--target-name' option.
        input_files: One or more BAM files; coerced to a tuple.
        output_file: Destination of the resulting table.
        prefix: Mapping that provides 'IndexFormat'; only consulted when a
            regions file is given.
        regions_file: Optional file passed via '--regions-file'.
        dependencies: Forwarded to MultiBAMInputNode.
    """
    input_files = safe_coerce_to_tuple(input_files)
    # Same value as `regions_file and prefix['IndexFormat']`: the (falsy)
    # regions_file itself when no regions are given.
    index_format = prefix['IndexFormat'] if regions_file else regions_file

    command = factory.new("depths")
    command.add_value("%(TEMP_IN_BAM)s")
    command.add_value("%(OUT_FILE)s")
    command.set_option("--target-name", target_name)
    command.set_kwargs(OUT_FILE=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    command.add_multiple_kwargs(input_files)

    if regions_file:
        piped_index = swap_ext(MultiBAMInputNode.PIPE_FILE, index_format)

        command.set_option('--regions-file', '%(IN_REGIONS)s')
        command.set_kwargs(IN_REGIONS=regions_file,
                           TEMP_IN_INDEX=piped_index)

    label = "<DepthHistogram: %s -> '%s'>" % (
        describe_files(input_files),
        output_file,
    )

    MultiBAMInputNode.__init__(
        self,
        config=config,
        input_bams=input_files,
        index_format=index_format,
        command=command.finalize(),
        description=label,
        dependencies=dependencies,
    )
def test_swap_ext__empty_ext_vs_new_ext():
    """A name without an extension gains the new one, with a dot added."""
    observed = swap_ext("name", "bar")
    expected = "name.bar"
    assert_equal(observed, expected)
def build_genotyping_nodes_cached(options, genotyping, sample, regions,
                                  dependencies):
    """Carries out genotyping, filtering of calls, and indexing of files for
    a given sample and prefix. If the option 'GenotypeEntirePrefix' is
    enabled, the BAM is genotyped once, and each set of RegionsOfInterest
    simply extracts the relevant regions during construction of the
    consensus sequence.

    Parameters:
        options: An options object (c.f. paleomix.tools.phylo_pipeline.config).
        genotyping: Genotyping options defined for a specific set of areas of
                    interest, corresponding to Genotyping:NAME in the makefile.
        sample: The name of the sample to be genotyped.
        regions: A dictionary for a 'RegionsOfInterest' from the makefile.
        dependencies: Dependencies that must be met before genotyping starts.

    Returns a tuple containing the filename of the filtered and tabix-indexed
    VCF file, and the top-level node generating this file. Multiple calls for
    the same BAM and prefix will return the same VCF and nodes if the option
    for 'GenotypeEntirePrefix' is enabled, otherwise each ROI is genotyped
    individually.

    Output files are generated in ./results/PROJECT/genotyping. If the option
    for 'GenotypeEntirePrefix' is enabled, the following files are generated:
        SAMPLE.PREFIX.vcf.bgz: Unfiltered calls for variant/non-variant sites.
        SAMPLE.PREFIX.vcf.pileup.bgz: Pileup of sites containing SNPs.
        SAMPLE.PREFIX.vcf.pileup.bgz.tbi: Tabix index of the pileup.
        SAMPLE.PREFIX.filtered.vcf.bgz: Variant calls filtered with vcf_filter.
        SAMPLE.PREFIX.filtered.vcf.bgz.tbi: Tabix index for the filtered VCF.

    If 'GenotypeEntirePrefix' is not enabled for a given ROI, the following
    files are generated for that ROI (see descriptions above):
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz.tbi
        SAMPLE.PREFIX.ROI.vcf.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz.tbi

    In addition, the following files are generated for each set of
    RegionsOfInterest (ROI), regardless of the 'GenotypeEntirePrefix' option:
        SAMPLE.PREFIX.ROI.CDS.fasta: FASTA sequence of each feature in the ROI.
        SAMPLE.PREFIX.ROI.CDS.fasta.fai: FASTA index generated using SAMTools.
    """
    output_prefix, bamfile, bedfile, dependencies = \
        build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                       dependencies)

    # Results are memoized per (BAM, output prefix) pair
    cache_key = (bamfile, output_prefix)
    if cache_key in _VCF_CACHE:
        return _VCF_CACHE[cache_key]

    calls_path = swap_ext(output_prefix, ".vcf.bgz")
    pileups_path = swap_ext(output_prefix, ".vcf.pileup.bgz")
    filtered_path = swap_ext(output_prefix, ".filtered.vcf.bgz")

    # 1. Call samtools mpileup | bcftools view on the bam
    genotype_builder = GenotypeRegionsNode.customize(
        reference=regions["FASTA"],
        bedfile=bedfile,
        infile=bamfile,
        outfile=calls_path,
        nbatches=options.samtools_max_threads,
        dependencies=dependencies,
    )
    apply_samtools_options(genotype_builder.command,
                           genotyping["MPileup"],
                           "--mpileup-argument")
    apply_samtools_options(genotype_builder.command,
                           genotyping["BCFTools"],
                           "--bcftools-argument")
    genotype_node = genotype_builder.build_node()

    # 2. Collect pileups of sites with SNPs, to allow proper filtering by
    #    frequency of the minor allele, as only the major non-ref allele is
    #    counted in the VCF (c.f. field DP4).
    pileup_builder = VCFPileupNode.customize(
        reference=regions["FASTA"],
        infile_bam=bamfile,
        infile_vcf=calls_path,
        outfile=pileups_path,
        dependencies=genotype_node,
    )
    apply_samtools_options(pileup_builder.command,
                           genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    pileup_index_node = TabixIndexNode(infile=pileups_path,
                                       preset="pileup",
                                       dependencies=pileup_node)

    # 3. Filter all sites using the 'vcf_filter' command
    filter_builder = VCFFilterNode.customize(infile=calls_path,
                                             pileup=pileups_path,
                                             outfile=filtered_path,
                                             regions=regions,
                                             dependencies=pileup_index_node)
    filter_node = _apply_vcf_filter_options(filter_builder, genotyping,
                                            sample)

    # 4. Tabix index. This allows random-access to the VCF file when building
    #    the consensus FASTA sequence later in the pipeline.
    vcf_index_node = TabixIndexNode(infile=filtered_path,
                                    preset="vcf",
                                    dependencies=filter_node)

    result = (filtered_path, vcf_index_node)
    _VCF_CACHE[cache_key] = result
    return result
def finalize_run_config(parser, args):
    """Validate the parsed command-line arguments and fill in derived values.

    Parameters:
        parser: The argument parser; only used to print the usage message
            when the number of positional files is invalid.
        args: The parsed arguments. Reads `command`, `files`, `database` and
            `treemix_outgroup`; sets `multisample`, `destination` and
            (for BAM input) `samples`.

    Returns `args` on success, or None if validation failed (in which case
    the usage has been printed or an error has been logged).

    Raises RuntimeError if the number of files is neither 1, 2 nor 3 once the
    initial check has been passed.
    """
    log = logging.getLogger(__name__)

    if args.command in ("run", "dryrun") and not (1 <= len(args.files) <= 3):
        parser.print_usage()
        return None

    args.multisample = False

    known_samples = set(args.database.samples) | set(("Sample", ))
    unknown_samples = set(args.treemix_outgroup) - known_samples
    if unknown_samples:
        # NOTE: The comma following the format string is essential; without
        # it the adjacent literals are concatenated and the message ends up
        # being used as the separator of `str.join`.
        log.error(
            "Argument --treemix-outgroup includes unknown sample(s): %s; known "
            "samples are %s. Note that names are case-sensitive.",
            ", ".join(map(repr, sorted(unknown_samples))),
            ", ".join(map(repr, sorted(known_samples))),
        )
        return None

    # A single file implies a destination named after that file
    if len(args.files) == 1:
        args.files.append(fileutils.swap_ext(args.files[0], ".zonkey"))

    if len(args.files) == 2:
        filename, args.destination = args.files

        if os.path.exists(args.destination) \
                and not os.path.isdir(args.destination):
            log.error("Destination %r is not a directory", args.destination)
            return None
        elif not os.path.isfile(filename):
            log.error("Not a valid filename: %r", filename)
            return None
        elif _is_bamfile(filename):
            args.samples = {
                "-": {
                    "Root": args.destination,
                    "Files": [filename],
                }
            }
        else:
            # Anything that is not a BAM file is read as a table of samples
            args.multisample = True
            if not _read_sample_table(args, filename):
                return None
    elif len(args.files) == 3:
        filename_1, filename_2, args.destination = args.files
        args.samples = {
            "-": {
                "Root": args.destination,
                "Files": [filename_1, filename_2],
            }
        }
    else:
        raise RuntimeError("Unexpected number of arguments: %r"
                           % (args.files, ))

    # Identify (mito or nuc?) and validate BAM files provided by user
    if not _process_samples(args):
        return None

    return args
def test_swap_ext__dot_ext_vs_new_dot_ext():
    """A dotted extension is appended as-is to a name lacking an extension."""
    observed = swap_ext("name", ".bar")
    expected = "name.bar"
    assert_equal(observed, expected)
"names are case-sensitive." % (", ".join(map(repr, sorted(unknown_samples))), ", ".join( map(repr, sorted(known_samples))))) return if config.command in ("mito", "example"): if len(args) != 2: print_err("ERROR: Wrong number of arguments!") print_usage() return config.destination = args[1] config.samples = {} elif len(args) == 2: filename = args[1] config.destination = fileutils.swap_ext(filename, ".zonkey") if not os.path.isfile(filename): print_err("ERROR: Not a valid filename: %r" % (filename, )) return elif _is_bamfile(filename): # Called as either of # zonkey run <SampleDB> <nuclear.bam> # zonkey run <SampleDB> <mitochondrial.bam> config.samples = { "-": { "Root": config.destination, "Files": [filename] } } else:
def parse_arguments(argv, ext):
    """Parse the command-line arguments of a BAM statistics tool.

    Parameters:
        argv: List of command-line arguments (excluding the program name).
        ext: Extension of the output table (e.g. '.coverage'); also used,
            with dots stripped, as the name of the tool in help messages.

    Returns the argparse namespace, with `outfile` and `target_name` filled
    in when not specified, and with `get_readgroup_func` set according to
    --ignore-readgroups. Exits via `parser.error` if the output file exists
    and --overwrite-output was not given.
    """
    prog = "paleomix %s" % (ext.strip("."),)
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument("infile", metavar="BAM",
                        help="Filename of a sorted BAM file. If set to '-' "
                             "the file is read from STDIN.")
    parser.add_argument("outfile", metavar="OUTPUT", nargs='?',
                        help="Filename of output table; defaults to name of "
                             "the input BAM with a '%s' extension. If "
                             "set to '-' the table is printed to STDOUT."
                             % (ext,))
    parser.add_argument("--target-name", default=None, metavar="NAME",
                        help="Name used for 'Target' column; defaults to the "
                             "filename of the BAM file.")
    parser.add_argument("--regions-file", default=None, dest="regions_fpath",
                        help="BED file containing regions of interest; %s "
                             "is calculated only for these grouping by the "
                             "name used in the BED file, or the contig name "
                             "if no name has been specified for a record."
                             % (ext.strip("."),))
    parser.add_argument('--max-contigs', default=100, type=int,
                        help="The maximum number of contigs allowed in a BAM "
                             "file. If this number is exceeded, the entire "
                             "set of contigs is aggregated into one pseudo-"
                             "contig named '<Genome>'. This is done to "
                             "limit table sizes [default: %(default)s]")
    parser.add_argument('--ignore-readgroups',
                        default=False, action="store_true",
                        help="Ignore readgroup information in reads, and only "
                             "provide aggregated statistics; this is required "
                             "if readgroup information is missing or partial "
                             "[default: %(default)s]")
    # Help text previously read "if it it exists"
    parser.add_argument('--overwrite-output',
                        default=False, action="store_true",
                        help="Overwrite output file if it exists; by "
                             "default, the script will terminate if the file "
                             "already exists.")

    args = parser.parse_args(argv)

    # The output table defaults to the input filename with the tool extension
    if not args.outfile:
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    if os.path.exists(args.outfile) and not args.overwrite_output:
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file."
                     % (args.outfile,))

    return args
def parse_arguments(argv, ext):
    """Parse the command-line arguments of a BAM statistics tool.

    Parameters:
        argv: List of command-line arguments (excluding the program name).
        ext: Extension of the output table (e.g. '.coverage'); also used,
            with dots stripped, as the name of the tool in help messages.

    Returns the argparse namespace, with `outfile` and `target_name` filled
    in when not specified, and with `get_readgroup_func` set according to
    --ignore-readgroups. Exits via `parser.error` if the output file exists
    and --overwrite-output was not given.
    """
    prog = "paleomix %s" % (ext.strip("."), )
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument(
        "infile",
        metavar="BAM",
        help="Filename of a sorted BAM file. If set to '-' "
        "the file is read from STDIN.",
    )
    parser.add_argument(
        "outfile",
        metavar="OUTPUT",
        nargs="?",
        help="Filename of output table; defaults to name of "
        "the input BAM with a '%s' extension. If "
        "set to '-' the table is printed to STDOUT." % (ext, ),
    )
    parser.add_argument(
        "--target-name",
        default=None,
        metavar="NAME",
        help="Name used for 'Target' column; defaults to the "
        "filename of the BAM file.",
    )
    parser.add_argument(
        "--regions-file",
        default=None,
        dest="regions_fpath",
        help="BED file containing regions of interest; %s "
        "is calculated only for these grouping by the "
        "name used in the BED file, or the contig name "
        "if no name has been specified for a record." % (ext.strip("."), ),
    )
    parser.add_argument(
        "--max-contigs",
        default=100,
        type=int,
        help="The maximum number of contigs allowed in a BAM "
        "file. If this number is exceeded, the entire "
        "set of contigs is aggregated into one pseudo-"
        "contig named '<Genome>'. This is done to "
        "limit table sizes [default: %(default)s]",
    )
    parser.add_argument(
        "--ignore-readgroups",
        default=False,
        action="store_true",
        help="Ignore readgroup information in reads, and only "
        "provide aggregated statistics; this is required "
        "if readgroup information is missing or partial "
        "[default: %(default)s]",
    )
    # Help text previously read "if it it exists"
    parser.add_argument(
        "--overwrite-output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists; by "
        "default, the script will terminate if the file "
        "already exists.",
    )

    args = parser.parse_args(argv)

    # The output table defaults to the input filename with the tool extension
    if not args.outfile:
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    if os.path.exists(args.outfile) and not args.overwrite_output:
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file." %
                     (args.outfile, ))

    return args
def test_swap_ext__has_ext_vs_new_ext():
    """An existing extension is replaced by the new one, with a dot added."""
    observed = swap_ext("name.foo", "bar")
    expected = "name.bar"
    assert_equal(observed, expected)
"names are case-sensitive." % (", ".join(map(repr, sorted(unknown_samples))), ", ".join(map(repr, sorted(known_samples))))) return if config.command in ("mito", "example"): if len(args) != 2: print_err("ERROR: Wrong number of arguments!") print_usage() return config.destination = args[1] config.samples = {} elif len(args) == 2: filename = args[1] config.destination = fileutils.swap_ext(filename, ".zonkey") if not os.path.isfile(filename): print_err("ERROR: Not a valid filename: %r" % (filename,)) return elif _is_bamfile(filename): # Called as either of # zonkey run <SampleDB> <nuclear.bam> # zonkey run <SampleDB> <mitochondrial.bam> config.samples = {"-": {"Root": config.destination, "Files": [filename]}} else: config.multisample = True if not _read_sample_table(config, filename): return elif 3 <= len(args) <= 4: