def add_bootstrap_support(destination, replicate, bootstrap): if not (replicate and bootstrap): return filter(None, (replicate, bootstrap)) replicate_file = os.path.join(destination, "replicates.newick") bootstrap_file = os.path.join(destination, "bootstraps.newick") output_file = add_postfix(replicate_file, ".support") return NewickSupportNode(main_tree_files = replicate_file, support_tree_files = bootstrap_file, output_file = output_file, dependencies = (bootstrap, replicate)),
def add_bootstrap_support(destination, replicate, bootstrap):
    if not (replicate and bootstrap):
        # Only one (or neither) set of trees was built; return whatever nodes exist
        return filter(None, (replicate, bootstrap))

    replicate_file = os.path.join(destination, "replicates.newick")
    bootstrap_file = os.path.join(destination, "bootstraps.newick")
    output_file = add_postfix(replicate_file, ".support")

    # Note the trailing comma: a single-element tuple of nodes is returned
    return NewickSupportNode(main_tree_files=replicate_file,
                             support_tree_files=bootstrap_file,
                             output_file=output_file,
                             dependencies=(bootstrap, replicate)),

def build_regions_nodes(regions, padding, dependencies=()):
    destination = add_postfix(regions["BED"], ".padded_%ibp" % (padding,))

    if not padding:
        # No padding requested; use the original BED file as-is
        return regions["BED"], dependencies

    # Padded BED files are memoized in _BED_CACHE, so that the same padded
    # file is only built once even when requested by several callers
    if destination not in _BED_CACHE:
        dependencies = list(dependencies)
        dependencies.append(build_fasta_index_node(regions["FASTA"]))
        _BED_CACHE[destination] \
            = PaddedBedNode(fai_file=regions["FASTA"] + ".fai",
                            infile=regions["BED"],
                            outfile=destination,
                            amount=padding,
                            dependencies=dependencies)

    return destination, (_BED_CACHE[destination],)

def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contain the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted.
    This check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.
    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info(" - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(options.samples_root,
                                    "%s.%s.bam"
                                    % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions["FASTA"])

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = ("Reference sequence missing from BAM file; "
                               "BAM file aligned against different prefix?\n"
                               " BAM file = %s\n Sequence name = %s") \
                               % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = ("Length of reference sequence in FASTA differs "
                               "from length of sequence in BAM file; BAM file "
                               "aligned against different prefix?\n"
                               " BAM file = %s\n"
                               " Length in FASTA = %s\n"
                               " Length in BAM = %s") \
                               % (filename, length, bam_length)
                    raise MakefileError(message)

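# The helper _collect_fasta_contigs used above is not shown in this excerpt.
# Assuming it reads the samtools FASTA index (.fai) built for regions["FASTA"],
# a minimal sketch could look like the following; the name, signature, and any
# caching done by the real helper are assumptions, and this is illustration only.
def _collect_fasta_contigs_sketch(fasta_filename):
    contigs = {}
    with open(fasta_filename + ".fai") as handle:
        for line in handle:
            # .fai columns are: name, length, offset, linebases, linewidth
            name, length = line.split("\t")[:2]
            contigs[name] = int(length)
    return contigs
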
def build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                   dependencies):
    bamfile = "%s.%s.bam" % (sample, regions["Prefix"])
    bamfile = os.path.join(options.samples_root, bamfile)
    if regions["Realigned"]:
        bamfile = add_postfix(bamfile, ".realigned")

    prefix = regions["Genotypes"][sample]
    padding, bedfile = genotyping["Padding"], None
    if not genotyping["GenotypeEntirePrefix"]:
        bedfile, nodes = build_regions_nodes(regions, padding, dependencies)
        bai_node = build_bam_index_node(bamfile)
        dependencies = nodes + (bai_node,)
    else:
        prefix = os.path.join(os.path.dirname(prefix),
                              "%s.%s.TEMP" % (sample, regions["Prefix"]))
        dependencies += (build_bam_index_node(bamfile),)

    return prefix, bamfile, bedfile, dependencies

def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    padding = genotyping["Padding"]
    slop, node = build_regions_nodes(regions, padding, dependencies)

    bam_file = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_file)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    genotype = GenotypeRegionsNode.customize(pileup_only=True,
                                             reference=regions["FASTA"],
                                             bedfile=slop,
                                             infile=bam_file,
                                             outfile=pileup_file,
                                             nbatches=options.samtools_max_threads,
                                             dependencies=node + (bai_node,))
    apply_samtools_options(genotype.command, genotyping["MPileup"],
                           "--mpileup-argument")
    genotype = genotype.build_node()

    tabix = TabixIndexNode(infile=pileup_file,
                           preset="pileup",
                           dependencies=genotype)
    builder = SampleRegionsNode(infile=pileup_file,
                                bedfile=regions["BED"],
                                outfile=fasta_file,
                                dependencies=tabix)
    faidx = FastaIndexNode(infile=fasta_file,
                           dependencies=builder)

    return (faidx,)

def main(argv):
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args, "bam")
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        tasks = bam_pipeline.build_pipeline_full(bam_config, makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n"
                             % (makefile_path,))
            sys.stderr.write(" Please set --destination and/or --output-name-postfix\n")
            sys.stderr.write(" before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n"
                             % (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            template = bam_mkfile.build_makefile(add_sample_tmpl=False)
            makefile_handle.write(template)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" % (_INDENTATION,
                                              target.name, target_name))
                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 0,
                                                     target_name))

                for prefix in target.prefixes:
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2, sample.name))
                        makefile_handle.write('%s"%s":\n' % (_INDENTATION * 1,
                                                             sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" % (_INDENTATION * 3,
                                                     library.name))
                            makefile_handle.write('%s"%s":\n'
                                                  % (_INDENTATION * 2,
                                                     library.name))

                            sink_cache = {}
                            destination = os.path.join(target_name, "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination, lane,
                                              sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write('%s"%s":\n'
                                                      % (_INDENTATION * 3,
                                                         lane_name))
                                for (reads_type, sink) \
                                        in sorted(sink_cache[lane_name].items()):
                                    makefile_handle.write(
                                        '%s%s "%s"\n'
                                        % (_INDENTATION * 4,
                                           ("%s:" % (reads_type,)).ljust(20),
                                           sink.filename))
                                makefile_handle.write("\n")

        print("\tDone ...")
    print()

    return 0

def test_add_postfix__no_ext__no_postfix():
    assert_equal(add_postfix("name", ""), "name")


def test_add_postfix__no_ext__underscore_postfix():
    assert_equal(add_postfix("name", "_pf"), "name_pf")


def test_add_postfix__dot_postfix():
    assert_equal(add_postfix("name.foo", ".pf"), "name.pf.foo")


def test_add_postfix__underscore_postfix():
    assert_equal(add_postfix("name.foo", "_pf"), "name_pf.foo")


def test_add_postfix__no_ext__dot_postfix():
    assert_equal(add_postfix("name", ".pf"), "name.pf")


def test_add_postfix__no_postfix():
    assert_equal(add_postfix("name.foo", ""), "name.foo")
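
# The tests above pin down the expected behaviour of add_postfix: the postfix
# is inserted immediately before the final extension, or simply appended when
# the filename has no extension. A minimal sketch satisfying these cases is
# shown below; it is illustrative only, and the project's own implementation
# (imported by the tests) remains authoritative.
import os


def add_postfix_sketch(filename, postfix):
    # Split off the last extension and re-attach it after the postfix, e.g.
    # ("name.foo", ".pf") -> "name.pf.foo" and ("name", ".pf") -> "name.pf"
    root, ext = os.path.splitext(filename)
    return root + postfix + ext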