def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeLca=None,
    outLca=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
    min_score_to_output=None,
):
    '''
    Align to metagenomics bwa index, mark duplicates, and generate LCA report

    inBam:               reads to align (passed to bwa.mem).
    db:                  bwa index to align against.
    taxDb:               taxonomy directory handed to TaxonomyDb(tax_dir=...).
    outReport/outLca:    outputs of sam_lca_report on the duplicate-marked alignments.
    dupeReport/dupeLca:  if dupeReport is given, an additional report is produced
                         from the name-sorted alignments *before* duplicate marking
                         (unique_only=False).
    outBam:              if given, the duplicate-marked BAM is written here and kept;
                         otherwise a temp file is used and removed.
    sensitive:           if truthy, adds '-k 12 -A 1 -B 1 -O 1 -E 1' to the bwa options.
    JVMmemory:           JVM memory setting for Picard; falls back to the tool's
                         jvmMemDefault when not given.
    numThreads:          thread count for the samtools name-sort.
    picardOptions:       extra options forwarded to MarkDuplicatesTool.execute.
    min_score_to_output: forwarded to bwa.mem as min_qual; any falsy value
                         (including 0) falls back to 30.
    '''
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    # '-a' makes bwa output all found alignments, not only the best one
    bwa_opts = ['-a']
    if sensitive:
        bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split()

    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname('.bam')
    bwa.mem(inBam, db, aln_bam, options=bwa_opts, min_qual=map_threshold)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        # Optional report that still includes duplicate reads.
        aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
        samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=numThreads)
        sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outLca=dupeLca, unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname('.align_deduped.bam')

    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
    pic = tools.picard.MarkDuplicatesTool()
    # Honor the caller-supplied JVMmemory; only fall back to the tool default
    # when the caller did not specify one.
    pic.execute(
        [aln_bam],
        aln_bam_deduped,
        dupe_removal_out_metrics,
        picardOptions=opts,
        JVMmemory=JVMmemory or pic.jvmMemDefault,
    )

    os.unlink(aln_bam)
    # The metrics file is not an output of this function; do not leave it behind.
    if os.path.exists(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    sam_lca_report(tax_db, aln_bam_deduped, outReport=outReport, outLca=outLca)

    if not outBam:
        os.unlink(aln_bam_deduped)
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None,
                    min_score_to_filter=None, aligner_options=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam:               reads to align.
        refFasta:            reference fasta; it is copied to a temp file and the
                             copy is indexed, so no index files are written next
                             to the caller's reference.
        outBam:              if given, the aligned BAM is written here and kept;
                             otherwise a temp file is used and removed.
        outStats:            if given, samtools idxstats output is written here.
        min_score_to_filter: forwarded unchanged to bwa.mem.
        aligner_options:     whitespace-separated string of extra bwa options.

        At least one of outBam / outStats must be specified.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(refFasta, ref_indexed)
    bwa.index(ref_indexed)

    bwa_opts = [] if aligner_options is None else aligner_options.split()
    # Align against the indexed temp copy: the index was built for ref_indexed,
    # not for the caller's refFasta.
    bwa.mem(inBam, ref_indexed, bam_aligned, options=bwa_opts,
            min_score_to_filter=min_score_to_filter)

    if outStats is not None:
        samtools.idxstats(bam_aligned, outStats)

    if outBam is None:
        os.unlink(bam_aligned)

    # Remove the files created by bwa index; the empty extension removes the
    # temporary copy of the reference itself.
    for ext in (".amb", ".ann", ".bwt", ".pac", ".sa", ""):
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeLca=None,
    outLca=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
    min_score_to_output=None,
):
    """
    Align to metagenomics bwa index, mark duplicates, and generate LCA report

    Steps: bwa mem against ``db`` -> (optional) LCA report on the name-sorted
    alignments including duplicates -> Picard MarkDuplicates -> LCA report on
    the duplicate-marked BAM.

    inBam:               reads to align.
    db:                  bwa index to align against.
    taxDb:               taxonomy directory handed to TaxonomyDb(tax_dir=...).
    outReport/outLca:    report outputs for the duplicate-marked alignments.
    dupeReport/dupeLca:  report outputs for the pre-deduplication alignments;
                         only produced when dupeReport is given.
    outBam:              keep the duplicate-marked BAM at this path; when omitted
                         a temp file is used and deleted.
    sensitive:           if truthy, adds "-k 12 -A 1 -B 1 -O 1 -E 1" to bwa options.
    JVMmemory:           JVM memory setting for Picard; the tool's jvmMemDefault
                         is used when this is not given.
    numThreads:          thread count for the samtools name-sort.
    picardOptions:       extra options forwarded to MarkDuplicatesTool.execute.
    min_score_to_output: forwarded to bwa.mem as min_qual; any falsy value
                         (including 0) falls back to 30.
    """
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa_opts = ["-a"]
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname(".bam")
    bwa.mem(inBam, db, aln_bam, options=bwa_opts, min_qual=map_threshold)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        aln_bam_sorted = util.file.mkstempfname(".align_namesorted.bam")
        samtools.sort(aln_bam, aln_bam_sorted, args=["-n"], threads=numThreads)
        sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outLca=dupeLca, unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname(".align_deduped.bam")

    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname(".metrics")
    pic = tools.picard.MarkDuplicatesTool()
    # Previously the JVMmemory argument was accepted but never used; pass it
    # through and keep the tool default as the fallback.
    jvm_memory = JVMmemory if JVMmemory else pic.jvmMemDefault
    pic.execute(
        [aln_bam],
        aln_bam_deduped,
        dupe_removal_out_metrics,
        picardOptions=opts,
        JVMmemory=jvm_memory,
    )

    os.unlink(aln_bam)
    # The Picard metrics are not exposed to the caller, so drop the temp file.
    if os.path.exists(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    sam_lca_report(tax_db, aln_bam_deduped, outReport=outReport, outLca=outLca)

    if not outBam:
        os.unlink(aln_bam_deduped)
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam:    reads to align.
        refFasta: reference handed to bwa.mem.
        outBam:   if given, the aligned BAM is written here and kept; otherwise
                  a temp file is used and removed before returning.
        outStats: if given, samtools idxstats output is written here.

        At least one of outBam / outStats must be specified; otherwise the
        alignment would be computed and immediately thrown away.
    '''

    # Fail fast instead of running a full alignment that produces no output.
    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    bwa.mem(inBam, refFasta, bam_aligned)

    if outStats is not None:
        samtools.idxstats(bam_aligned, outStats)

    # The BAM was only an intermediate when the caller did not ask for it.
    if outBam is None:
        os.unlink(bam_aligned)
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None):
    ''' Align reads to a reference with BWA-MEM, then optionally run samtools idxstats.

        inBam:    reads to align.
        refFasta: reference handed to bwa.mem.
        outBam:   destination for the aligned BAM; when None the alignment goes
                  to a temp file that is deleted before returning.
        outStats: destination for samtools idxstats output; skipped when None.

        Asserts that at least one of outBam / outStats is provided.
    '''
    assert outBam or outStats, "Either outBam or outStats must be specified"

    # Remember whether the aligned BAM is only a scratch file.
    bam_is_scratch = outBam is None
    bam_aligned = mkstempfname('.aligned.bam') if bam_is_scratch else outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()
    bwa.mem(inBam, refFasta, bam_aligned)

    wants_stats = outStats is not None
    if wants_stats:
        samtools.idxstats(bam_aligned, outStats)

    if bam_is_scratch:
        os.unlink(bam_aligned)
def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeReads=None,
    outReads=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
):
    '''
    Align to metagenomics bwa index, mark duplicates, and generate LCA report

    inBam:                reads to align.
    db:                   bwa index to align against.
    taxDb:                taxonomy directory handed to TaxonomyDb(tax_dir=...).
    outReport/outReads:   outputs of sam_lca_report on the name-sorted,
                          duplicate-marked alignments.
    dupeReport/dupeReads: outputs of an additional report on the alignments
                          before duplicate marking (unique_only=False); only
                          produced when dupeReport is given.
    outBam:               if given, the duplicate-marked BAM is written here and
                          kept; otherwise a temp file is used and removed.
    sensitive:            if truthy, adds '-k 12 -A 1 -B 1 -O 1 -E 1' to bwa options.
    JVMmemory:            forwarded to MarkDuplicatesTool.execute.
    numThreads:           thread count for the samtools name-sorts.
    picardOptions:        extra options forwarded to MarkDuplicatesTool.execute.
    '''
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa_opts = ['-a']
    if sensitive:
        bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split()

    # TODO: Use bwa.mem's min_score_to_filter argument to decrease false
    # positives in the output. Currently, it works by summing the alignment
    # score across all alignments output by bwa for each query (reads in a
    # pair, supplementary, and secondary alignments). This is not reasonable
    # for reads with secondary alignments because it will be easier for those
    # reads/queries to exceed the threshold given by the value of the argument.
    # In this context, bwa is called using '-a' as an option and its output
    # will likely include many secondary alignments. One option is to add
    # another argument to bwa.mem, similar to min_score_to_filter, that sets a
    # threshold on the alignment score of output alignments but only filters on
    # a per-alignment level (i.e., not by summing alignment scores across all
    # alignments for each query).

    aln_bam = util.file.mkstempfname('.bam')
    bwa.mem(inBam, db, aln_bam, options=bwa_opts)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        # Optional report that still includes duplicate reads.
        aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
        samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=numThreads)
        sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outReads=dupeReads, unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname('.align_deduped.bam')

    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
    pic = tools.picard.MarkDuplicatesTool()
    pic.execute([aln_bam], aln_bam_deduped, dupe_removal_out_metrics, picardOptions=opts, JVMmemory=JVMmemory)

    os.unlink(aln_bam)
    # The Picard metrics are not exposed to the caller, so drop the temp file.
    if os.path.exists(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    # The report is generated from a name-sorted copy of the deduplicated BAM.
    aln_bam_dd_sorted = util.file.mkstempfname('.bam')
    samtools.sort(aln_bam_deduped, aln_bam_dd_sorted, args=['-n'], threads=numThreads)

    sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outReads=outReads)

    # The name-sorted copy is purely an intermediate; remove it once reported.
    os.unlink(aln_bam_dd_sorted)

    if not outBam:
        os.unlink(aln_bam_deduped)
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    '''
        Take reads, align to reference with BWA-MEM, and generate a coverage plot

        The plot_* arguments, the *_threshold arguments, max_coverage_depth and
        out_summary are forwarded unchanged to plot_coverage.

        in_bam:              reads to align.
        ref_fasta:           reference; a temp copy is indexed and aligned against,
                             and the copy plus its index files are removed at the end.
        out_bam:             if given, the sorted, indexed alignment is kept here;
                             otherwise a temp file is used and removed.
        sensitive:           if true, adds "-k 12 -A 1 -B 1 -O 1 -E 1" to bwa options.
        excludeDuplicates:   if true, run Picard MarkDuplicates on the filtered
                             alignments before sorting; also forwarded to plot_coverage.
        JVMmemory:           forwarded to MarkDuplicatesTool.execute.
        picardOptions:       extra options forwarded to MarkDuplicatesTool.execute.
        min_score_to_output: threshold used for bwa's -T option and for the
                             samtools -q filter; any falsy value falls back to 30.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    # Use += so the options are actually accumulated; a bare `bwa_opts + [...]`
    # builds a new list and throws it away.
    bwa_opts = []
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    bwa_opts += ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')
    # NOTE(review): confirm that `opts` is the keyword Bwa.mem expects for the
    # extra command-line options.
    bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    if excludeDuplicates:
        # Only allocate the extra temp BAM when duplicate marking really runs.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; treat that as "no extra options".
        opts = list(picardOptions) if picardOptions else []
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered],
            aln_bam_dupe_processed,
            dupe_removal_out_metrics,
            picardOptions=opts,
            JVMmemory=JVMmemory
        )
        # The Picard metrics are not exposed to the caller, so drop the temp file.
        if os.path.exists(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)

    # Without duplicate marking this is the same file as aln_bam_filtered,
    # which has just been removed.
    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
        base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the reference fasta
    # (ref_indexed) to be removed as well.
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)