def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    plot_x_limits,
    plot_y_limits,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    bin_large_plots=False,
    binning_summary_statistic="max",
    JVMmemory=None,
    picardOptions=None,
    min_score_to_filter=None,
    aligner="bwa",
    aligner_options='',
    novoalign_license_path=None
):
    ''' Take reads, align to reference with BWA-MEM (or Novoalign), and
        generate a coverage plot.

        The reference is copied to a temporary file and indexed there. Reads
        from in_bam are aligned to it, duplicates are optionally processed
        with Picard MarkDuplicates, and the result is sorted and indexed
        before being handed to plot_coverage().

        All plot_* arguments, the *_threshold arguments, max_coverage_depth,
        bin_large_plots, binning_summary_statistic and out_summary are passed
        through to plot_coverage() unchanged.

        out_bam: where to keep the sorted alignment. If None, a temporary
            file is used and deleted after plotting.
        sensitive: (bwa only) append "-k 12 -A 1 -B 1 -O 1 -E 1" to the
            bwa options.
        excludeDuplicates: run Picard MarkDuplicates (with picardOptions)
            on the alignment before sorting.
        picardOptions: iterable of options for MarkDuplicates; None is
            treated as no options.
        min_score_to_filter: (bwa only) forwarded to Bwa.align_mem_bam().
        aligner: "bwa" or "novoalign".
        aligner_options: whitespace-separated option string. Per-aligner
            defaults are substituted only when this is None; the signature
            default '' means "no extra options".
        novoalign_license_path: forwarded to NovoalignTool.
    '''
    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            # hidden option to work around kernel/cpu bug; disables
            # multithreaded file read: https://github.com/lh3/bwa/issues/102
            aligner_options = '-1'

    samtools = tools.samtools.SamtoolsTool()

    # Index a private copy so index side-files never land next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()
        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts,
                          min_score_to_filter=min_score_to_filter)
    elif aligner == "novoalign":
        novoalign = tools.novoalign.NovoalignTool(license_path=novoalign_license_path)
        novoalign.index_fasta(ref_indexed)
        novoalign.execute(
            in_bam, ref_indexed, aln_bam,
            options=aligner_options.split(),
            JVMmemory=JVMmemory)

    if excludeDuplicates:
        # Only allocate the extra temp files when they are actually used.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions) if picardOptions is not None else []
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed, dupe_removal_out_metrics,
            picardOptions=opts, JVMmemory=JVMmemory)
        # The metrics file is not exposed to the caller; do not leave it behind.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style,
        plot_width, plot_height, plot_dpi, plot_title, plot_x_limits,
        plot_y_limits, base_q_threshold, mapping_q_threshold,
        max_coverage_depth, read_length_threshold, excludeDuplicates,
        bin_large_plots, binning_summary_statistic, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def align_and_fix(
    inBam,
    refFasta,
    outBamAll=None,
    outBamFiltered=None,
    aligner_options='',
    aligner="novoalign",
    JVMmemory=None,
    threads=1,
    skip_mark_dupes=False,
    gatk_path=None,
    novoalign_license_path=None
):
    ''' Take reads, align to reference with Novoalign (or BWA), optionally
        mark duplicates with Picard, realign indels with GATK, and
        optionally filter the final file to mapped/non-dupe reads.

        inBam: reads to align.
        refFasta: reference; a temporary copy is made and indexed.
        outBamAll: if set, receives a copy of the realigned BAM (indexed
            with Picard BuildBamIndex).
        outBamFiltered: if set, receives the realigned BAM filtered with
            samtools view "-q 1 -F 1028" (also indexed).
        aligner_options: whitespace-separated option string. Per-aligner
            defaults are substituted only when this is None.
        aligner: "novoalign" or "bwa".
        skip_mark_dupes: skip the Picard MarkDuplicates step.
        JVMmemory, threads, gatk_path, novoalign_license_path: forwarded
            to the underlying tools.

        Returns None. If neither output is requested a warning is logged
        and nothing else is done.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias; warning() is the supported name.
        # (assumes `log` is a standard logging.Logger)
        log.warning("are you sure you meant to do nothing?")
        return

    assert aligner in ["novoalign", "bwa"]

    samtools = tools.samtools.SamtoolsTool()

    refFastaCopy = mkstempfname('.ref_copy.fasta')
    shutil.copyfile(refFasta, refFastaCopy)

    tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy, overwrite=True)
    samtools.faidx(refFastaCopy, overwrite=True)

    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random'
        elif aligner == 'bwa':
            aligner_options = ''  # use defaults

    bam_aligned = mkstempfname('.aligned.bam')
    if aligner == "novoalign":
        novoalign = tools.novoalign.NovoalignTool(license_path=novoalign_license_path)
        novoalign.index_fasta(refFastaCopy)
        novoalign.execute(
            inBam, refFastaCopy, bam_aligned,
            options=aligner_options.split(),
            JVMmemory=JVMmemory)
    elif aligner == 'bwa':
        bwa = tools.bwa.Bwa()
        bwa.index(refFastaCopy)
        opts = aligner_options.split()
        bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, options=opts)

    if skip_mark_dupes:
        # The aligned BAM is carried forward as-is and removed further down
        # under the name bam_marked.
        bam_marked = bam_aligned
    else:
        bam_marked = mkstempfname('.mkdup.bam')
        tools.picard.MarkDuplicatesTool().execute(
            [bam_aligned], bam_marked,
            picardOptions=['CREATE_INDEX=true'],
            JVMmemory=JVMmemory)
        # Only safe to delete here: in the skip branch bam_aligned is still
        # the input to indexing and realignment.
        os.unlink(bam_aligned)

    samtools.index(bam_marked)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool(path=gatk_path).local_realign(
        bam_marked, refFastaCopy, bam_realigned,
        JVMmemory=JVMmemory, threads=threads)
    os.unlink(bam_marked)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        samtools.view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None,
    aligner="bwa",
    aligner_options='',
    novoalign_license_path=None
):
    ''' Take reads, align to reference with BWA-MEM (or Novoalign), and
        generate a coverage plot.

        The reference is copied to a temporary file and indexed there. Reads
        from in_bam are aligned to it, duplicates are optionally processed
        with Picard MarkDuplicates, and the result is sorted and indexed
        before being handed to plot_coverage().

        All plot_* arguments, the *_threshold arguments, max_coverage_depth
        and out_summary are passed through to plot_coverage() unchanged.

        out_bam: where to keep the sorted alignment. If None, a temporary
            file is used and deleted after plotting.
        sensitive: (bwa only) append "-k 12 -A 1 -B 1 -O 1 -E 1" to the
            bwa options.
        excludeDuplicates: run Picard MarkDuplicates (with picardOptions)
            on the alignment before sorting.
        picardOptions: iterable of options for MarkDuplicates; None is
            treated as no options.
        min_score_to_output: (bwa only) value passed as min_qual to
            Bwa.align_mem_bam(); falls back to 30 when unset. An explicit
            "-T <n>" in aligner_options takes precedence.
        aligner: "bwa" or "novoalign".
        aligner_options: whitespace-separated option string. Per-aligner
            defaults are substituted only when this is None; the signature
            default '' means "no extra options".
        novoalign_license_path: forwarded to NovoalignTool.
    '''
    # TODO: use read_utils.py::align_and_fix in place of the duplicated alignment code here
    # The main difference is the presence/absence of GATK's local_realign

    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    assert aligner in ["bwa", "novoalign"]
    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random -l 40 -g 40 -x 20 -t 100 -k'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    samtools = tools.samtools.SamtoolsTool()

    # Index a private copy so index side-files never land next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    aln_bam = util.file.mkstempfname('.bam')
    if aligner == "bwa":
        bwa = tools.bwa.Bwa()
        bwa.index(ref_indexed)

        bwa_opts = aligner_options.split()
        if sensitive:
            # Must extend in place: a bare `bwa_opts + [...]` builds a new
            # list and throws it away, silently ignoring `sensitive`.
            bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = min_score_to_output or 30
        if '-T' in bwa_opts:
            threshold_pos = bwa_opts.index("-T") + 1
            # Strict '<': only read the value if one actually follows -T,
            # otherwise keep the fallback instead of raising IndexError.
            if threshold_pos < len(bwa_opts):
                bwa_map_threshold = int(bwa_opts[threshold_pos])

        bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts,
                          min_qual=bwa_map_threshold)
    elif aligner == "novoalign":
        novoalign = tools.novoalign.NovoalignTool(license_path=novoalign_license_path)
        novoalign.index_fasta(ref_indexed)
        novoalign.execute(
            in_bam, ref_indexed, aln_bam,
            options=aligner_options.split(),
            JVMmemory=JVMmemory)

    if excludeDuplicates:
        # Only allocate the extra temp files when they are actually used.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions) if picardOptions is not None else []
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed, dupe_removal_out_metrics,
            picardOptions=opts, JVMmemory=JVMmemory)
        # The metrics file is not exposed to the caller; do not leave it behind.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style,
        plot_width, plot_height, plot_dpi, plot_title, base_q_threshold,
        mapping_q_threshold, max_coverage_depth, read_length_threshold,
        excludeDuplicates, out_summary)

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def align_and_fix(
    inBam,
    refFasta,
    outBamAll=None,
    outBamFiltered=None,
    aligner_options='',
    aligner="novoalign",
    JVMmemory=None,
    threads=1,
    gatk_path=None,
    novoalign_license_path=None
):
    ''' Take reads, align to reference with Novoalign (or BWA), mark
        duplicates with Picard, realign indels with GATK, and optionally
        filter the final file to mapped/non-dupe reads.

        inBam: reads to align.
        refFasta: reference; a temporary copy is made and indexed.
        outBamAll: if set, receives a copy of the realigned BAM (indexed
            with Picard BuildBamIndex).
        outBamFiltered: if set, receives the realigned BAM filtered with
            samtools view "-q 1 -F 1028" (also indexed).
        aligner_options: whitespace-separated option string. Per-aligner
            defaults are substituted only when this is None. For bwa, a
            "-T <n>" entry is also parsed and passed as min_qual to
            Bwa.align_mem_bam() (30 when absent).
        aligner: "novoalign" or "bwa".
        JVMmemory, threads, gatk_path, novoalign_license_path: forwarded
            to the underlying tools.

        Returns None. If neither output is requested a warning is logged
        and nothing else is done.
    '''
    if not (outBamAll or outBamFiltered):
        # Logger.warn is a deprecated alias; warning() is the supported name.
        # (assumes `log` is a standard logging.Logger)
        log.warning("are you sure you meant to do nothing?")
        return

    assert aligner in ["novoalign", "bwa"]

    samtools = tools.samtools.SamtoolsTool()

    refFastaCopy = mkstempfname('.ref_copy.fasta')
    shutil.copyfile(refFasta, refFastaCopy)

    tools.picard.CreateSequenceDictionaryTool().execute(refFastaCopy, overwrite=True)
    samtools.faidx(refFastaCopy, overwrite=True)

    if aligner_options is None:
        if aligner == "novoalign":
            aligner_options = '-r Random'
        elif aligner == 'bwa':
            aligner_options = '-T 30'  # quality threshold

    bam_aligned = mkstempfname('.aligned.bam')
    if aligner == "novoalign":
        novoalign = tools.novoalign.NovoalignTool(license_path=novoalign_license_path)
        novoalign.index_fasta(refFastaCopy)
        novoalign.execute(
            inBam, refFastaCopy, bam_aligned,
            options=aligner_options.split(),
            JVMmemory=JVMmemory
        )
    elif aligner == 'bwa':
        bwa = tools.bwa.Bwa()
        bwa.index(refFastaCopy)

        opts = aligner_options.split()

        # get the quality threshold from the opts
        # for downstream filtering
        bwa_map_threshold = 30
        if '-T' in opts:
            threshold_pos = opts.index("-T") + 1
            # Strict '<': only read the value if one actually follows -T,
            # otherwise keep the default instead of raising IndexError.
            if threshold_pos < len(opts):
                bwa_map_threshold = int(opts[threshold_pos])

        bwa.align_mem_bam(inBam, refFastaCopy, bam_aligned, options=opts,
                          min_qual=bwa_map_threshold)

    bam_mkdup = mkstempfname('.mkdup.bam')
    tools.picard.MarkDuplicatesTool().execute(
        [bam_aligned], bam_mkdup,
        picardOptions=['CREATE_INDEX=true'],
        JVMmemory=JVMmemory
    )
    os.unlink(bam_aligned)
    samtools.index(bam_mkdup)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool(path=gatk_path).local_realign(
        bam_mkdup, refFastaCopy, bam_realigned,
        JVMmemory=JVMmemory, threads=threads)
    os.unlink(bam_mkdup)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        samtools.view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    ''' Take reads, align to reference with BWA-MEM, and generate a
        coverage plot.

        The reference is copied to a temporary file and indexed there. Reads
        from in_bam are aligned to it, duplicates are optionally processed
        with Picard MarkDuplicates, and the result is sorted and indexed
        before being handed to plot_coverage().

        All plot_* arguments, the *_threshold arguments, max_coverage_depth
        and out_summary are passed through to plot_coverage() unchanged.

        out_bam: where to keep the sorted alignment. If None, a temporary
            file is used and deleted after plotting.
        sensitive: pass "-k 12 -A 1 -B 1 -O 1 -E 1" to bwa.
        excludeDuplicates: run Picard MarkDuplicates (with picardOptions)
            on the alignment before sorting.
        picardOptions: iterable of options for MarkDuplicates; None is
            treated as no options.
        min_score_to_output: value passed as min_qual to
            Bwa.align_mem_bam(); falls back to 30 when unset.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    # Index a private copy so index side-files never land next to ref_fasta.
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    bwa_opts = []
    if sensitive:
        # Must extend in place: a bare `bwa_opts + [...]` builds a new list
        # and throws it away, silently ignoring `sensitive`.
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname('.bam')
    bwa.align_mem_bam(in_bam, ref_indexed, aln_bam, options=bwa_opts,
                      min_qual=map_threshold)

    if excludeDuplicates:
        # Only allocate the extra temp files when they are actually used.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions) if picardOptions is not None else []
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam], aln_bam_dupe_processed, dupe_removal_out_metrics,
            picardOptions=opts, JVMmemory=JVMmemory
        )
        # The metrics file is not exposed to the caller; do not leave it behind.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style,
        plot_width, plot_height, plot_dpi, plot_title, base_q_threshold,
        mapping_q_threshold, max_coverage_depth, read_length_threshold,
        excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the original fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)