def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    bwa_cmdline = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | '
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | '
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    ).format(**locals())
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(**locals())
        run(dedup_cmdl, output_fpath=dedup_bam_fpath, stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    # samtools view -b -S -u - |
    # sambamba sort -N -t 8 -m 682M --tmpdir /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000-sorttmp-full
    #   -o /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000.bam
    #   /dev/stdin

    # if dedup:
    #     info()
    #     info('Calling SamBlaster to mark duplicates')
    #     markdup_sam_fpath = markdup_sam(sam_fpath, samblaster)
    #     if markdup_sam_fpath:
    #         sam_fpath = markdup_sam_fpath
    #     info()

    # info('Converting to BAM')
    # cmdline = sambamba.get_executable() + ' view -t {threads} -S -f bam {sam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=bam_fpath, reuse=cfg.reuse_intermediate)
    #
    # info()
    # info('Sorting BAM')
    # prefix = splitext(sorted_bam_fpath)[0]
    # cmdline = sambamba.get_executable() + ' sort -t {threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False, reuse=cfg.reuse_intermediate)

    return bam_fpath

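# Illustrative usage sketch (not part of the original pipeline code): drives align()
# above, assuming 'bwa' and 'sambamba' are on PATH and a bwa index exists at the
# given prefix; all file paths below are hypothetical placeholders.
def _example_align_usage():
    # bwa mem output is piped straight through sambamba view/sort, so no
    # intermediate SAM file ever touches the disk.
    return align(work_dir='work', sample_name='sample1',
                 l_fpath='sample1_R1.fastq.gz', r_fpath='sample1_R2.fastq.gz',
                 bwa='bwa', smb='sambamba', bwa_prefix='ref/hg19.fa',
                 dedup=True, threads=4)
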
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either fai_fpath or a genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

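# Illustrative usage sketch (assumptions: the "gsort" binary is on PATH and an hg19
# .fai index is resolvable; the BED file name is hypothetical).
def _example_sort_bed_usage():
    # Either pass a .fai directly via fai_fpath, or a genome build name so the .fai
    # is looked up via ref.get_fai(); the path of the sorted BED is returned.
    return sort_bed_gsort('targets.bed', work_dir='work', genome='hg19')
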
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
            file_exists(gzipped_fpath) and \
            (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')

    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip and not tabix:
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath

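# Illustrative usage sketch (assumptions: bgzip and tabix from htslib are on PATH;
# the input file name is hypothetical and is already coordinate-sorted).
def _example_bgzip_and_tabix_usage():
    # '-p bed' selects tabix's BED preset; the function returns the .gz path,
    # falling back to the original path if bgzip/tabix are missing.
    return bgzip_and_tabix('targets.sorted.bed', tabix_parameters='-p bed')
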
def cut(fpath, col_num, output_fpath=None):
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    cmdline = 'cut -f' + ','.join(map(str, range(1, col_num + 1))) + ' ' + fpath + ' > ' + output_fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath

def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath

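# Illustrative usage sketch (assumptions: bedtools is on PATH and accepts the .fai
# index as the chromosome-sizes file for "slop -g"; file names are hypothetical).
def _example_padding_usage():
    # Expands every region in targets.bed by 200 bp on both sides.
    return get_padded_bed_file('work', 'targets.bed', padding=200, fai_fpath='ref/hg19.fa.fai')
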
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(**locals())

    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath, work_dir)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir

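# Illustrative usage sketch (assumptions: Qualimap is resolvable via find_executable(),
# and the expected report file names match the installed Qualimap version; all paths
# below are hypothetical).
def _example_run_qualimap_usage():
    expected = [join('qualimap_out', 'qualimapReport.html'),
                join('qualimap_out', 'genome_results.txt')]
    # The genome prefix only switches Qualimap's -gd flag (HUMAN/MOUSE); other
    # builds simply omit it.
    return run_qualimap('work', 'qualimap_out', expected, 'sample1.bam',
                        genome='hg19', bed_fpath='targets.bed', threads=4)
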
def intersect_bed(work_dir, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, bed1_fname + '__' + bed2_fname + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath
    bedtools = which('bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call_process.run(cmdline, output_fpath=output_fpath, checks=[call_process.file_exists_check])
    return output_fpath

def bam_to_bed(bam_fpath, to_gzip=True):
    # From http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    debug('Converting the BAM to BED to save some memory.')
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath

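# Illustrative usage sketch (assumptions: bedtools and gzip are on PATH; the BAM
# path is a hypothetical placeholder).
def _example_bam_to_bed_usage():
    bed_gz = bam_to_bed('sample1.bam')                  # -> sample1.bed.gz
    bed = bam_to_bed('sample1.bam', to_gzip=False)      # -> sample1.bed
    return bed_gz, bed
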
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath

def sort_bam(bam_fpath, work_dir, sambamba=None, samtools=None):
    sambamba = sambamba or get_executable()
    sorted_bam = intermediate_fname(work_dir, bam_fpath, 'sorted')
    if not can_reuse(sorted_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} sort {bam_fpath} -o {sorted_bam}'.format(**locals())
        run(cmdline, output_fpath=sorted_bam, stdout_to_outputfile=False, stdout_tx=False)
    return sorted_bam

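# Illustrative usage sketch chaining sort_bam() with index_bam() defined below
# (assumptions: sambamba is on PATH; the BAM path is hypothetical).
def _example_sort_and_index_usage():
    sorted_bam = sort_bam('sample1.bam', work_dir='work')
    index_bam(sorted_bam)  # writes <sorted_bam>.bai next to the BAM
    return sorted_bam
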
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and puts them into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap multi-sample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            if find_executable() is not None:  # and get_qualimap_type(find_executable()) == 'full':
                qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(samples)
                _correct_qualimap_insert_size_histogram(samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                run(cmdline, env_vars=dict(DISPLAY=None),
                    checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                    reuse=cfg.reuse_intermediate)

                if not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))

def call_sambamba(cmdl, bam_fpath, output_fpath=None, command_name='', no_index=False):
    if not no_index:
        index_bam(bam_fpath)
    sambamba = get_executable()
    run(sambamba + ' ' + cmdl, output_fpath=output_fpath)
    return output_fpath

def index_bam(bam_fpath, sambamba=None, samtools=None):
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        run(cmdline, output_fpath=indexed_bam, stdout_to_outputfile=False, stdout_tx=False)