Exemple #1
0
def align(work_dir,
          sample_name,
          l_fpath,
          r_fpath,
          bwa,
          smb,
          bwa_prefix,
          dedup=True,
          threads=1):
    """Align a pair of read files with bwa and return the resulting BAM path.

    The command line pipes ``bwa mem`` into ``smb view`` and ``smb sort``.
    When ``dedup`` is truthy the sorted BAM is additionally passed through
    ``smb markdup`` and the marked file is renamed over the sorted one.
    The BAM is indexed via ``sambamba.index_bam`` before returning.

    :param work_dir: directory holding the BAM and the sort temp directory
    :param sample_name: accepted but not used in the body
    :param l_fpath: first reads file passed to ``bwa mem``
    :param r_fpath: second reads file passed to ``bwa mem``
    :param bwa: bwa executable used in the command line
    :param smb: executable used for the ``view``/``sort``/``markdup`` steps
    :param bwa_prefix: reference index prefix passed to ``bwa mem``
    :param dedup: whether to run the ``markdup`` step
    :param threads: value passed to every ``-t`` option
    :return: path to the BAM file; returned immediately (no alignment, no
        indexing) when ``can_reuse`` accepts the existing file
    """
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    # bwa -> unsorted BAM on stdout -> sorted BAM written to bam_fpath
    pipeline_template = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | ' +
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | ' +
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    )
    bwa_cmdline = pipeline_template.format(
        bwa=bwa,
        smb=smb,
        threads=threads,
        bwa_prefix=bwa_prefix,
        l_fpath=l_fpath,
        r_fpath=r_fpath,
        tmp_dirpath=tmp_dirpath,
        bam_fpath=bam_fpath)
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        markdup_template = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'
        dedup_cmdl = markdup_template.format(
            smb=smb,
            threads=threads,
            bam_fpath=bam_fpath,
            dedup_bam_fpath=dedup_bam_fpath)
        run(dedup_cmdl,
            output_fpath=dedup_bam_fpath,
            stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        # The duplicate-marked file takes the place of the sorted one.
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    return bam_fpath
Exemple #2
0
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    """Sort a BED file with ``gsort`` using a ``.fai`` index for ordering.

    :param input_bed_fpath: BED file to sort; checked with ``verify_bed``
    :param output_bed_fpath: destination path; when falsy, a path is built
        with ``intermediate_fname(work_dir, input_bed_fpath, 'sorted')``
    :param work_dir: passed to ``intermediate_fname`` and ``file_transaction``
    :param fai_fpath: index file handed to ``gsort``; takes precedence over
        ``genome``
    :param genome: genome build name resolved through ``ref.get_fai`` when
        ``fai_fpath`` is not given
    :return: path to the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    # Resolve the index: an explicit path wins over a genome build name.
    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    gsort_cmdline = 'gsort {input_bed_fpath} {fai_fpath}'.format(
        input_bed_fpath=input_bed_fpath, fai_fpath=fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmdline, output_fpath=tx)

    return output_bed_fpath
Exemple #3
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """Compress ``fpath`` with bgzip and index the result with tabix.

    :param fpath: file to compress; the output is ``fpath + '.gz'`` with a
        ``.tbi`` index next to it
    :param reuse: when truthy, existing ``.gz``/``.tbi`` files are returned
        as-is provided the ``.gz`` is not older than ``fpath`` (if ``fpath``
        still exists) and the ``.tbi`` is not older than the ``.gz``
    :param tabix_parameters: extra command-line options inserted into the
        tabix call
    :param kwargs: accepted for call compatibility; not used in the body
    :return: path to the compressed file, or ``fpath`` unchanged when bgzip
        or tabix cannot be found
    """
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
           file_exists(gzipped_fpath) and (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # Both tools are required: with either one missing, the command lines
    # below would start with the literal "None". Bail out before touching
    # any existing output.
    if not bgzip or not tabix:
        return fpath

    # Remove stale outputs so bgzip does not stop on an existing file.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(bgzip=bgzip, fpath=fpath)
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(
        tabix=tabix,
        tabix_parameters=tabix_parameters,
        gzipped_fpath=gzipped_fpath)
    call_process.run(cmdline)

    return gzipped_fpath
Exemple #4
0
def cut(fpath, col_num, output_fpath=None):
    """Keep the first ``col_num`` tab-separated columns of ``fpath``.

    :param fpath: input file given to the ``cut`` command
    :param col_num: number of leading columns to keep (fields 1..col_num)
    :param output_fpath: destination path; when falsy it is derived with
        ``add_suffix(fpath, 'cut')``
    :return: path to the output file
    """
    if not output_fpath:
        output_fpath = add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath

    # Explicit comma-separated field list: 1,2,...,col_num
    field_list = ','.join(str(field) for field in range(1, col_num + 1))
    cmdline = 'cut -f' + field_list + ' ' + fpath + ' > ' + output_fpath
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
Exemple #5
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """Extend every region of ``bed`` by ``padding`` using ``bedtools slop``.

    :param work_dir: directory used to build the output file name
    :param bed: input BED file
    :param padding: value passed to the ``-b`` option of ``bedtools slop``
    :param fai_fpath: file passed to the ``-g`` option of ``bedtools slop``
    :return: path to the padded BED file
    """
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    slop_cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(
        bedtools=bedtools,
        bed=bed,
        genome_fpath=fai_fpath,
        padding=padding)
    padded_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(slop_cmdline, output_fpath=padded_fpath)
    return padded_fpath
Exemple #6
0
def run_qualimap(work_dir,
                 output_dir,
                 output_fpaths,
                 bam_fpath,
                 genome,
                 bed_fpath=None,
                 threads=1):
    """Run ``qualimap bamqc`` on a BAM file and verify its outputs.

    The command is skipped when every path in ``output_fpaths`` passes
    ``can_reuse`` against the inputs. If Qualimap fails with the message
    that the alignment file is unsorted, the BAM is sorted and the command
    is re-run once on the sorted file.

    :param work_dir: working directory; used for the sorted BAM on retry
    :param output_dir: directory passed to ``-outdir`` (created if missing)
    :param output_fpaths: result files expected from Qualimap
    :param bam_fpath: BAM file to analyse
    :param genome: genome build name; names starting with ``hg``/``GRCh``
        add ``-gd HUMAN``, names starting with ``mm`` add ``-gd MOUSE``
    :param bed_fpath: optional regions file passed to ``-gff``
    :param threads: value passed to ``-nt``
    :return: ``output_dir``
    """
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem_cmdl = '--java-mem-size=' + str(int(mem_m)) + 'M'

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(
        threads=threads,
        mem_cmdl=mem_cmdl,
        bam_fpath=bam_fpath,
        output_dir=output_dir,
        bed_fpath=bed_fpath)

    # Inputs the outputs are compared against for reuse and verification.
    input_fpaths = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]

    if not all(can_reuse(fp, input_fpaths) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            # e.output is None when the output was not captured and may be
            # bytes; normalise it so the substring test cannot raise.
            process_output = e.output or ''
            if isinstance(process_output, bytes):
                process_output = process_output.decode('utf-8', 'replace')
            if 'The alignment file is unsorted.' in process_output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                # sort_bam requires the working directory for its output.
                sorted_bam_fpath = sort_bam(bam_fpath, work_dir)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))
            # Any other failure is reported by the output check below.

    if not all(verify_file(fp, cmp_f=input_fpaths) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Exemple #7
0
def intersect_bed(work_dir, bed1, bed2):
    """Write the regions of ``bed1`` that overlap ``bed2``.

    Runs ``bedtools intersect -u -a bed1 -b bed2``. The output is placed in
    ``work_dir`` and named ``<bed1 stem>__<bed2 stem>.bed``.

    :param work_dir: directory for the output file
    :param bed1: BED file passed as ``-a``
    :param bed2: BED file passed as ``-b``
    :return: path to the output file
    """
    stem_a = splitext_plus(basename(bed1))[0]
    stem_b = splitext_plus(basename(bed2))[0]
    output_fpath = join(work_dir, stem_a + '__' + stem_b + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath

    bedtools = which('bedtools')
    intersect_cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(
        bedtools=bedtools, bed1=bed1, bed2=bed2)
    call_process.run(
        intersect_cmdline,
        output_fpath=output_fpath,
        checks=[call_process.file_exists_check])
    return output_fpath
Exemple #8
0
def bam_to_bed(bam_fpath, to_gzip=True):
    """Convert a BAM file to BED with ``bedtools bamtobed``.

    :param bam_fpath: BAM file to convert
    :param to_gzip: when truthy the output is piped through gzip and the
        result gets a ``.bed.gz`` extension instead of ``.bed``
    :return: path to the BED file
    """
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    debug('Converting the BAM to BED to save some memory.')

    extension = '.bed.gz' if to_gzip else '.bed'
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + extension
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(
        bedtools=bedtools, bam_fpath=bam_fpath)
    if to_gzip:
        cmdline += ' | {gzip}'.format(gzip=gzip)
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Exemple #9
0
def annotate_target(work_dir, target_bed, genome_build):
    """Annotate a target BED file with the ``annotate_bed.py`` script.

    :param work_dir: directory used for the intermediate output files
    :param target_bed: BED file to annotate
    :param genome_build: value passed to the ``-g`` option of the script
    :return: path to the annotated file; when the script is run, the path
        returned by ``clean_bed`` applied to its output
    """
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    annotate_cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(
        annotate_bed_py=annotate_bed_py,
        target_bed=target_bed,
        genome_build=genome_build,
        output_fpath=output_fpath)
    # The script writes the file itself through -o.
    run(annotate_cmdline, output_fpath, stdout_to_outputfile=False)
    return clean_bed(output_fpath, work_dir)
Exemple #10
0
def sort_bam(bam_fpath, work_dir, sambamba=None, samtools=None):
    """Sort a BAM file with ``sambamba sort``.

    :param bam_fpath: BAM file to sort
    :param work_dir: directory used to build the sorted file name
    :param sambamba: executable to use; ``get_executable()`` when falsy
    :param samtools: accepted but not used in the body
    :return: path to the sorted BAM file
    """
    if not sambamba:
        sambamba = get_executable()
    sorted_bam = intermediate_fname(work_dir, bam_fpath, 'sorted')
    if can_reuse(sorted_bam, cmp_f=bam_fpath, silent=True):
        return sorted_bam

    sort_cmdline = '{sambamba} sort {bam_fpath} -o {sorted_bam}'.format(
        sambamba=sambamba, bam_fpath=bam_fpath, sorted_bam=sorted_bam)
    run(sort_cmdline,
        output_fpath=sorted_bam,
        stdout_to_outputfile=False,
        stdout_tx=False)
    return sorted_bam
Exemple #11
0
def run_multisample_qualimap(output_dir, work_dir, samples, targqc_full_report):
    """Build Qualimap multi-sample plots and register them in the report.

    1. Generates Qualimap2 plots and puts them into ``<output_dir>/plots``.
       This step is skipped when that directory exists and every non-hidden
       file in it passes ``can_reuse`` against the per-sample HTML reports.
    2. Stores the ``.png`` plot paths, relative to ``output_dir``, in
       ``targqc_full_report.plots``.

    :param output_dir: directory that receives the ``plots`` folder
    :param work_dir: directory where ``qualimap_multi_bamqc`` is created
    :param samples: objects exposing ``name`` and ``qualimap_html_fpath``
    :param targqc_full_report: object whose ``plots`` attribute is replaced
    :return: ``None``; when no plots are available the function warns and
        returns early, leaving ``targqc_full_report.plots`` untouched
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and all(
            can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap miltisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if any(s.qualimap_html_fpath for s in samples):
            if find_executable() is None:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

            qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

            _correct_qualimap_genome_results(samples)
            _correct_qualimap_insert_size_histogram(samples)

            safe_mkdir(qualimap_output_dir)
            rows = [[sample.name, sample.qualimap_html_fpath]
                    for sample in samples if sample.qualimap_html_fpath]

            data_fpath = write_tsv_rows(([], rows), join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
            qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
            cmdline = find_executable() + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                data_fpath=data_fpath, qualimap_output_dir=qualimap_output_dir)
            run(cmdline, env_vars=dict(DISPLAY=None),
                checks=[lambda _1, _2: verify_dir(qualimap_output_dir)], reuse=cfg.reuse_intermediate)

            if not verify_dir(qualimap_plots_dirpath):
                warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                return None
            if exists(plots_dirpath):
                shutil.rmtree(plots_dirpath)
            shutil.move(qualimap_plots_dirpath, plots_dirpath)

    # Reached without a plots directory when no sample has a Qualimap report:
    # listing a missing directory would raise, so bail out like the other
    # failure paths above.
    if not isdir(plots_dirpath):
        warn('Warning: no Qualimap multi-sample plots are available. TargQC will not contain plots.')
        return None

    targqc_full_report.plots = []
    for plot_fname in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fname)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Exemple #12
0
def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    """Run bwa on a pair of read files and return the path to the BAM.

    ``bwa mem`` output is piped into ``smb view`` and then ``smb sort``.
    With ``dedup`` set, ``smb markdup`` is run on the sorted BAM and its
    output is renamed over the sorted file. The final BAM is indexed with
    ``sambamba.index_bam``.

    :param work_dir: directory holding the BAM and the sort temp directory
    :param sample_name: accepted but not used in the body
    :param l_fpath: first reads file passed to ``bwa mem``
    :param r_fpath: second reads file passed to ``bwa mem``
    :param bwa: bwa executable used in the command line
    :param smb: executable used for the ``view``/``sort``/``markdup`` steps
    :param bwa_prefix: reference index prefix passed to ``bwa mem``
    :param dedup: whether to run the ``markdup`` step
    :param threads: value passed to every ``-t`` option
    :return: path to the BAM file; when ``can_reuse`` accepts the existing
        file it is returned right away, without alignment or indexing
    """
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    # Values substituted into both command templates below.
    substitutions = dict(
        bwa=bwa, smb=smb, threads=threads, bwa_prefix=bwa_prefix,
        l_fpath=l_fpath, r_fpath=r_fpath,
        tmp_dirpath=tmp_dirpath, bam_fpath=bam_fpath)

    bwa_cmdline = ('{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | ' +
                   '{smb} view /dev/stdin -t {threads} -f bam -S -o - | ' +
                   '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}').format(**substitutions)
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(
            dedup_bam_fpath=dedup_bam_fpath, **substitutions)
        run(dedup_cmdl, output_fpath=dedup_bam_fpath, stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        # Replace the sorted BAM with its duplicate-marked version.
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    return bam_fpath
Exemple #13
0
def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    """Run ``qualimap bamqc`` on a BAM file and verify its outputs.

    Nothing is run when every path in ``output_fpaths`` passes ``can_reuse``
    against the inputs. If Qualimap fails reporting that the alignment file
    is unsorted, the BAM is sorted and the command is re-run once on the
    sorted file.

    :param work_dir: working directory; used for the sorted BAM on retry
    :param output_dir: directory passed to ``-outdir`` (created if missing)
    :param output_fpaths: result files expected from Qualimap
    :param bam_fpath: BAM file to analyse
    :param genome: genome build name; names starting with ``hg``/``GRCh``
        add ``-gd HUMAN``, names starting with ``mm`` add ``-gd MOUSE``
    :param bed_fpath: optional regions file passed to ``-gff``
    :param threads: value passed to ``-nt``
    :return: ``output_dir``
    """
    info('Analysing ' + bam_fpath)

    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() + ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
        '-bam {bam_fpath} -outdir {output_dir}')

    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'

    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)

    cmdline = cmdline.format(threads=threads, mem_cmdl=mem_cmdl, bam_fpath=bam_fpath,
                             output_dir=output_dir, bed_fpath=bed_fpath)

    # Files the results are compared against for reuse and verification.
    cmp_fpaths = [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]

    if not all(can_reuse(fp, cmp_fpaths) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            # The captured output may be missing (None) or bytes; make it
            # text so that the substring test below cannot raise.
            captured = e.output or ''
            if isinstance(captured, bytes):
                captured = captured.decode('utf-8', 'replace')
            if 'The alignment file is unsorted.' in captured:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                # sort_bam needs the working directory for the sorted file.
                sorted_bam_fpath = sort_bam(bam_fpath, work_dir)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))
            # Other failures surface through the verification below.

    if not all(verify_file(fp, cmp_f=cmp_fpaths) for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir
Exemple #14
0
def run_multisample_qualimap(output_dir, work_dir, samples,
                             targqc_full_report):
    """Build Qualimap multi-sample plots and register them in the report.

    1. Generates Qualimap2 plots and puts them into ``<output_dir>/plots``.
       This step is skipped when that directory exists and every non-hidden
       file in it passes ``can_reuse`` against the per-sample HTML reports.
    2. Stores the ``.png`` plot paths, relative to ``output_dir``, in
       ``targqc_full_report.plots``.

    :param output_dir: directory that receives the ``plots`` folder
    :param work_dir: directory where ``qualimap_multi_bamqc`` is created
    :param samples: objects exposing ``name`` and ``qualimap_html_fpath``
    :param targqc_full_report: object whose ``plots`` attribute is replaced
    :return: ``None``; when no plots are available the function warns and
        returns early, leaving ``targqc_full_report.plots`` untouched
    """
    plots_dirpath = join(output_dir, 'plots')
    individual_report_fpaths = [s.qualimap_html_fpath for s in samples]
    if isdir(plots_dirpath) and not any(
            not can_reuse(join(plots_dirpath, f), individual_report_fpaths)
            for f in listdir(plots_dirpath) if not f.startswith('.')):
        debug('Qualimap miltisample plots exist - ' + plots_dirpath +
              ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        reports_available = len(
            [s.qualimap_html_fpath
             for s in samples if s.qualimap_html_fpath]) > 0
        if reports_available:
            if find_executable() is None:
                warn(
                    'Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.'
                )
                return None

            qualimap_output_dir = join(work_dir, 'qualimap_multi_bamqc')

            _correct_qualimap_genome_results(samples)
            _correct_qualimap_insert_size_histogram(samples)

            safe_mkdir(qualimap_output_dir)
            rows = []
            for sample in samples:
                if sample.qualimap_html_fpath:
                    rows.append([sample.name, sample.qualimap_html_fpath])

            data_fpath = write_tsv_rows(
                ([], rows),
                join(qualimap_output_dir,
                     'qualimap_results_by_sample.tsv'))
            qualimap_plots_dirpath = join(qualimap_output_dir,
                                          'images_multisampleBamQcReport')
            cmdline = find_executable(
            ) + ' multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                data_fpath=data_fpath,
                qualimap_output_dir=qualimap_output_dir)
            run(cmdline,
                env_vars=dict(DISPLAY=None),
                checks=[lambda _1, _2: verify_dir(qualimap_output_dir)],
                reuse=cfg.reuse_intermediate)

            if not verify_dir(qualimap_plots_dirpath):
                warn(
                    'Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.'
                )
                return None
            if exists(plots_dirpath):
                shutil.rmtree(plots_dirpath)
            shutil.move(qualimap_plots_dirpath, plots_dirpath)

    # With no per-sample reports and no existing plots directory there is
    # nothing to list; listdir would raise on the missing directory, so
    # return the same way the other failure paths do.
    if not isdir(plots_dirpath):
        warn(
            'Warning: no Qualimap multi-sample plots are available. TargQC will not contain plots.'
        )
        return None

    targqc_full_report.plots = []
    for plot_fname in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fname)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Exemple #15
0
def call_sambamba(cmdl, bam_fpath, output_fpath=None, command_name='', no_index=False):
    """Run a sambamba command line, indexing the BAM first unless told not to.

    :param cmdl: arguments appended after the sambamba executable
    :param bam_fpath: BAM file passed to ``index_bam`` before the run
    :param output_fpath: forwarded to ``run`` as ``output_fpath``
    :param command_name: accepted but not used in the body
    :param no_index: when truthy the ``index_bam`` call is skipped
    :return: ``output_fpath`` as given
    """
    index_needed = not no_index
    if index_needed:
        index_bam(bam_fpath)

    full_cmdline = get_executable() + ' ' + cmdl
    run(full_cmdline, output_fpath=output_fpath)
    return output_fpath
Exemple #16
0
def index_bam(bam_fpath, sambamba=None, samtools=None):
    """Index a BAM file by running ``<sambamba> index <bam_fpath>``.

    The expected index path is ``bam_fpath + '.bai'``; the command is only
    run when ``can_reuse`` rejects that path for ``bam_fpath``.

    :param bam_fpath: BAM file to index
    :param sambamba: executable to use; ``get_executable()`` when falsy
    :param samtools: not referenced in the lines visible here
    """
    sambamba = sambamba or get_executable()
    indexed_bam = bam_fpath + '.bai'
    if not can_reuse(indexed_bam, cmp_f=bam_fpath, silent=True):
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        # NOTE(review): `res` is not used in the visible lines - confirm whether the function continues past this point.
        res = run(cmdline, output_fpath=indexed_bam, stdout_to_outputfile=False, stdout_tx=False)