Beispiel #1
0
def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    """
    Compress a file with bgzip and index the compressed file with tabix.

    :param fpath: path of the file to compress; the output path is ``fpath + '.gz'``
    :param reuse: if true, an already existing compressed file and index are returned
                  untouched, provided neither has an older ``getctime`` than the file
                  it was produced from (a missing ``fpath`` counts as up to date)
    :param tabix_parameters: extra command line options inserted into the tabix call
    :param kwargs: accepted for call compatibility; not used
    :return: path of the compressed file, or ``fpath`` unchanged when bgzip or tabix
             could not be located
    """
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and file_exists(gzipped_fpath) and file_exists(tbi_fpath):
        gz_is_fresh = (not file_exists(fpath)
                       or getctime(gzipped_fpath) >= getctime(fpath))
        tbi_is_fresh = getctime(tbi_fpath) >= getctime(gzipped_fpath)
        if gz_is_fresh and tbi_is_fresh:
            info('Actual compressed file and index exist, reusing')
            return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    # Both tools are needed. Bail out if either one is missing, before any existing
    # output is deleted and before a command line starting with "None" is built.
    if not bgzip or not tabix:
        return fpath

    # Remove stale outputs so that the tools do not trip over existing files.
    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{} {}'.format(bgzip, fpath)
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{} {} {}'.format(tabix, tabix_parameters, gzipped_fpath)
    call_process.run(cmdline)

    return gzipped_fpath
Beispiel #2
0
def bam_to_bed(bam_fpath, to_gzip=True):
    """
    Convert a BAM file to BED with ``bedtools bamtobed``.

    :param bam_fpath: path of the input BAM file
    :param to_gzip: if true, the output is piped through gzip and gets a ``.bed.gz``
                    extension; otherwise the extension is ``.bed``
    :return: path of the BED file (reused when ``can_reuse`` accepts the existing one)
    """
    # Approach from: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    debug('Converting the BAM to BED to save some memory.')

    if to_gzip:
        bed_ext = '.bed.gz'
    else:
        bed_ext = '.bed'
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + bed_ext
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath

    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{} bamtobed -i {}'.format(bedtools, bam_fpath)
    if to_gzip:
        cmdline += ' | {}'.format(gzip)
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
Beispiel #3
0
def _get(relative_path, genome=None):
    """
    Resolve a file located relative to this module, loading BED files as BedTool objects.

    :param relative_path: path of the file relative to this module's directory; formatted
                          with ``genome=<name>`` when ``genome`` is given
    :param genome: genome name. Can contain a chromosome name after a dash, like hg19-chr20;
                   in case of BED, the returned BedTool will have a chromosome filter added.
    :return: BedTool object if it's a BED file (``.bed`` or ``.bed.gz``), or the file path.
             When the plain path is not a file but ``path + '.gz'`` is, the latter is used.
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            # Split on the last '-chr' only, so that a genome name that itself contains
            # dashes does not break the unpacking.
            genome, _, chrom_suffix = genome.rpartition('-chr')
            chrom = 'chr' + chrom_suffix
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    is_compressed_bed = path.endswith('.bed.gz')
    if not (is_compressed_bed or path.endswith('.bed')):
        return path

    if is_compressed_bed:
        # bedtools presence is checked only for compressed BED files.
        if not which('bedtools'):
            critical('bedtools not found in PATH: ' + str(os.environ.get('PATH', '')))
        debug('BED is compressed, creating BedTool')
    else:
        debug('BED is uncompressed, creating BedTool')
    bed = BedTool(path)

    if chrom:
        debug('Filtering BEDTool for chrom ' + chrom)
        bed = bed.filter(lambda r: r.chrom == chrom)
    return bed
Beispiel #4
0
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    """
    Pad the regions of a BED file on both sides using ``bedtools slop``.

    :param work_dir: directory passed to ``intermediate_fname`` for the output file
    :param bed: input BED, substituted into the command line after ``-i``
    :param padding: value passed to bedtools as ``-b``
    :param fai_fpath: genome file passed to bedtools as ``-g``
    :return: path of the padded BED file
    """
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    slop_cmd = '{} slop -i {} -g {} -b {}'.format(bedtools, bed, fai_fpath, padding)
    padded_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(slop_cmd, output_fpath=padded_fpath)
    return padded_fpath
Beispiel #5
0
def intersect_bed(work_dir, bed1, bed2):
    """
    Run ``bedtools intersect -u -a bed1 -b bed2`` and write the result into work_dir.

    The output is named ``<bed1 base name>__<bed2 base name>.bed`` and is reused
    when ``can_reuse`` accepts it against both inputs.

    :param work_dir: directory for the output file
    :param bed1: path passed to bedtools as ``-a``
    :param bed2: path passed to bedtools as ``-b``
    :return: path of the intersected BED file
    """
    first_stem = splitext_plus(basename(bed1))[0]
    second_stem = splitext_plus(basename(bed2))[0]
    result_fpath = join(work_dir, first_stem + '__' + second_stem + '.bed')
    if not can_reuse(result_fpath, [bed1, bed2]):
        bedtools = which('bedtools')
        intersect_cmd = '{} intersect -u -a {} -b {}'.format(bedtools, bed1, bed2)
        call_process.run(
            intersect_cmd,
            output_fpath=result_fpath,
            checks=[call_process.file_exists_check],
        )
    return result_fpath
Beispiel #6
0
def annotate_target(work_dir, target_bed, genome_build):
    """
    Annotate a target BED file by calling the ``annotate_bed.py`` script.

    :param work_dir: directory used for the intermediate and the cleaned output
    :param target_bed: BED file to annotate
    :param genome_build: value passed to the script as ``-g``
    :return: the existing annotated file when ``can_reuse`` accepts it, otherwise the
             result of ``clean_bed`` applied to the freshly annotated file
    """
    annotated_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(annotated_fpath, target_bed):
        info(annotated_fpath + ' exists, reusing')
        return annotated_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    # The script writes the file itself through -o, so stdout is not redirected into it.
    cmdline = '{} {} -g {} -o {}'.format(
        annotate_bed_py, target_bed, genome_build, annotated_fpath)
    run(cmdline, annotated_fpath, stdout_to_outputfile=False)
    return clean_bed(annotated_fpath, work_dir)
Beispiel #7
0
def proc_fastq(samples,
               parall_view,
               work_dir,
               bwa_prefix,
               downsample_to,
               num_pairs_by_sample=None,
               dedup=True):
    """
    Optionally downsample the FastQ pairs of the samples, then align them into BAM files.

    The samples are updated in place: ``l_fpath``/``r_fpath`` are pointed at the
    downsampled FastQ files (only when ``downsample_to`` is truthy) and ``bam`` at the
    BAM file. At every stage, outputs accepted by ``can_reuse`` are reused.

    :param samples: sample objects; ``name``, ``l_fpath`` and ``r_fpath`` are read
    :param parall_view: object whose ``run(fn, list_of_argument_lists)`` executes the jobs
                        and whose ``cores_per_job`` is handed to ``align``
    :param work_dir: base directory; per-sample paths are ``join(work_dir, s.name)``
    :param bwa_prefix: passed through to ``align``
    :param downsample_to: falsy to skip downsampling; a float is reported as a fraction
                          of reads, any other value as a number of read pairs
    :param num_pairs_by_sample: optional dict of already known read pair counts by sample name
    :param dedup: passed through to ``align``
    :return: dict of read pair counts by sample name (the given or an empty dict when
             no counting took place)
    """
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(
                can_reuse(make_pair_counts_fpath(join(work_dir, s.name)),
                          s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = dict()
            for s in samples:
                counts_fpath = make_pair_counts_fpath(join(work_dir, s.name))
                # Context manager so that the file handle is closed right away.
                with open(counts_fpath) as counts_file:
                    num_pairs_by_sample[s.name] = int(
                        counts_file.read().strip())
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(
                count_read_pairs,
                [[s.name,
                  safe_mkdir(join(work_dir, s.name)), s.l_fpath]
                 for s in samples])
            num_pairs_by_sample = {
                s.name: pairs_count
                for s, pairs_count in zip(samples, num_pairs)
            }

        # Downsampling
        debug()
        if all(
                can_reuse(
                    make_downsampled_fpath(join(work_dir, s.name), s.l_fpath),
                    s.l_fpath) and can_reuse(
                        make_downsampled_fpath(join(work_dir, s.name),
                                               s.r_fpath), s.r_fpath)
                for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name),
                                                   s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) +
                     ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) +
                     ' read pairs')
            fastq_pairs = parall_view.run(downsample, [[
                join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath,
                downsample_to,
                num_pairs_by_sample.get(s.name)
            ] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(
            can_reuse(make_bam_fpath(join(work_dir, s.name)),
                      [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        # Only test the path when one was found: isfile(None) and str + None would
        # raise TypeError before the missing-tool report below could be issued.
        if bwa and not isfile(bwa):
            critical('BWA not found under ' + bwa)
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align, [[
            join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb,
            bwa_prefix, dedup, parall_view.cores_per_job
        ] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        # NOTE(review): the comprehension above keeps the list length, so this only
        # catches a short result list from parall_view.run - confirm how verify_bam
        # reports a bad BAM.
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Beispiel #8
0
def find_executable():
    """
    Look up the ``qualimap`` executable with ``which``.

    :return: whatever ``which('qualimap')`` returned; ``critical`` is called first
             when that value is falsy
    """
    qualimap_path = which('qualimap')
    if qualimap_path:
        return qualimap_path
    critical('Error: "qualimap" executable is not found in PATH')
    return qualimap_path
Beispiel #9
0
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    """
    Optionally downsample the FastQ pairs of the samples, then align them into BAM files.

    The samples are updated in place: ``l_fpath``/``r_fpath`` are pointed at the
    downsampled FastQ files (only when ``downsample_to`` is truthy) and ``bam`` at the
    BAM file. At every stage, outputs accepted by ``can_reuse`` are reused.

    :param samples: sample objects; ``name``, ``l_fpath`` and ``r_fpath`` are read
    :param parall_view: object whose ``run(fn, list_of_argument_lists)`` executes the jobs
                        and whose ``cores_per_job`` is handed to ``align``
    :param work_dir: base directory; per-sample paths are ``join(work_dir, s.name)``
    :param bwa_prefix: passed through to ``align``
    :param downsample_to: falsy to skip downsampling; a float is reported as a fraction
                          of reads, any other value as a number of read pairs
    :param num_pairs_by_sample: optional dict of already known read pair counts by sample name
    :param dedup: passed through to ``align``
    :return: dict of read pair counts by sample name (the given or an empty dict when
             no counting took place)
    """
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = dict()
            for s in samples:
                # Context manager so that the file handle is closed right away.
                with open(make_pair_counts_fpath(join(work_dir, s.name))) as counts_file:
                    num_pairs_by_sample[s.name] = int(counts_file.read().strip())
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs, [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath) for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to, num_pairs_by_sample.get(s.name)]
                 for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAM exists, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        # Only test the path when one was found: isfile(None) and str + None would
        # raise TypeError before the missing-tool report below could be issued.
        if bwa and not isfile(bwa):
            critical('BWA not found under ' + bwa)
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix, dedup, parall_view.cores_per_job]
             for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        # NOTE(review): the comprehension above keeps the list length, so this only
        # catches a short result list from parall_view.run - confirm how verify_bam
        # reports a bad BAM.
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
Beispiel #10
0
def find_executable():
    """
    Look up the ``qualimap`` executable with ``which``.

    :return: whatever ``which('qualimap')`` returned; ``critical`` is called first
             when that value is falsy
    """
    qualimap_path = which('qualimap')
    if qualimap_path:
        return qualimap_path
    critical('Error: "qualimap" executable is not found in PATH')
    return qualimap_path
Beispiel #11
0
def get_executable():
    """
    Look up the ``sambamba`` executable with ``which``.

    :return: whatever ``which('sambamba')`` returned; ``critical`` is called first
             when that value is falsy
    """
    sambamba_path = which('sambamba')
    if sambamba_path:
        return sambamba_path
    critical('Error: sambamba executable is not found')
    return sambamba_path