Code Example #1
def extract_variant_from_bams(cnf, out_dirpath, transcripts, chr_length, samples, chrom, variant, bams_created_before):
    padding = 500
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    pos, ref, alt, variant_transcripts = variant['pos'], variant['ref'], variant['alt'], variant['transcripts']
    bam_prefix = None
    # Walk transcripts in a deterministic order and name the BAM slice after
    # the first transcript whose exon contains the variant position
    for transcript_name in sorted(variant_transcripts):
        transcript_exons = transcripts[(transcript_name, chrom)]
        for idx, exon in enumerate(transcript_exons):
            if exon['start'] <= pos <= exon['stop']:
                start, end = exon['start'], exon['stop']
                bam_prefix = '{chrom}-{transcript_name}-{idx}-'.format(**locals())
        if bam_prefix:
            break
    if not bam_prefix:
        start, end = max(1, pos - padding), min(chr_length, pos + padding)
        ref_ = ref[:20]
        alt_ = alt[:20]
        bam_prefix = '{chrom}-{pos}-{ref_}-{alt_}-'.format(**locals())
    bams_by_sample = dict()
    for sample in samples:
        sample_name = sample.name.replace('-', '_')
        output_bam_fpath = join(out_dirpath, bam_prefix + '{sample_name}.bam'.format(**locals()))
        if output_bam_fpath in bams_created_before:
            continue
        if cnf.reuse_intermediate and verify_file(output_bam_fpath, silent=True):
            bams_by_sample[sample.name] = output_bam_fpath
        else:
            cmdline = '{sambamba} slice {sample.bam} {chrom}:{start}-{end} -o {output_bam_fpath}'.format(**locals())
            call(cnf, cmdline, silent=not cnf.verbose)
            if verify_file(output_bam_fpath, silent=True):
                cmdline = '{sambamba} index {output_bam_fpath}'.format(**locals())
                call(cnf, cmdline, silent=not cnf.verbose)
                bams_by_sample[sample.name] = output_bam_fpath
    return bams_by_sample
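
A minimal sketch of the inputs this function expects, inferred from the dictionary and attribute accesses above; the concrete values are hypothetical:

# variant: one record with position, alleles and transcript names
variant = {'pos': 55242465, 'ref': 'G', 'alt': 'T', 'transcripts': ['NM_005228']}
# transcripts: exon lists keyed by (transcript_name, chrom)
transcripts = {('NM_005228', 'chr7'): [{'start': 55241614, 'stop': 55242525}]}
# samples: objects with .name and .bam; bams_created_before: set of output
# BAM paths from earlier calls, used to skip re-slicing the same region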
Code Example #2
def __final_seq2c_scripts(cnf, read_stats_fpath, combined_gene_depths_fpath, output_fpath):
    cov2lr = get_script_cmdline(cnf, 'perl', join('Seq2C', 'cov2lr.pl'), is_critical=True)
    cov2lr_output = join(cnf.work_dir, splitext(basename(output_fpath))[0] + '.cov2lr.tsv')

    controls = ''
    lr2gene_opt = ''
    if cnf.controls:
        controls = '-c ' + cnf.controls  # ':'.join([adjust_path(fpath) for fpath in cnf.controls.split(':')])
        lr2gene_opt = '-c'

    cmdline = '{cov2lr} -a {controls} {read_stats_fpath} {combined_gene_depths_fpath}'.format(**locals())
    call(cnf, cmdline, cov2lr_output, exit_on_error=False)
    info()

    if not verify_file(cov2lr_output):
        return None

    seq2c_opts = cnf.seq2c_opts or ''

    lr2gene = get_script_cmdline(cnf, 'perl', join('Seq2C', 'lr2gene.pl'), is_critical=True)
    cmdline = '{lr2gene} {lr2gene_opt} {seq2c_opts} {cov2lr_output}'.format(**locals())
    res = call(cnf, cmdline, output_fpath, exit_on_error=False)
    info()

    if not verify_file(output_fpath):
        return None

    return res
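
The two Perl stages split the work: cov2lr.pl normalizes coverage into log ratios (optionally against control samples, hence the -c options), and lr2gene.pl calls gene-level copy-number changes from those ratios; keeping the intermediate .cov2lr.tsv in cnf.work_dir lets the second stage be retried without redoing normalization. A minimal usage sketch, assuming the read stats and combined gene depths were produced by earlier Seq2C steps (file names here are hypothetical):

# Hypothetical invocation of the final Seq2C stage:
seq2c_tsv = __final_seq2c_scripts(cnf, 'read_stats.txt', 'gene_depths.txt',
                                  join(cnf.output_dir, 'seq2c.tsv'))
if seq2c_tsv is None:
    err('Seq2C failed to produce output')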
Code Example #3
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return target_bed  # no annotation source configured; return the input as-is
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = remove_comments(cnf, output_fpath)

    return output_fpath
Code Example #4
def submit_job(cnf, cmdline, job_name, wait_for_steps=None, threads=1,
               output_fpath=None, stdout_to_outputfile=True, run_on_chara=False, **kwargs):

    prefix = str(cnf.project_name) + '_'
    if job_name: prefix += job_name + '_'
    prefix += datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '_'
    f, done_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.done')
    f, error_marker_fpath = make_tmpfile(cnf, prefix=prefix, suffix='.error')
    if isfile(done_marker_fpath): os.remove(done_marker_fpath)
    if isfile(error_marker_fpath): os.remove(error_marker_fpath)
    job_id = basename(splitext(done_marker_fpath)[0])

    tx_output_fpath = None
    if output_fpath:
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            j = JobRunning(None, None, None, None, None, output_fpath=output_fpath, **kwargs)
            j.is_done = True
            return j
        if stdout_to_outputfile:
            tx_output_fpath = output_fpath + '.tx'
            if isfile(tx_output_fpath):
                os.remove(tx_output_fpath)
            cmdline += ' > ' + tx_output_fpath
        else:
            if isfile(output_fpath):
                os.remove(output_fpath)

    qsub = get_system_path(cnf, 'qsub', is_critical=True)
    bash = get_system_path(cnf, 'bash', is_critical=True)

    if cnf.log_dir:
        err_fpath = log_fpath = join(cnf.log_dir, job_id + '.log')
    else:
        fd, fpath = make_tmpfile(cnf, suffix=job_id + '.log', text=True)
        err_fpath = log_fpath = fpath

    queue = cnf.queue
    runner_script = adjust_system_path(cnf.qsub_runner)
    verify_file(runner_script, is_critical=True, description='qsub_runner')
    hold_jid_line = '-hold_jid ' + ','.join(wait_for_steps or ['_'])
    mem = threads * 15
    priority = 0
    if cnf.qsub_priority:
        priority = cnf.qsub_priority
    extra_qsub_opts = ''
    if run_on_chara and is_us():
        extra_qsub_opts += '-l h="chara|rask"'
    cmdline = cmdline.replace('"', '\\"').replace('\\\\"', '\\"')
    qsub_cmdline = (
        '{qsub} -pe smp {threads} {extra_qsub_opts} -S {bash} -q {queue} -p {priority} '
        '-j n -o {log_fpath} -e {err_fpath} {hold_jid_line} '
        '-N {job_id} {runner_script} {done_marker_fpath} {error_marker_fpath} "{cmdline}"'.format(**locals()))
    info('Submitting job ' + job_id)
    info(qsub_cmdline)
    job = JobRunning(job_id, log_fpath, qsub_cmdline, done_marker_fpath, error_marker_fpath,
                     output_fpath=output_fpath, tx_output_fpath=tx_output_fpath,
                     stdout_to_outputfile=stdout_to_outputfile, **kwargs)
    call(cnf, qsub_cmdline, silent=True)
    return job
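
A minimal sketch of the submit-and-wait pattern this function supports (the command line is hypothetical; wait_for_jobs and j.is_done are used the same way elsewhere in this codebase):

# Hypothetical example: index several BAMs on the grid, then block until done.
jobs = []
for sample in samples:
    j = submit_job(cnf, 'samtools index ' + sample.bam,
                   job_name='index_' + sample.name)
    if not j.is_done:  # reused outputs come back already marked done
        jobs.append(j)
wait_for_jobs(cnf, jobs)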
Code Example #5
def _intersect_with_tricky_regions(cnf, selected_bed_fpath, sample):
    info()
    info('Detecting problematic regions for ' + sample)

    bed_filenames = [fn + '.bed.gz' for fn in tricky_regions_fnames_d.keys()]

    merged_bed_fpaths = [
        join(cnf.genome.tricky_regions, 'merged', bed_filename)
        for bed_filename in bed_filenames
    ]

    info('Intersecting BED ' + selected_bed_fpath +
         ' using BED files with tricky regions')

    intersection_fpath = join(
        cnf.work_dir,
        splitext_plus(basename(selected_bed_fpath))[0] +
        '_tricky_vcf_bed.intersect')
    if not cnf.reuse_intermediate or not verify_file(
            intersection_fpath, silent=True, is_critical=False):
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = bedtools + ' intersect -header -a ' + selected_bed_fpath + ' -b ' + ' '.join(
            merged_bed_fpaths) + ' -wo -filenames'
        call(cnf,
             cmdline,
             output_fpath=intersection_fpath,
             exit_on_error=False)

    return intersection_fpath
Code Example #6
def _get_depth_for_each_variant(cnf, var_by_site, clipped_gz_vcf_fpath,
                                bed_fpath, bam_fpath):
    # http://www.1000genomes.org/faq/what-depth-coverage-your-phase1-variants
    # bedtools intersect -a oncomine.vcf -b Exons.az_key.bed -header > oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/bgzip oncomine.az_key.vcf
    # /opt/az/local/tabix/tabix-0.2.6/tabix -h -p vcf oncomine.az_key.vcf.gz
    # samtools view -b TRF004223.sorted.bam -L Exons.az_key.bed | bedtools genomecov -ibam stdin -bg > coverage.bg
    # bedtools intersect -a oncomine.az_key.vcf.gz -b coverage.bg -wa | cut -f1,2,4,5,8,11,12,13,14 > oncomine.az_key.depth_numbers.vcf

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bedtools = get_system_path(cnf, 'bedtools')

    info()
    info('Depth of coverage for regions in BED ' + bed_fpath)
    cov_bg = join(cnf.work_dir, 'coverage.bg')
    cmdline = '{sambamba} view -f bam -t {cnf.threads} -L {bed_fpath} {bam_fpath} | {bedtools} genomecov -ibam stdin -bg'.format(
        **locals())
    call(cnf, cmdline, output_fpath=cov_bg, exit_on_error=False)

    info()
    info('Intersecting depth regions with VCF ' + clipped_gz_vcf_fpath)
    vcf_depth_numbers_fpath = join(cnf.work_dir, 'vcf_bg.intersect')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_depth_numbers_fpath, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -a {clipped_gz_vcf_fpath} -b {cov_bg} -wao'.format(
            **locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vcf_depth_numbers_fpath,
                   exit_on_error=False)

    depths_per_var = defaultdict(list)
    with open(vcf_depth_numbers_fpath) as f:
        for l in f:
            # 1,2,4,5,8,11,12,13,14,15,16,17,18,19,20
            # c,p,r,a,f,ch,st,en,ge,ex,st,ft,bt,de,ov
            fs = l.replace('\n', '').split('\t')
            chrom, pos, _, ref, alt = fs[:5]
            depth, overlap = fs[-2:]
            var = var_by_site.get((chrom, pos, ref, alt))
            if var and depth != '.':
                depth, overlap = int(depth), int(overlap)
                for i in range(overlap):
                    depths_per_var[(chrom, pos, ref, alt)].append(depth)

    # Getting average depth of coverage of each variant (exactly for those parts that were in BED)
    depth_by_var = {
        var: (float(sum(depths)) / len(depths)) if len(depths) != 0 else None
        for var, depths in depths_per_var.iteritems()
    }

    return depth_by_var
Code Example #7
def add_project_to_exac(cnf):
    info('Adding project to ExAC database')
    exac_venv_pythonpath = join(exac_venv_dir, 'bin', 'python')
    if is_local():
        exac_venv_pythonpath = 'python'
    cmdline = exac_venv_pythonpath + ' ' + join(exac_code_dir, 'manage.py') + ' ' + 'add_project' + \
              ' ' + cnf.project_name + ' ' + cnf.genome.name
    call(cnf, cmdline)
Code Example #8
def get_padded_bed_file(cnf, bed, genome, padding):
    info('Making bed file for padded regions...')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome} -b {padding}'.format(
        **locals())
    output_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, cmdline, output_fpath)
    return output_fpath
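
A short usage sketch; note that the genome argument is the chromosome-lengths file that bedtools slop expects, not a genome name (the BED path and 10 bp padding are hypothetical):

# Hypothetical call: widen each target region by 10 bp on both sides.
chr_len_fpath = get_chr_len_fpath(cnf)
padded_bed = get_padded_bed_file(cnf, 'targets.bed', chr_len_fpath, 10)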
Code Example #9
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)

    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [
            join(output_dirpath, sample.name, chrom + '.txt.gz')
            for sample in samples
        ]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []

        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(
                cnf.work_dir,
                basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(
                **locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths,
                                   sample_names, output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
Code Example #10
def intersect_bed(cnf, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(cnf['work_dir'],
                        bed1_fname + '__' + bed2_fname + '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call(cnf, cmdline, output_fpath, verify_output_not_empty=False)
    return output_fpath
Code Example #11
def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath,
                                   bcbio_structure.project_name)

    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext)) for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig'])\
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name, jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue
        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample[
                sample.name] if sample.name in vcf_fpath_by_sample else None
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome,
                                                  bcbio_structure.project_name,
                                                  sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    # tabix was undefined here; resolve it the same way other
                    # tools are resolved in this codebase
                    tabix = get_system_path(cnf, 'tabix')
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                       sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome,
                                              bcbio_structure.project_name,
                                              sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                   sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(
                genome, bcbio_structure.project_name, sample.name,
                splitext(sample.bam)[0] + '.bigwig')
            print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                     trunc_symlink(bam_link),
                                     trunc_symlink(bigwig_link),
                                     trunc_symlink(vcf_link),
                                     jbrowse_tracks_fpath)
Code Example #12
def vcf_one_per_line(cnf, vcf_fpath):
    info('Converting VCF to one-effect-per-line...')

    oneperline_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    vcfoneperline_cmline = get_script_cmdline(cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, vcfoneperline_cmline, oneperline_vcf_fpath, stdin_fpath=vcf_fpath, exit_on_error=False)
    info()

    if not verify_file(oneperline_vcf_fpath):
        critical('Error: vcf_one_per_line didn\'t generate output file.')
    return oneperline_vcf_fpath
Code Example #13
def convert_to_bigwig(bedgraph_fpath, cnf, chr_len_fpath, bw_fpath):
    try:
        with file_transaction(cnf.work_dir, bw_fpath) as tx_fpath:
            cmdl = get_system_path(cnf,
                                   join(get_ext_tools_dirname(),
                                        'bedGraphToBigWig'),
                                   is_critical=True)
            cmdl += ' ' + bedgraph_fpath + ' ' + chr_len_fpath + ' ' + tx_fpath
            call(cnf, cmdl, exit_on_error=True)
    finally:
        os.remove(bedgraph_fpath)
    return bw_fpath
Code Example #14
def total_merge_bed(cnf, bed_fpath):
    output_fpath = intermediate_fname(cnf, bed_fpath, 'total_merged')
    bedops = get_system_path(cnf, 'bedops')
    if bedops:
        cmdline = '{bedops} --merge {bed_fpath}'.format(**locals())
    else:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = '{bedtools} merge -i {bed_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath)
    return output_fpath
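
Both branches flatten overlapping intervals into disjoint regions; bedops is tried first, with bedtools merge as the fallback. Note that both tools assume coordinate-sorted BED input, so sort the file beforehand if its origin is unknown.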
Code Example #15
def group_and_merge_regions_by_gene(cnf, bed_fpath, keep_genes=False):
    output_fpath = intermediate_fname(cnf, bed_fpath, 'grp_mrg')

    group_merge_bed_py = get_system_path(
        cnf, 'python',
        join('tools', 'bed_processing', 'group_and_merge_by_gene.py'))

    cmdline = '{group_merge_bed_py} {bed_fpath}'.format(**locals())
    if not keep_genes:
        cmdline += ' | grep -vw Gene'

    call(cnf, cmdline, output_fpath)

    return output_fpath
Code Example #16
def main():
    cnf = read_opts_and_cnfs(extra_opts=[
        (['--bam'], dict(dest='bam', help='path to the BAM file')),
        (['--bed', '--capture',
          '--amplicons'], dict(dest='bed', help='capture panel/amplicons')),
        (['--pcr'],
         dict(
             dest='pcr',
             action='store_true',
             help='deduplication was not performed, thus do not try to dedup')),
    ],
                             required_keys=['bam'],
                             file_keys=['bam', 'bed'],
                             key_for_sample_name='bam',
                             proc_name=BCBioStructure.qualimap_name)

    index_bam(cnf, cnf.bam)
    info('Using alignment ' + cnf.bam)

    bed = ''
    if cnf.bed:
        bed = ' -gff ' + cnf.bed + ' '
        info('Using amplicons/capture panel ' + cnf.bed)

    qualimap = get_system_path(cnf, 'qualimap', is_critical=True)

    info()

    mem_m = get_qualimap_max_mem(cnf.bam)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = ' --java-mem-size=' + mem

    cmdline = (
        '{qualimap} bamqc --skip-duplicated -nt ' + str(cnf.threads) +
        mem_cmdl + ' -nr 5000 '
        '-bam {cnf.bam} -outdir {cnf.output_dir} {bed} -c -gd HUMAN').format(
            **locals())
    report_fpath = join(cnf.output_dir, 'qualimapReport.html')

    call(cnf,
         cmdline,
         output_fpath=report_fpath,
         stdout_to_outputfile=False,
         env_vars=dict(DISPLAY=None))

    info('Qualimap report: ' + str(report_fpath))
Code Example #17
def bam_to_bed(cnf, bam_fpath, to_gzip=True):
    info(
        'Converting the BAM to BED to save some memory.'
    )  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz'
                                                   if to_gzip else '.bed')
    bedtools = get_system_path(cnf, 'bedtools')
    gzip = get_system_path(cnf, 'gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call(cnf,
         cmdline,
         output_fpath=bam_bed_fpath,
         verify_output_not_empty=False)
    return bam_bed_fpath
Code Example #18
def call_sambamba(cnf,
                  cmdl,
                  bam_fpath,
                  output_fpath=None,
                  sambamba=None,
                  use_grid=False,
                  command_name='',
                  sample_name=None,
                  silent=False,
                  stdout_to_outputfile=True):
    sambamba = sambamba or get_system_path(
        cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    sample_name = sample_name or basename(bam_fpath).split('.')[0]
    if use_grid:
        grid_sambamba = get_script_cmdline(
            cnf, 'python', join('tools', 'bed_processing', 'sambamba.py'))
        # Protect embedded quotes so they survive qsub shell quoting
        cmdl = cmdl.replace(' "', ' \'\"__QUOTE__')
        cmdl = cmdl.replace('" ', '__QUOTE__\"\' ')
        grid_cmdl = grid_sambamba + ' ' + bam_fpath + ' ' + sambamba + ' ' + cmdl
        job_name = command_name + '_' + sample_name
        j = submit_job(cnf,
                       grid_cmdl,
                       job_name=job_name,
                       output_fpath=output_fpath,
                       stdout_to_outputfile=stdout_to_outputfile)
        info()
        return j
    else:
        index_bam(cnf, bam_fpath, sambamba=sambamba)
        cmdl = sambamba + ' ' + cmdl
        stderr_dump = []
        res = call(cnf,
                   cmdl,
                   output_fpath=output_fpath,
                   exit_on_error=False,
                   stderr_dump=stderr_dump,
                   stdout_to_outputfile=stdout_to_outputfile,
                   silent=silent,
                   print_stderr=not silent)
        if not res:
            for l in stderr_dump:
                if 'sambamba-view: BAM index file (.bai) must be provided' in l:
                    if isfile(bam_fpath + '.bai'):
                        info('Removing .bai and re-indexing...')
                        os.remove(bam_fpath + '.bai')
                    index_bam(cnf, bam_fpath, sambamba)
                    res = call(cnf, cmdl, output_fpath=output_fpath)
        return res
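
A minimal usage sketch (the view command and paths are hypothetical; sambamba's view subcommand with -c and -L is assumed to behave like samtools'):

# Count reads overlapping a BED file, locally:
res = call_sambamba(cnf, 'view -c -L targets.bed sample.bam', 'sample.bam',
                    output_fpath=join(cnf.work_dir, 'read_count.txt'),
                    command_name='view_count')
# With use_grid=True the same call returns a JobRunning handle to pass to
# wait_for_jobs() instead of blocking.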
Code Example #19
def evaluate_capture(cnf, project_dirpaths):
    cmdline = get_script_cmdline(cnf,
                                 'python',
                                 join('tools', 'evaluate_capture_target.py'),
                                 is_critical=True)
    project_dirpaths = ' '.join(project_dirpaths)
    cmdline += ' --genome {cnf.genome.name} --project-name {cnf.project_name} {project_dirpaths} '.format(
        **locals())
    cmdline += ' --exac-only-filtering --tricky-regions '
    if cnf.bed:
        cmdline += ' --bed ' + cnf.bed

    depth_thresholds = [10, 25, 50, 100]
    for min_depth in depth_thresholds:
        # Build each command from the base cmdline so --min-depth flags
        # do not accumulate across iterations
        depth_cmdl = cmdline + ' --min-depth {min_depth}'.format(**locals())
        call(cnf, depth_cmdl)
Code Example #20
def merge_vcfs(cnf, vcf_fpath_by_sname, combined_vcf_fpath):
    if cnf.reuse_intermediate and isfile(
            combined_vcf_fpath + '.gz') and verify_vcf(combined_vcf_fpath +
                                                       '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    bcftools = get_system_path(cnf, 'bcftools')
    if not bcftools:
        info('bcftools is not found, skipping merging VCFs')
        return None

    cmdl = '{bcftools} merge --force-samples '.format(**locals())
    for sample, vcf_fpath in vcf_fpath_by_sname.iteritems():
        if vcf_fpath:
            cmdl += ' ' + vcf_fpath + ' '
    cmdl += ' -o ' + combined_vcf_fpath

    res = call(cnf,
               cmdl,
               output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                info()
        return bgzip_and_tabix(combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None
Code Example #21
def make_circos_plot(cnf, bcbio_structure):
    circos_fpath = join(bcbio_structure.date_dirpath, 'circos.html')
    mutations_fpaths = get_mutations_fpaths(bcbio_structure)
    if not mutations_fpaths:
        err('File with VarDict results does not exist. Circos plot cannot be created.')
        return
    if not bcbio_structure.seq2c_fpath:
        err('File with Seq2C results does not exist. Circos plot cannot be created.')
        return
    cmdl = 'circos --genome ' + cnf.genome.name + ' -o ' + bcbio_structure.date_dirpath + \
           ' --mutations ' + ','.join(mutations_fpaths) + \
           ' --seq2c ' + bcbio_structure.seq2c_fpath + \
           ' ' + bcbio_structure.bcbio_project_dirpath
    call(cnf, cmdl, exit_on_error=False, silent=False)
    if verify_file(circos_fpath):
        return circos_fpath
Code Example #22
def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir,
                     splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = (
        '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'
    ).format(**locals())
    res = call(cnf,
               cmdline,
               output_fpath=out_bam_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None
Code Example #23
def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    picard = get_system_path(cnf, 'java', 'picard')
    if picard:
        safe_mkdir(dirname(sample.picard_ins_size_hist_txt_fpath))
        safe_mkdir(dirname(sample.picard_ins_size_hist_pdf_fpath))
        info('Picard ins size hist for "' + basename(bam_fpath) + '"')
        cmdline = '{picard} CollectInsertSizeMetrics' \
                  ' I={bam_fpath}' \
                  ' O={sample.picard_ins_size_hist_txt_fpath}' \
                  ' H={sample.picard_ins_size_hist_pdf_fpath}' \
                  ' VALIDATION_STRINGENCY=LENIENT'

        cmdline = cmdline.format(**locals())
        call(cnf,
             cmdline,
             output_fpath=sample.picard_ins_size_hist_txt_fpath,
             stdout_to_outputfile=False,
             exit_on_error=False)
Code Example #24
def index_bam(cnf, bam_fpath, sambamba=None, samtools=None, use_grid=False):
    if use_grid:
        return index_bam_grid(cnf, bam_fpath, sambamba)

    sambamba = sambamba or get_system_path(
        cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    indexed_bam = bam_fpath + '.bai'
    if not isfile(indexed_bam) or getmtime(indexed_bam) < getmtime(bam_fpath):
        info('Indexing BAM, writing ' + indexed_bam + '...')
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        res = call(cnf, cmdline, exit_on_error=False)
        if not isfile(
                indexed_bam) or getmtime(indexed_bam) < getmtime(bam_fpath):
            samtools = samtools or get_system_path(cnf, 'samtools', is_critical=True)
            cmdline = '{samtools} index {bam_fpath}'.format(**locals())
            call(cnf, cmdline)
    else:
        debug('Up-to-date .bai index already exists.')
Code Example #25
def generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath):
    info('Combining %s bams into %s' % (len(bam_fpaths), combined_bam_fpath))
    if cnf.reuse_intermediate and verify_file(combined_bam_fpath, silent=True):
        return combined_bam_fpath

    # sorted_bam_paths = sorted(bam_fpaths, key=lambda bam_path: int(bam_path_to_dict(bam_path)['pos']))

    read_group_ids = map(bam_path_to_read_group_id, bam_fpaths)
    read_groups = [{'ID': read_group_id, 'SM': 0} for read_group_id in read_group_ids]

    out_bam = None
    for bam_fpath in bam_fpaths:
        try:
            ibam = pysam.AlignmentFile(bam_fpath, 'rb')

            if out_bam is None:
                header = {
                    'HD': {'VN': '1.4'},
                    'SQ': ibam.header['SQ'],
                    'RG': read_groups,
                }

                out_bam = pysam.AlignmentFile(temp_combined_bam_fpath, 'wb', header=header)

            # iterate over the reads
            rg_tag = (('RG', bam_path_to_read_group_id(bam_fpath)), )
            for r in ibam:
                r.tags = rg_tag
                out_bam.write(r)

            ibam.close()

        except (IOError, ValueError) as e:
            err('ERROR on file %s: %s' % (bam_fpath, e))
    if out_bam is not None:
        out_bam.close()

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    cmdline = '{sambamba} sort -t {cnf.threads} {temp_combined_bam_fpath} -o {combined_bam_fpath}'.format(**locals())
    call(cnf, cmdline)
    cmdline = '{sambamba} index {combined_bam_fpath}'.format(**locals())
    call(cnf, cmdline)
    info(combined_bam_fpath + ' saved!')
    return combined_bam_fpath
Code Example #26
def igvtools_index(cnf, vcf_fpath):
    igvtools = get_system_path(cnf, 'igvtools')
    if not igvtools:
        err('Warning: no igvtools found, cannot index VCF.')
        return None
    if igvtools.endswith('.jar'):
        igvtools = get_java_tool_cmdline(cnf, 'igvtools')
        if igvtools is None:
            err('Warning: no jar igvtools found, cannot index VCF.')
            return None

    cmdline = '{igvtools} index {vcf_fpath}'.format(**locals())
    call(cnf, cmdline, exit_on_error=False)
    if exists('igv.log'):
        try:
            os.remove('igv.log')
        except OSError:
            pass
    return vcf_fpath + '.idx'
Code Example #27
def run_targqc(cnf, bam_by_sample, bed_fpath, output_dirpath):
    info('Running TargQC for downsampled BAMs')

    targqc = get_script_cmdline(cnf, 'python', 'targqc.py', is_critical=True)
    targqc_work_dir = join(cnf.work_dir, 'TargQC')
    targqc_log_dir = join(cnf.log_dir, 'TargQC')
    safe_mkdir(targqc_work_dir)
    safe_mkdir(targqc_log_dir)
    bed_cmdl = ''
    if bed_fpath:
        bed_cmdl = '--bed ' + bed_fpath
    bam_cmdl = ' '.join(bam_fpath + ',' + sname
                        for sname, bam_fpath in bam_by_sample.items())
    cmdl = '{targqc} --sys-cnf {cnf.sys_cnf} {bam_cmdl} {bed_cmdl} ' \
           '--work-dir {targqc_work_dir} --log-dir {targqc_log_dir} --project-name {cnf.project_name} ' \
           '-o {output_dirpath} --genome {cnf.genome.name}'.format(**locals())
    if cnf.reuse_intermediate:
        cmdl += ' --reuse'
    call(cnf, cmdl)
Code Example #28
def run_fastq(cnf,
              sample_name,
              l_r_fpath,
              r_r_fpath,
              output_dirpath,
              downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    # Fall back to the original pair if no downsampling is requested
    l_fpath, r_fpath = l_r_fpath, r_r_fpath
    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_fpath, r_fpath = downsample(cnf,
                                      sample_name,
                                      l_r_fpath,
                                      r_r_fpath,
                                      downsample_to,
                                      output_dir=cnf.work_dir)

    # Combining the (possibly downsampled) pair into one fastq to run FastQC once
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_fpath).read())
        out.write(open_gzipsafe(r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        **locals())
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
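
A minimal usage sketch (paths and the 1e6 read target are hypothetical):

# QC a read pair for one sample, downsampled to 1M reads:
fastqc_dir = run_fastq(cnf, 'sample1',
                       'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz',
                       join(cnf.output_dir, 'fastqc'), downsample_to=1e6)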
Code Example #29
def vcf_merge(cnf, vcf_fpaths, combined_vcf_fpath):
    vcf_merge_cmdline = get_system_path(cnf, join('ext_tools', 'vcftools', 'scripts', 'vcf-merge'))
    if vcf_merge_cmdline is None:
        critical('No vcf_merge in path')

    cmdline = vcf_merge_cmdline + ' ' + ' '.join(vcf_fpaths)
    perl_module_dirpath = abspath(join(dirname(__file__), pardir, pardir, 'ext_modules', 'perl_modules'))
    os.environ['PERL5LIB'] = perl_module_dirpath

    res = call(cnf, cmdline, combined_vcf_fpath, exit_on_error=False)
    if not res:
        return None
    return combined_vcf_fpath
Code Example #30
def remove_dups_picard(cnf, bam_fpath):
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'
    res = call(cnf,
               cmdline.format(**locals()),
               output_fpath=output_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)

    if res != output_fpath:  # error occurred, try to correct BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = call(cnf,
                   cmdline.format(**locals()),
                   output_fpath=output_fpath,
                   stdout_to_outputfile=False,
                   exit_on_error=False)

    if res == output_fpath:
        dup_rate = _parse_picard_dup_report(dup_metrics_txt)
        assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)
        info('Duplication rate (picard): ' + str(dup_rate))
        return output_fpath
    else:
        return None
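
A minimal usage sketch of the fallback pattern callers would use (hypothetical):

# Deduplicate; keep the original BAM if Picard fails even after the BAM fix.
dedup_bam = remove_dups_picard(cnf, bam_fpath)
if dedup_bam:
    bam_fpath = dedup_bam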