def _submit_job(cnf, step, sample_name='', wait_for_steps=None, threads=1, is_critical=True, **kwargs):
    tool_cmdline = get_system_path(cnf, step.interpreter, step.script, is_critical=is_critical)
    if not tool_cmdline:
        return False

    kwargs['sample_name'] = sample_name
    cmdline = tool_cmdline + ' ' + step.param_line.format(**kwargs)
    info(step.name)
    job = submit_job(cnf, cmdline, job_name=step.job_name(sample_name),
                     wait_for_steps=wait_for_steps, threads=threads)
    info()
    return job
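# A minimal usage sketch, assuming `cnf` is an initialized pipeline config and
# `step` is a Step-like object exposing .interpreter, .script, .param_line,
# .name and .job_name() (all hypothetical values here):
#
#   job = _submit_job(cnf, step, sample_name='sample1', threads=4)
#   if job:
#       wait_for_jobs(cnf, [job])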
def call_sambamba(cnf, cmdl, bam_fpath, output_fpath=None, sambamba=None, use_grid=False,
                  command_name='', sample_name=None, silent=False, stdout_to_outputfile=True):
    sambamba = sambamba or get_system_path(
        cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    sample_name = sample_name or basename(bam_fpath).split('.')[0]

    if use_grid:
        grid_sambamba = get_script_cmdline(
            cnf, 'python', join('tools', 'bed_processing', 'sambamba.py'))
        # Escape embedded quotes so the command line survives the grid wrapper
        cmdl = cmdl.replace(' "', ' \'\"__QUOTE__')
        cmdl = cmdl.replace('" ', '__QUOTE__\"\' ')
        grid_cmdl = grid_sambamba + ' ' + bam_fpath + ' ' + sambamba + ' ' + cmdl
        job_name = command_name + '_' + sample_name
        j = submit_job(cnf, grid_cmdl,
                       job_name=job_name,
                       output_fpath=output_fpath,
                       stdout_to_outputfile=stdout_to_outputfile)
        info()
        return j
    else:
        index_bam(cnf, bam_fpath, sambamba=sambamba)
        cmdl = sambamba + ' ' + cmdl
        stderr_dump = []
        res = call(cnf, cmdl, output_fpath=output_fpath, exit_on_error=False,
                   stderr_dump=stderr_dump, stdout_to_outputfile=stdout_to_outputfile,
                   silent=silent, print_stderr=not silent)
        if not res:
            for l in stderr_dump:
                if 'sambamba-view: BAM index file (.bai) must be provided' in l:
                    if isfile(bam_fpath + '.bai'):  # was isfile(isfile(...)), a bug
                        info('Removing .bai and re-indexing...')
                        os.remove(bam_fpath + '.bai')
                        index_bam(cnf, bam_fpath, sambamba)
                        res = call(cnf, cmdl, output_fpath=output_fpath)
        return res
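# A sketch of local (non-grid) usage, assuming `cnf` is an initialized config
# and sample1.bam exists; `view -c` is a real sambamba subcommand that counts
# records, and the function prepends the sambamba binary path itself:
#
#   count_fpath = join(cnf.work_dir, 'sample1.count.txt')
#   res = call_sambamba(cnf, 'view -c sample1.bam', 'sample1.bam',
#                       output_fpath=count_fpath, command_name='view_count')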
def run_fastqc(cnf, fastq_fpath, output_basename, fastqc_dirpath, need_downsample=True):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + output_basename + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline_l = '{fastqc} --dir {tmp_dirpath} --extract -o {fastqc_dirpath} ' \
                '-f fastq -j {java} {fastq_fpath}'.format(**locals())
    j = submit_job(cnf, cmdline_l, 'FastQC_' + output_basename, run_on_chara=True,
                   stdout_to_outputfile=False)
                   # output_fpath=join(fastqc_dirpath, output_basename + '_fastqc', 'fastqc_report.html'))
    return j
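# Example (hypothetical paths; fastqc and java must be resolvable through
# get_system_path, and cnf.work_dir must exist):
#
#   j = run_fastqc(cnf, '/data/sample1_R1.fastq.gz', 'sample1_R1',
#                  join(cnf.output_dir, 'fastqc'))
#   wait_for_jobs(cnf, [j])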
def _submit_region_cov(cnf, work_dir, chrom, bam_fpaths, sample_names, output_dirpath, chr_len_fpath):
    if not bam_fpaths or not sample_names:
        return None

    cmdline = get_script_cmdline(cnf, 'python', join('tools', 'get_region_coverage.py'),
                                 is_critical=True)
    cmdline += (' --chr ' + chrom + ' --bams ' + bam_fpaths + ' --samples ' + sample_names +
                ' -o ' + output_dirpath + ' -g ' + chr_len_fpath + ' --work-dir ' + work_dir)
    if cnf.bed:
        cmdline += ' --bed ' + cnf.bed
    return submit_job(
        cnf, cmdline,
        chrom + '_coverage_' + ('project' if ',' in sample_names else sample_names))
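# Sketch of a per-chromosome submission; note that `bam_fpaths` and
# `sample_names` are single comma-separated strings, not lists (values here
# are hypothetical):
#
#   j = _submit_region_cov(cnf, cnf.work_dir, 'chr1',
#                          'sample1.bam,sample2.bam', 'sample1,sample2',
#                          output_dirpath, chr_len_fpath)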
def index_bam_grid(cnf, bam_fpath, sambamba=None):
    indexed_bam = bam_fpath + '.bai'
    if not isfile(indexed_bam) or getctime(indexed_bam) < getctime(bam_fpath):
        info('Indexing BAM, writing ' + indexed_bam + '...')
        sambamba = sambamba or get_system_path(
            cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        j = submit_job(cnf, cmdline, basename(bam_fpath) + '_index',
                       output_fpath=indexed_bam, stdout_to_outputfile=False)
        info()
        return j
    else:
        debug('Actual "bai" index exists.')
        return None
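# Usage sketch: submit an indexing job only when the .bai is missing or older
# than the BAM, then block on it (assumes `cnf` and `wait_for_jobs` from this
# module):
#
#   j = index_bam_grid(cnf, 'sample1.bam')
#   if j is not None:
#       wait_for_jobs(cnf, [j])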
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(
            cnf, 'python', join('scripts', 'post', 'varannotate.py'))
        + ' --sys-cnf ' + cnf.sys_cnf
        + ' --run-cnf ' + cnf.run_cnf
        + ' --project-name ' + cnf.project_name
        + (' --reuse ' if cnf.reuse_intermediate else '')
        + ' --log-dir -'
        + ' --genome ' + cnf.genome.name
        + (' --no-check ' if cnf.no_check else '')
        + (' --qc' if cnf.qc else ' --no-qc')
        + ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath, source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath, add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir, source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl +
                    ' --vcf ' + sample.vcf +
                    ' -o ' + sample.varannotate_dirpath +
                    ' -s ' + sample.name +
                    ' --work-dir ' + work_dir +
                    ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them '
                         'to finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)

        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
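# _annotate is driven by the pipeline entry point; a minimal sketch, assuming
# each sample carries .vcf, .dirpath and .name attributes set upstream:
#
#   _annotate(cnf, samples)  # submits up to cnf.threads VarAnnotate jobs per batch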
def _filter(cnf, samples, variants_fpath, variants_fname):
    # if cohort_mode:
    #     info('Running vcf2txt.pl in cohort mode')
    #     vcf2txt = get_script_cmdline(cnf, 'perl', 'vcf2txt', is_critical=True)
    #     vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in samples}
    #     cmdline = vcf2txt + ' ' + make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample)
    #     res = run_vcf2txt_with_retries(cnf, cmdline, variants_fpath)
    #     if not res:
    #         critical('Error: vcf2txt.pl crashed')

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    cohort_freqs_fpath = None
    # if cnf.variant_filtering.max_ratio_vardict2mut < 1.0:
    #     cohort_freqs_fpath = join(cnf.work_dir, 'cohort_freqs.tsv')
    #     info('*' * 70)
    #     info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio_vardict2mut) +
    #          ', counting freqs in cohort')
    #     # cnf.variant_filtering.max_ratio < 1.0 or cnf.fraction < 1.0
    #     cohort_freqs_fpath = count_cohort_freqs(cnf, samples, cohort_freqs_fpath,
    #         max_ratio=cnf.variant_filtering.max_ratio_vardict2mut)
    #     info('*' * 70)
    #     info()

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []

        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath, variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' + pass_output_fpath +
                     ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)
            if not cnf.genome.dbsnp_multi_mafs:
                critical('Error: dbsnp_multi_mafs is not specified in the config ' + cnf.sys_cnf)
            cmdl = ('{varfilter_py}' +
                    ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                    ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                    ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                    ' --vcf {sample.anno_vcf_fpath}' +
                    ' --sample {sample.name}' +
                    ' -o {output_dirpath}' +
                    ' --output-file {sample.variants_fpath}' +
                    ' --project-name ' + cnf.project_name +
                    ' --genome {cnf.genome.name}' +
                    ' --work-dir {work_dir}' +
                    ' --debug ' +
                    (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath else '') +
                    (' --reuse ' if cnf.reuse_intermediate else '') +
                    ((' --caller ' + cnf.caller) if cnf.caller else '') +
                    (' --qc' if cnf.qc else ' --no-qc') +
                    (' --no-tsv' if not cnf.tsv else '') +
                    ' --dbsnp-multi-mafs ' + adjust_path(cnf.genome.dbsnp_multi_mafs)
                   ).format(**locals())
            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf, cmdl,
                               job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath,
                               stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them '
                         'to finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished filtering ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)

        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf, cnf.output_dir, samples, cnf.project_name, post_filter=True)

    return variants_fpath, pass_variants_fpath
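# Hedged sketch of the filtering step (file names here are hypothetical; the
# actual `variants_fname` and output location come from the caller):
#
#   variants_fpath, pass_variants_fpath = _filter(
#       cnf, samples, join(cnf.output_dir, 'variants.txt'), 'variants.txt')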
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath, exac_features_fpath):
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(**locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)

    not_submitted_chroms = vcfs_by_chrom.keys()
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)

    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.iteritems():
            if chrom not in not_submitted_chroms:
                continue
            output_fpaths = [
                join(output_dirpath,
                     chrom.replace('chr', '') + '-' + sample.name.replace('-', '_') + '.bam')
                for sample in samples]
            if cnf.reuse_intermediate and all(
                    verify_file(output_fpath, silent=True) for output_fpath in output_fpaths):
                info('BAM files for ' + chrom + ' chromosome exist, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # if exac_venv_pythonpath:  # to avoid compatibility problems with pysam and tabix
                #     cmdline = exac_venv_pythonpath + ' ' + get_system_path(cnf,
                #         join('tools', 'split_bams_by_variants.py'))
                # else:
                cmdline = get_script_cmdline(cnf, 'python',
                                             join('tools', 'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} ' +
                            '--bams {sample_bams} -o {output_dirpath} --work-dir {cnf.work_dir} ' +
                            '-g {cnf.genome.name} ').format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)
                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_chroms = [
            chrom for chrom in not_submitted_chroms
            if chrom not in submitted_chroms and chrom not in reused_chroms]
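# Usage sketch, assuming the combined VCF is bgzipped and tabix-indexed
# (tabix requires both) and an optional ExAC features file (path hypothetical):
#
#   split_bam_files_use_grid(cnf, samples, 'combined.vcf.gz', exac_features_fpath)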