def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    """Compute per-chromosome coverage for all samples using grid jobs.

    For every chromosome (module-level `chromosomes` — defined elsewhere in this
    file), slices each sample's BAM with `sambamba slice`, then submits one
    region-coverage job per chromosome via `_submit_region_cov`. At most
    cnf.threads jobs are kept in flight at a time. Existing per-chromosome
    outputs are reused when cnf.reuse_intermediate is set.
    """
    assert len(samples) > 0

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    # One output subdirectory per sample.
    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [join(output_dirpath, sample.name, chrom + '.txt.gz') for sample in samples]
        sample_names = ','.join(sample.name for sample in samples)

        # Extract this chromosome from every sample BAM; skip samples whose BAM
        # is missing or empty, and only keep slices that verify.
        chrom_bams = []
        for sample in samples:
            if not verify_file(sample.bam):
                # Fixed grammar of the original message ('is not exist!').
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(cnf.work_dir, basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(**locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)
        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths, sample_names,
                                   output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        # Throttle: once a full batch is queued, wait for it before continuing.
        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')

    # Drain any remaining jobs.
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    """Run `sambamba depth` over mut_bed_fpath for every experiment, on the grid.

    infos_by_key maps (group, uniq_key) -> experiment; each experiment is
    expected to carry a .sample with .bam and .name.  Submits at most
    cnf.threads jobs per batch, reusing existing outputs when
    cnf.reuse_intermediate is set.  Returns {experiment: depth-output path}.
    Experiments without a BAM are logged and retried on the next pass —
    NOTE(review): such experiments are never added to submitted/reused lists,
    so they appear to keep the outer loop spinning; confirm against callers.
    """
    sambamba_output_by_experiment = dict()

    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []

        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir, uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath, silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            else:
                if not e.sample.bam:
                    err('Sample ' + e.sample.name + ' in ' + str(group) + ', ' + str(uniq_key) + ' has no BAM')
                    continue
                j = sambamba_depth(cnf, mut_bed_fpath, e.sample.bam,
                                   output_fpath=sambamba_output_fpath,
                                   only_depth=True, silent=True, use_grid=True)
                submitted_experiments.append(e)
                if not j.is_done:
                    jobs_to_wait.append(j)
                # Batch is full — stop submitting and wait below.
                if len(jobs_to_wait) >= cnf.threads:
                    break

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')

        # Drop everything handled this pass; the rest go into the next batch.
        not_submitted_experiments = [e for e in not_submitted_experiments
                                     if e not in submitted_experiments and e not in reused_experiments]

    return sambamba_output_by_experiment
def __get_mapped_reads(cnf, samples, bam_by_sample, output_fpath):
    """Count mapped (deduplicated) reads per sample via grid jobs; write a TSV.

    Submits one `number_of_mapped_reads` sambamba job per sample, at most
    cnf.threads in flight at a time.  On success writes one
    "<sample>\\t<count>" line per sample to output_fpath and returns
    (output_fpath, samples_that_got_counts).  Returns (None, None) as soon as
    a sample has no BAM in bam_by_sample.  Reuses an existing output file
    when cnf.reuse_intermediate is set.
    """
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath, samples

    mapped_reads_by_sample = OrderedDict()
    job_by_sample = dict()

    # Running totals across all submission batches.
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for s in not_submitted_samples:
            # Each sample runs with its own work dir for the scope of this block.
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                # (A disabled fallback that parsed mapped-read counts out of
                # existing targetSeq JSON reports used to live here.)
                if s.name not in bam_by_sample:
                    err('No BAM for ' + s.name + ', not running Seq2C')
                    return None, None
                info('Submitting a sambamba job to get mapped read numbers')
                bam_fpath = bam_by_sample[s.name]
                j = number_of_mapped_reads(cnf, bam_fpath, dedup=True, use_grid=True, sample_name=s.name)
                job_by_sample[s.name] = j
                submitted_samples.append(s)
                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    # Batch is full: stop submitting, wait for this batch first.
                    not_submitted_samples = [_s for _s in not_submitted_samples
                                             if _s not in submitted_samples and _s not in reused_samples]
                    if not_submitted_samples:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                             'submitting more ' + str(len(not_submitted_samples)))
                    else:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                    info()
                    break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            # NOTE(review): the "annotation"/"annotating" wording below looks
            # copy-pasted from _annotate — this function counts mapped reads.
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finihsed annotating ' + str(len(jobs_to_wait)) + ' jobs')

        for j in jobs_to_wait:
            # A "done" job with no verifiable output file counts as failed.
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                # Clean up the job's scratch dir on success.
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    # Collect the per-sample counts written by the finished jobs.
    for s_name, j in job_by_sample.items():
        if j and j.is_done and not j.is_failed:
            with open(j.output_fpath) as f:
                mapped_reads = int(f.read().strip())
            info(s_name + ': ')
            info(' Mapped reads: ' + str(mapped_reads))
            mapped_reads_by_sample[s_name] = mapped_reads
        else:
            err('ERROR: ' + s_name + ' could not get mapped reads, log saved to ' + j.log_fpath)

    with open(output_fpath, 'w') as f:
        for sample_name, mapped_reads in mapped_reads_by_sample.items():
            f.write(sample_name + '\t' + str(mapped_reads) + '\n')
    verify_file(output_fpath, is_critical=True)

    successful_samples = [s for s in samples if s.name in mapped_reads_by_sample]
    info('Samples processed: ' + str(len(samples)) + ', successfully: ' +
         str(len(successful_samples)))
    return output_fpath, successful_samples
def __seq2c_coverage(cnf, samples, bams_by_sample, bed_fpath, is_wgs, output_fpath):
    """Produce the combined seq2cov input file for Seq2C.

    Per sample, in order of preference: reuse an existing seq2cov file,
    convert an existing targetcov detailed TSV, or submit a grid
    `sambamba depth` job (at most cnf.threads at a time) whose output is
    later converted with sambamba_depth_to_seq2cov.  Finally concatenates
    all per-sample seq2cov files into output_fpath and returns it.
    """
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    jobs_by_sample = dict()
    depth_output_by_sample = dict()
    seq2cov_output_by_sample = dict()
    seq2c_work_dirpath = join(cnf.work_dir, source.seq2c_name)
    safe_mkdir(seq2c_work_dirpath)
    info()

    # Running totals across all submission batches.
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for s in not_submitted_samples:
            info('*' * 50)
            info(s.name + ':')
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                seq2cov_output_by_sample[s.name] = join(seq2c_work_dirpath, s.name + '.seq2cov.txt')

                # Without reuse, a stale seq2cov file must be removed first.
                if not cnf.reuse_intermediate and isfile(seq2cov_output_by_sample[s.name]):
                    os.remove(seq2cov_output_by_sample[s.name])

                if cnf.reuse_intermediate and verify_file(seq2cov_output_by_sample[s.name], silent=True):
                    info(seq2cov_output_by_sample[s.name] + ' exists, reusing')
                    reused_samples.append(s)
                    continue
                elif verify_file(s.targetcov_detailed_tsv, silent=True):
                    # Cheaper path: derive seq2cov from the targetcov report.
                    info('Using targetcov detailed output for Seq2C coverage.')
                    info(s.name + ': using targetseq output')
                    targetcov_details_to_seq2cov(cnf, s.targetcov_detailed_tsv,
                                                 seq2cov_output_by_sample[s.name], s.name, is_wgs=is_wgs)
                    reused_samples.append(s)
                    continue
                else:
                    info(s.name + ': ' + s.targetcov_detailed_tsv + ' does not exist: submitting sambamba depth')
                    bam_fpath = bams_by_sample[s.name]
                    depth_output = join(seq2c_work_dirpath, s.name + '_depth' + '.txt')
                    depth_output_by_sample[s.name] = depth_output
                    if cnf.reuse_intermediate and verify_file(depth_output, silent=True):
                        info(depth_output + ' exists, reusing')
                        reused_samples.append(s)
                        continue
                    else:
                        j = sambamba_depth(cnf, bed_fpath, bam_fpath, depth_output,
                                           use_grid=True, sample_name=s.name)
                        jobs_by_sample[s.name] = j
                        submitted_samples.append(s)
                        if not j.is_done:
                            jobs_to_wait.append(j)
                        if len(jobs_to_wait) >= cnf.threads:
                            # Batch is full: stop submitting and wait first.
                            not_submitted_samples = [_s for _s in not_submitted_samples
                                                     if _s not in submitted_samples and _s not in reused_samples]
                            if not_submitted_samples:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                                     'submitting more ' + str(len(not_submitted_samples)))
                            else:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                            info()
                            break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            # NOTE(review): "annotation"/"annotating" wording looks copy-pasted
            # from _annotate — these are sambamba depth jobs.
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finihsed annotating ' + str(len(jobs_to_wait)) + ' jobs')

        for j in jobs_to_wait:
            # A "done" job with no verifiable output file counts as failed.
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info()
    info('*' * 50)
    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))

    # Convert any raw sambamba-depth outputs that did not get summarized yet.
    # (A disabled alternative that submitted find_ave_cov_for_regions.py jobs
    # used to live here.)
    for s_name, seq2cov_output_fpath in seq2cov_output_by_sample.items():
        if not isfile(seq2cov_output_fpath):
            if verify_file(depth_output_by_sample[s_name], is_critical=True,
                           description='depth_output_by_sample for ' + s_name):
                info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
                bed_col_num = count_bed_cols(bed_fpath)
                sambamba_depth_to_seq2cov(cnf, depth_output_by_sample[s_name],
                                          seq2cov_output_by_sample[s_name], s_name, bed_col_num)

    info()
    info('Done')
    info('*' * 50)
    info()
    info('Combining seq2cov output')
    # Concatenate all per-sample seq2cov files into the final output.
    with open(output_fpath, 'w') as out:
        for i, s in enumerate(samples):
            verify_file(seq2cov_output_by_sample[s.name],
                        description='seq2cov_output for ' + s.name, is_critical=True)
            with open(seq2cov_output_by_sample[s.name]) as inp:
                for l in inp:
                    out.write(l)

    verify_file(output_fpath, description='__simulate_cov2cnv_w_bedtools output_fpath', is_critical=True)
    info('Saved combined seq2cov output to ' + output_fpath)
    info()
    return output_fpath
def run_targqc(cnf, output_dir, samples, target_bed, features_bed, genes_fpath=None):
    """Run per-sample TargQC jobs on the grid, then build the summary report.

    Unless cnf.only_summary is set: prepares/normalizes the BED inputs,
    writes a gene list (if genes_fpath not given), and submits one job per
    sample with its BAM (or fastq pair) as input.  Waits for all jobs, then
    returns the result of summarize_targqc().
    """
    max_threads = cnf.threads
    threads_per_sample = 1  # max(max_threads / len(samples), 1)
    summary_threads = min(len(samples), max_threads)
    info('Number of threads to run summary: ' + str(summary_threads))

    jobs_to_wait = []
    if not cnf.only_summary:
        # Keep the caller's original target BED; prepare_beds may rewrite it.
        original_target_bed = target_bed
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)
        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
        if not genes_fpath:
            genes_fpath = join(cnf.work_dir, 'genes.txt')
            with open(genes_fpath, 'w') as f:
                f.write('\n'.join(g + '\t' + c for g, c in gene_keys_list))
        info('*' * 70)
        info()

        step = _prep_steps(cnf, threads_per_sample, summary_threads, samples, target_bed,
                           original_target_bed, features_bed, features_no_genes_bed, genes_fpath)

        summary_wait_for_steps = []
        for sample in samples:
            info('Processing ' + basename(sample.name))
            # Input is either a BAM or a pair of fastq files.
            input_params = ''
            if sample.bam:
                input_params = ' --bam ' + sample.bam
            elif sample.l_fpath and sample.r_fpath:
                input_params = ' -1 ' + sample.l_fpath + ' -2 ' + sample.r_fpath
            if cnf.downsampled and sample.fastqc_dirpath:
                input_params += ' --downsampled --fastqc-dirpath ' + sample.fastqc_dirpath
            j = _submit_job(cnf, step, sample.name, threads=threads_per_sample,
                            input_params=input_params, targqc_dirpath=sample.targqc_dirpath)
            jobs_to_wait.append(j)
            summary_wait_for_steps.append(step.job_name(sample.name))
            info('Done ' + basename(sample.name))
            info()

    wait_for_jobs(cnf, jobs_to_wait)

    info('Making targqc summary')
    return summarize_targqc(cnf, summary_threads, output_dir, samples,
                            bed_fpath=target_bed, features_fpath=features_bed)
def _annotate(cnf, samples):
    """Submit a varannotate.py grid job for each sample's VCF.

    Builds a shared command line once, then per sample points it at the
    sample's VCF and output dir.  At most cnf.threads jobs run per batch;
    existing verified annotated VCFs are reused when cnf.reuse_intermediate
    is set.  Samples' varannotate_dirpath/anno_vcf_fpath attributes are
    filled in if missing.  Logs batch and total statistics; returns None.
    """
    # Common part of the varannotate command line, shared by all samples.
    varannotate_cmdl = (get_script_cmdline(
        cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
        ' --sys-cnf ' + cnf.sys_cnf +
        ' --run-cnf ' + cnf.run_cnf +
        ' --project-name ' + cnf.project_name +
        (' --reuse ' if cnf.reuse_intermediate else '') +
        ' --log-dir -' +
        ' --genome ' + cnf.genome.name +
        (' --no-check ' if cnf.no_check else '') +
        (' --qc' if cnf.qc else ' --no-qc') +
        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    # Running totals across all submission batches.
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath, source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath, add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)

            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir, source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl +
                    ' --vcf ' + sample.vcf +
                    ' -o ' + sample.varannotate_dirpath +
                    ' -s ' + sample.name +
                    ' --work-dir ' + work_dir +
                    ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                # Batch is full: stop submitting and wait for this batch first.
                not_submitted_samples = [s for s in not_submitted_samples
                                         if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        # Typo fixed: was 'Finihsed annotating'.
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')

        for j in jobs_to_wait:
            # A "done" job whose VCF fails verification counts as failed.
            if j.is_done and not j.is_failed and not verify_vcf(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
def _filter(cnf, samples, variants_fpath, variants_fname):
    """Submit a varfilter grid job per sample, then combine the results.

    Per sample, reuses existing filtered outputs when cnf.reuse_intermediate
    is set; otherwise submits a 'varfilter' job built from the sample's
    annotated VCF.  At most cnf.threads jobs run per batch.  After all
    samples are done, combines per-sample variant files via combine_results
    and (if cnf.qc) summarizes VarQC.  Returns
    (variants_fpath, pass_variants_fpath).

    Raises via critical() if cnf.genome.dbsnp_multi_mafs is not configured.
    (Stale commented-out cohort-mode / cohort-freqs code removed.)
    """
    # Running totals across all submission batches.
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    # Cohort-frequency filtering is currently disabled.
    cohort_freqs_fpath = None

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []

        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(
                sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(
                sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath, variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' + pass_output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)
            if not cnf.genome.dbsnp_multi_mafs:
                critical(
                    'Error: dbsnp_multi_mafs is not specified in the config ' + cnf.sys_cnf)
            cmdl = (
                '{varfilter_py}' +
                ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                ' --vcf {sample.anno_vcf_fpath}' +
                ' --sample {sample.name}' +
                ' -o {output_dirpath}' +
                ' --output-file {sample.variants_fpath}' +
                ' --project-name ' + cnf.project_name +
                ' --genome {cnf.genome.name}' +
                ' --work-dir {work_dir}' +
                ' --debug ' +
                (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath else '') +
                (' --reuse ' if cnf.reuse_intermediate else '') +
                ((' --caller ' + cnf.caller) if cnf.caller else '') +
                (' --qc' if cnf.qc else ' --no-qc') +
                (' --no-tsv' if not cnf.tsv else '') +
                ' --dbsnp-multi-mafs ' + adjust_path(cnf.genome.dbsnp_multi_mafs)).format(**locals())
            # The child job decides its own reuse; disable it for submission.
            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf, cmdl,
                               job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath,
                               stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                # Batch is full: stop submitting and wait for this batch first.
                not_submitted_samples = [s for s in not_submitted_samples
                                         if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break

        info()
        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        # Typo fixed: was 'Finihsed filtering'.
        info('Finished filtering ' + str(len(jobs_to_wait)) + ' jobs')

        for j in jobs_to_wait:
            # A "done" job with no verifiable output file counts as failed.
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            # Keep work dirs around in debug mode for inspection.
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf, cnf.output_dir, samples, cnf.project_name, post_filter=True)

    return variants_fpath, pass_variants_fpath
def make_fastqc_reports(cnf, fastq_fpaths, output_dir):
    """Run FastQC on each fastq file and write a combined HTML report.

    Returns the combined report path, or None if the fastqc binary is not
    found.  Existing per-sample reports are reused when
    cnf.reuse_intermediate is set.  Leftover per-sample .zip archives are
    removed after the run.  (Stale commented-out backup-dir handling and
    the disabled retry loop were removed.)
    """
    fastqc = get_system_path(cnf, 'fastqc')
    if not fastqc:
        err('FastQC is not found, cannot make reports')
        return None

    safe_mkdir(output_dir)

    fqc_samples = []
    fastqc_jobs = []
    for fastq_fpath in fastq_fpaths:
        s = FQC_Sample(name=splitext_plus(basename(fastq_fpath))[0], fastq_fpath=fastq_fpath)
        # Was fqc_samples.extend([s]) — append is the idiomatic single-item add.
        fqc_samples.append(s)
        info('Added sample ' + s.name)

    for fqc_s in fqc_samples:
        if cnf.reuse_intermediate and verify_file(fqc_s.fastqc_html_fpath, silent=True):
            info(fqc_s.fastqc_html_fpath + ' exists, reusing')
        else:
            fastqc_jobs.append(
                run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
    info()

    wait_for_jobs(cnf, fastqc_jobs)
    fastqc_jobs = []

    # Locate each sample's generated HTML; log the ones that are missing.
    # (not_done_fqc fed a now-disabled retry loop; verify_file still logs.)
    for fqc_s in fqc_samples:
        fqc_s.fastqc_html_fpath = find_fastqc_html(output_dir, fqc_s.name)
    not_done_fqc = [
        fqc_s for fqc_s in fqc_samples
        if not verify_file(fqc_s.fastqc_html_fpath,
                           description='Not found FastQC html for ' + fqc_s.name)
    ]

    # Best-effort cleanup of FastQC's zip archives.
    for fqc_s in fqc_samples:
        sample_fastqc_dirpath = join(output_dir, fqc_s.name + '_fastqc')
        if isfile(sample_fastqc_dirpath + '.zip'):
            try:
                os.remove(sample_fastqc_dirpath + '.zip')
            except OSError:
                pass

    comb_fastqc_fpath = join(output_dir, 'fastqc.html')
    write_fastqc_combo_report(cnf, comb_fastqc_fpath, fqc_samples)
    verify_file(comb_fastqc_fpath, is_critical=True)
    info('Combined FastQC saved to ' + comb_fastqc_fpath)
    return comb_fastqc_fpath
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath, exac_features_fpath):
    """Split per-sample BAMs by chromosome around the variants in a combined VCF.

    First dedups, then sorts all sample BAMs (two grid passes).  Extracts
    each chromosome's records from combined_vcf_fpath with tabix, then
    submits one split_bams_by_variants.py job per chromosome, at most
    cnf.threads per batch, reusing existing output BAMs when
    cnf.reuse_intermediate is set.  Returns None.
    """
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    # Per-chromosome VCF slices; only chromosomes with a verifiable slice
    # get processed.  `chromosomes` is a module-level list defined elsewhere.
    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(**locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)
    not_submitted_chroms = vcfs_by_chrom.keys()
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)

    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.iteritems():
            if chrom not in not_submitted_chroms:
                continue
            # Expected output BAM per sample for this chromosome.
            # NOTE(review): the trailing '.bam'.format(**locals()) is a no-op
            # (no placeholders in '.bam') — presumably a leftover.
            output_fpaths = [
                join(
                    output_dirpath,
                    chrom.replace('chr', '') + '-' + sample.name.replace('-', '_') +
                    '.bam'.format(**locals())) for sample in samples
            ]
            if cnf.reuse_intermediate and all(
                    verify_file(output_fpath, silent=True) for output_fpath in output_fpaths):
                info('BAM files for ' + chrom + ' chromosome exists, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # (A disabled exac-virtualenv workaround for pysam/tabix
                # compatibility used to live here.)
                cmdline = get_script_cmdline(cnf, 'python',
                                             join('tools', 'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (
                    ' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} ' +
                    '--bams {sample_bams} -o {output_dirpath} --work-dir {cnf.work_dir} ' +
                    '-g {cnf.genome.name} ').format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)
                if not j.is_done:
                    jobs_to_wait.append(j)
                # Batch is full — stop submitting and wait below.
                if len(jobs_to_wait) >= cnf.threads:
                    break

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')

        not_submitted_chroms = [chrom for chrom in not_submitted_chroms
                                if chrom not in submitted_chroms and chrom not in reused_chroms]
def dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False):
    """Dedup (or sort, when do_sort=True) every sample's BAM via sambamba grid jobs.

    Submits at most cnf.threads jobs per batch and waits between batches;
    reuses an existing output BAM when cnf.reuse_intermediate is set.
    Repoints each sample's .bam at the new file and returns the list of
    processed samples.
    """
    suffix = '.dedup.sorted.bam' if do_sort else '.dedup.bam'
    jobs_to_wait = []
    pending = [s for s in samples]
    done_samples = []

    while pending:
        jobs_to_wait = []
        submitted = []
        reused = []

        for sample in pending:
            out_bam = join(cnf.work_dir, sample.name + suffix)

            # Reuse path: nothing to submit for this sample.
            if cnf.reuse_intermediate and verify_file(out_bam, silent=True):
                info(out_bam + ' exists, reusing')
                sample.bam = out_bam
                done_samples.append(sample)
                reused.append(sample)
                continue

            if do_sort:
                job = call_sambamba(
                    cnf, 'sort ' + sample.bam + ' -o ' + out_bam,
                    output_fpath=out_bam, bam_fpath=sample.bam, use_grid=True,
                    command_name='sort', sample_name=sample.name,
                    stdout_to_outputfile=False)
            else:
                job = call_sambamba(
                    cnf,
                    'view -f bam -F "not duplicate and not failed_quality_control" ' + sample.bam,
                    output_fpath=out_bam, bam_fpath=sample.bam, use_grid=True,
                    command_name='dedup', sample_name=sample.name)
            info()

            sample.bam = out_bam
            done_samples.append(sample)
            submitted.append(sample)
            if not job.is_done:
                jobs_to_wait.append(job)
            # Batch is full — stop submitting and wait below.
            if len(jobs_to_wait) >= cnf.threads:
                break

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')

        pending = [s for s in pending if s not in submitted and s not in reused]

    return done_samples