def finalize_one(cnf, *abnormal_regions_reports):
    msg = ['Abnormal regions reporting finished for ' + cnf.sample + ':']
    if abnormal_regions_reports:
        msg.append('Abnormal region reports:')
        info('Abnormal region reports:')
        for rep in abnormal_regions_reports:
            msg.append('  ' + rep)
            info('  ' + rep)

def process_one(cnf):
    sample = VarSample(cnf.sample, cnf.output_dir, vcf=cnf.vcf, bam=cnf.bam, genome=cnf.genome)

    step_greetings('Fixing "SAMPLE" INFO annotation and SAMPLE header...')
    vcf_fpath = fix_vcf_sample_name(cnf, sample.name, cnf.vcf)  # this method also gunzips the VCF file

    # sample.vcf = fix_chromosome_names(cnf, sample.vcf)
    # if cnf.vcf.endswith('.gz'):
    #     vcf_fpath = intermediate_fname(cnf, splitext(sample.vcf)[0], None)
    #     info('Ungzipping ' + sample.vcf + ', writing to ' + vcf_fpath)
    #     gunzip = get_system_path(cnf, 'gunzip', is_critical=True)
    #     cmdl = '{gunzip} {sample.vcf} --to-stdout'.format(**locals())
    #     call(cnf, cmdl, output_fpath=vcf_fpath)
    #     verify_vcf(vcf_fpath)
    #     sample.vcf = vcf_fpath

    step_greetings('Removing rejected records...')
    pass_vcf_fpath = remove_rejected(cnf, vcf_fpath)
    info()

    # if sample.vcf is None:
    #     err('No variants left for ' + cnf.vcf + ': all rejected and removed.')
    #     return None, None, None

    # In MuTect, running paired analysis on a single sample can produce
    # a "none" sample column. Removing that column:
    # info('get_sample_column_index')
    # none_idx = get_sample_column_index(sample.vcf, 'none', suppress_warn=True)
    # if none_idx is not None:
    #     info('Removing the "none" column.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             del ts[9 + none_idx]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='none_col')

    # Reordering columns so the main sample goes first (if it is not already):
    # main_idx = get_sample_column_index(sample.vcf, sample.name)
    # if main_idx:
    #     info('Moving the main sample column (' + sample.name + ') to the first place.')
    #     def fn(line, i):
    #         if line and not line.startswith('##'):
    #             ts = line.split('\t')
    #             main_sample_field = ts[9 + main_idx]
    #             del ts[9 + main_idx]
    #             ts = ts[:9] + [main_sample_field] + ts[9:]
    #             return '\t'.join(ts) + '\n'
    #         return line
    #     sample.vcf = iterate_file(cnf, sample.vcf, fn, suffix='main_col')

    anno_vcf_fpath = run_annotators(cnf, pass_vcf_fpath, sample.bam)
    return finialize_annotate_file(cnf, anno_vcf_fpath, sample, cnf.caller)

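# A minimal sketch (not part of this pipeline) of what remove_rejected() above
# presumably does: drop VCF records whose FILTER column (field 7) is neither
# PASS nor '.'. Plain-text parsing only; the real helper likely also handles
# gzipped input and the cnf work-dir conventions.
def _example_remove_rejected(in_vcf_fpath, out_vcf_fpath):
    with open(in_vcf_fpath) as inp, open(out_vcf_fpath, 'w') as out:
        for line in inp:
            if line.startswith('#'):
                out.write(line)  # keep all header lines as-is
                continue
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 6 and fields[6] in ('PASS', '.'):
                out.write(line)
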
def finalize_all(cnf, samples, results):
    for (sample_name, sample_cnf), (vcf, tsv, maf) in zip(samples.items(), results):
        if vcf or tsv:
            info(sample_name + ':')
        if vcf:
            info('  ' + vcf)
        if tsv:
            info('  ' + tsv)
        if maf:
            info('  ' + maf)

def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    picard = get_system_path(cnf, 'java', 'picard')
    if picard:
        safe_mkdir(dirname(sample.picard_ins_size_hist_txt_fpath))
        safe_mkdir(dirname(sample.picard_ins_size_hist_pdf_fpath))
        info('Picard insert size histogram for "' + basename(bam_fpath) + '"')
        cmdline = '{picard} CollectInsertSizeMetrics' \
                  ' I={bam_fpath}' \
                  ' O={sample.picard_ins_size_hist_txt_fpath}' \
                  ' H={sample.picard_ins_size_hist_pdf_fpath}' \
                  ' VALIDATION_STRINGENCY=LENIENT'
        cmdline = cmdline.format(**locals())
        call(cnf, cmdline, output_fpath=sample.picard_ins_size_hist_txt_fpath,
             stdout_to_outputfile=False, exit_on_error=False)

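# A hedged sketch (illustrative helper, not used by this module) of reading the
# MEDIAN_INSERT_SIZE value back out of the Picard CollectInsertSizeMetrics text
# output written above. Picard metrics files place a tab-separated header row
# immediately after the '## METRICS CLASS' comment line, followed by data rows.
def _example_read_median_insert_size(metrics_txt_fpath):
    with open(metrics_txt_fpath) as f:
        lines = [l.rstrip('\n') for l in f]
    for i, l in enumerate(lines):
        if l.startswith('## METRICS CLASS'):
            header = lines[i + 1].split('\t')
            values = lines[i + 2].split('\t')
            return float(values[header.index('MEDIAN_INSERT_SIZE')])
    return None  # metrics section not found
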
def proc_fastq(cnf, sample, l_fpath, r_fpath):
    if cnf.downsample_to:
        info('Downsampling the reads to ' + str(cnf.downsample_to))
        l_fpath, r_fpath = downsample(cnf, sample.name, l_fpath, r_fpath,
                                      cnf.downsample_to, output_dir=cnf.work_dir, suffix='subset')

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if not (sambamba and bwa and bammarkduplicates):
        critical('sambamba, BWA, and bammarkduplicates are required to align reads')
    info()
    info('Aligning reads to the reference')
    bam_fpath = align(cnf, sample, l_fpath, r_fpath,
                      sambamba, bwa, bammarkduplicates,
                      cnf.genome.bwa, cnf.is_pcr)
    bam_fpath = verify_bam(bam_fpath)
    if not bam_fpath:
        critical('Sample ' + sample.name + ' was not aligned successfully.')
    return bam_fpath

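# A hedged sketch of the kind of command the align() helper above is expected
# to build: bwa mem piped through sambamba for BAM conversion and sorting.
# The function name, threading, and piping layout are illustrative assumptions;
# the real helper takes the tool paths resolved by get_system_path() above and
# also runs bammarkduplicates, which is omitted here for brevity.
def _example_align_cmdline(bwa, sambamba, bwa_prefix, l_fpath, r_fpath, out_bam, threads=4):
    return (
        '{bwa} mem -t {threads} {bwa_prefix} {l_fpath} {r_fpath}'
        ' | {sambamba} view -S -f bam /dev/stdin'
        ' | {sambamba} sort -t {threads} -o {out_bam} /dev/stdin'
    ).format(**locals())
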
def process_one(cnf, output_dir, bam_fpath, features_bed, features_no_genes_bed):
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # if not sample.bam and sample.l_fpath and sample.r_fpath:
    #     sample.bam = proc_fastq(cnf, sample, verify_file(cnf.l_fpath), verify_file(cnf.r_fpath))

    if not bam_fpath:
        critical(sample.name + ': BAM file is required.')
    info('Using alignment ' + sample.bam)

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    gene_keys_list = None
    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)
        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready; skipping preparation.')
        gene_keys_set, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    avg_depth, gene_by_name_and_chrom, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath,
        features_bed, features_no_genes_bed, target_bed, gene_keys_list)

    # if cnf.extended:
    #     try:
    #         info('Generating flagged regions report...')
    #         flagged_report = generate_flagged_regions_report(
    #             cnf, cnf.output_dir, sample, avg_depth, gene_by_name_and_chrom)
    #         if not flagged_report:
    #             err('Flagged regions report was not generated')
    #             err()
    #     except:
    #         err(format_exc())

    return reports

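# A minimal sketch (an assumption about part of what
# extract_gene_names_and_filter_exons() does internally) of collecting unique
# gene names from the 4th column of a BED file, preserving first-seen order.
# Real target/features BEDs may carry extra annotation columns; this only
# demonstrates the basic idea.
def _example_gene_names_from_bed(bed_fpath):
    gene_names = []
    seen = set()
    with open(bed_fpath) as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 3 and fields[3] not in seen:
                seen.add(fields[3])
                gene_names.append(fields[3])
    return gene_names
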
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(
                dest='bam',
                help='a path to the BAM file to study')),
            (['-1'], dict(dest='l_fpath')),
            (['-2'], dict(dest='r_fpath')),
            (['--bed', '--capture', '--amplicons'], dict(
                dest='bed',
                help='a BED file for capture panel or amplicons')),
            (['--exons', '--exome', '--features'], dict(
                dest='features',
                help='a BED file with real CDS/Exon/Gene/Transcript regions with annotations '
                     '(default "features" is in system_config)')),
            (['--exons-no-genes', '--features-no-genes'], dict(
                dest='features_no_genes',
                help='a BED file with real CDS/Exon regions with annotations, '
                     'w/o Gene/Transcript records (default "features" is in system_config)')),
            (['--original-bed'], dict(
                dest='original_target_bed',
                help=SUPPRESS_HELP)),
            (['--original-exons', '--original-features'], dict(
                dest='original_features_bed',
                help='path to the original features BED file (just for reporting)')),
            (['--reannotate'], dict(
                dest='reannotate',
                help='re-annotate BED file with gene names',
                action='store_true',
                default=False)),
            (['--no-prep-bed'], dict(
                dest='prep_bed',
                help='do not fix input BEDs and exons',
                action='store_false',
                default=True)),
            (['-e', '--extended'], dict(
                dest='extended',
                help='extended: flagged regions and missed variants',
                action='store_true',
                default=False)),
            (['--genes'], dict(dest='genes', help='custom list of genes')),
            (['--padding'], dict(
                dest='padding',
                help='integer indicating the number of bases to extend each target region '
                     'up- and downstream. Default is ' + str(defaults['coverage_reports']['padding']),
                type='int')),
            (['--no-dedup'], dict(dest='no_dedup', action='store_true', help=SUPPRESS_HELP)),
            (['--downsample-to'], dict(dest='downsample_to', type='int', help=SUPPRESS_HELP)),
            (['--downsampled'], dict(dest='downsampled', action='store_true', help=SUPPRESS_HELP)),
            (['--fastqc-dirpath'], dict(dest='fastqc_dirpath', help=SUPPRESS_HELP)),
        ],
        file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
        key_for_sample_name='bam')

    if cnf.padding:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])
    check_genome_resources(cnf)

    features_bed = adjust_path(cnf.features) if cnf.features else adjust_path(cnf.genome.features)
    if features_bed:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)
    else:
        info('No features BED found')

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(cnf, cnf.output_dir, cnf.bam,
                          features_bed=features_bed,
                          features_no_genes_bed=cnf.features_no_genes)
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)
    if gene_report:
        if gene_report.txt_fpath:
            info('All regions: ' + gene_report.txt_fpath +
                 ' (' + str(len(gene_report.rows)) + ' regions)')

    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + str(len(selected_regions_report.rows)) + ' regions)')

    # Verify that every expected result file exists; a failure on any report
    # must not be overwritten by a later success, so the flag is set once here.
    ok = True
    info('Checking expected results...')
    for fpaths in reports:
        if fpaths:
            if not isinstance(fpaths, list):
                fpaths = [fpaths]
            for fpath in fpaths:
                if isinstance(fpath, basestring):
                    if not verify_file(fpath):
                        ok = False
    if ok:
        info('The results are good.')

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

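# A typical invocation of this entry point. The script name and the output-dir
# flag are illustrative assumptions (the actual entry point and common options
# come from read_opts_and_cnfs() and the package installer); the long options
# match the extra_opts declared above:
#
#   targqc.py --bam sample.bam --bed panel.bed --features features.bed \
#       --padding 200 --reannotate
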
def finalize_one(cnf, anno_vcf_fpath):
    msg = ['Annotation finished for ' + cnf.sample + ':']
    if anno_vcf_fpath:
        msg.append('VCF: ' + anno_vcf_fpath)
        info('Saved final VCF to ' + anno_vcf_fpath)

def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt format to filter')),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')
    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)  # keep the header row
                        elif l.split('\t')[0] == cnf.sample:
                            out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #                           add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #                           vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath,
                                add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

    var_s = source.VarSample(cnf.sample, cnf.output_dir)
    var_s.anno_vcf_fpath = cnf.vcf
    var_s.varfilter_dirpath = var_s.dirpath

    ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') \
        else splitext(var_s.anno_vcf_fpath)[0]
    ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
    var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

    var_s.variants_fpath = vcf2txt_res_fpath
    var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

    ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
    var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

    filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
    index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
    index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

    if cnf.qc:
        report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

    info()
    info('*' * 70)
    info('Done filtering ' + var_s.name)

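# A hedged sketch of what index_vcf() above presumably runs: bgzip-compress a
# plain-text VCF and tabix-index the result. Having bgzip/tabix on PATH is an
# assumption here; the real helper resolves tool paths through the cnf system
# config and handles already-compressed inputs.
def _example_index_vcf(vcf_fpath):
    import subprocess
    subprocess.check_call(['bgzip', '-f', vcf_fpath])  # writes vcf_fpath + '.gz'
    subprocess.check_call(['tabix', '-f', '-p', 'vcf', vcf_fpath + '.gz'])
    return vcf_fpath + '.gz'
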