def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_bam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None

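# A minimal usage sketch (illustrative path; assumes a configured `cnf` Config
# whose system config resolves the biobambam `bammarkduplicates` binary and
# whose work_dir is set; the output name follows add_suffix's convention of
# inserting the suffix before the extension):
#
#   dedup_bam_fpath = markdup_bam(cnf, '/data/project/sample1.bam')
#   if dedup_bam_fpath:  # e.g. /data/project/sample1.markdup.bam
#       info('Duplicate-marked BAM: ' + dedup_bam_fpath)
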
def main():
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(
        usage='Usage: ' + basename(__file__) + ' --bed BED_file --bams BAM_files --samples sample_names '
              '-g hg19 -o Output_BEDGRAPH_file --work-dir work_directory --chr chromosome')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('-g', '--genome', dest='chr_len_fpath', help='File with chromosome lengths.')
    parser.add_option('--work-dir', dest='work_dir', help='Work directory.')
    (opts, args) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    # Check the required options before they are used to build the sample list
    if not cnf.output_dir or not cnf.bams:
        critical(parser.usage)

    samples = [
        BaseSample(sample_name, None, bam=bam)
        for (sample_name, bam) in zip(cnf.sample_names.split(','), cnf.bams.split(','))
    ]

    safe_mkdir(cnf.output_dir)
    safe_mkdir(cnf.work_dir)

    get_regions_coverage(cnf, samples)
    info('Done.')

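# Example invocation (illustrative file names; the script name is hypothetical):
#
#   python get_regions_coverage.py \
#       --bams sample1.bam,sample2.bam --samples sample1,sample2 \
#       --bed target.bed -g hg19 --chr chr21 \
#       -o bedgraphs/ --work-dir work/
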
def run_vcf2txt_vardict2mut_for_samples(cnf, var_samples, output_dirpath,
                                        vcf2txt_out_fpath, caller_name=None, threads_num=1):
    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(cnf, vcf2txt_out_fpath,
                          add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()
    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath

def split_bams(cnf, samples, vcf_fpath):
    variants_by_chrom = parse_variants(vcf_fpath)
    temp_output_dirpath = join(cnf.work_dir, 'temp')
    safe_mkdir(temp_output_dirpath)
    info('Splitting BAM files...')
    for chrom, variants in variants_by_chrom.iteritems():
        chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
        chr_lengths_dict = dict((c, l) for (c, l) in chr_lengths)
        chr_length = chr_lengths_dict[chrom]
        transcripts = get_transcipts_with_exons_from_features(
            verify_file(cnf.features, is_critical=True), cur_chrom=chrom)
        bams_created_before = []
        bams_by_sample = defaultdict(list)
        info('Extracting variant coverage for all samples for ' + chrom +
             ', ' + str(len(variants)) + ' variants')
        for variant in variants:
            variant_bams_by_sample = extract_variant_from_bams(
                cnf, temp_output_dirpath, transcripts, chr_length,
                samples, chrom, variant, bams_created_before)
            bams_created_before.extend(variant_bams_by_sample.values())
            for sample_name, bam_fpath in variant_bams_by_sample.iteritems():
                bams_by_sample[sample_name].append(bam_fpath)
        chrom = chrom.replace('chr', '')
        info()
        for sample_name, bam_fpaths in bams_by_sample.iteritems():
            info('Making combined BAMs for chr' + chrom + ' for sample ' + sample_name)
            bam_fname = '{chrom}-{sample_name}.bam'.format(**locals())
            temp_combined_bam_fpath = join(temp_output_dirpath, bam_fname)
            combined_bam_fpath = join(cnf.output_dir, bam_fname)
            generate_combined_bam(cnf, bam_fpaths, temp_combined_bam_fpath, combined_bam_fpath)
        info()
    info('Removing BAM files...')
    shutil.rmtree(temp_output_dirpath, ignore_errors=True)

def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls',
                      help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(
            verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir

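# Example invocation matching the usage string above (illustrative paths; the
# script name is hypothetical, and sample2bam.tsv is assumed to map sample
# names to BAM paths, one pair per line):
#
#   python seq2c.py sample2bam.tsv --bed target.bed --controls normal1:normal2 -o results_dir
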
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    chr_len_fpath = get_chr_len_fpath(cnf)

    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [join(output_dirpath, sample.name, chrom + '.txt.gz')
                                for sample in samples]
        sample_names = ','.join(sample.name for sample in samples)

        chrom_bams = []
        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(cnf.work_dir, basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(**locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths, sample_names,
                                   output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')

    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)

def finialize_annotate_file(cnf, vcf_fpath, sample, callername=None):
    # vcf_fpath = leave_first_sample(cnf, vcf_fpath)

    # if not cnf.no_check:
    #     vcf_fpath = _filter_malformed_fields(cnf, vcf_fpath)

    if not cnf.no_check and callername and 'vardict' not in callername:
        info()
        info('Adding SAMPLE=' + sample.name + ' annotation...')
        vcf_fpath = add_annotation(cnf, vcf_fpath, 'SAMPLE', sample.name,
                                   number='1', type_='String', description='Sample name')

    final_vcf_fpath = join(cnf.output_dir,
                           sample.name + (('-' + callername) if callername else '') + '.anno.vcf')
    if cnf.output_file:
        final_vcf_fpath = cnf.output_file
    if not vcf_fpath.endswith('.gz') and final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = splitext(final_vcf_fpath)[0]
    if vcf_fpath.endswith('.gz') and not final_vcf_fpath.endswith('.gz'):
        final_vcf_fpath = final_vcf_fpath + '.gz'

    info('Moving final VCF ' + vcf_fpath + ' to ' + final_vcf_fpath)
    if isfile(final_vcf_fpath):
        os.remove(final_vcf_fpath)
    shutil.copy(vcf_fpath, final_vcf_fpath)

    if cnf.qc:
        report = qc.make_report(cnf, final_vcf_fpath, sample)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        report = qc.save_report(cnf, report, sample, callername, qc_dirpath, source.varqc_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if final_vcf_fpath.endswith('.gz'):
        if not is_gz(final_vcf_fpath):
            err(final_vcf_fpath + ' is in incorrect gzip format')
            anno_vcf_fpath_ungz = splitext(final_vcf_fpath)[0]
            anno_vcf_fpath_gz = final_vcf_fpath
            os.rename(anno_vcf_fpath_gz, anno_vcf_fpath_ungz)
        else:
            info(final_vcf_fpath + ' is a good gzipped file.')
            return [final_vcf_fpath]
    else:
        info('Compressing and indexing with bgzip+tabix ' + final_vcf_fpath)
        final_vcf_fpath = bgzip_and_tabix(cnf, final_vcf_fpath)
        info('Saved VCF again to ' + final_vcf_fpath)
        return [final_vcf_fpath]

def combine_targqc(cnf, bcbio_structures, tag_by_sample):
    samples = [s for bs in bcbio_structures for s in bs.samples]

    output_dir = join(cnf.output_dir, BCBioStructure.targqc_summary_dir)
    safe_mkdir(output_dir)
    summarize_targetcov.summarize_targqc(cnf, cnf.threads or len(samples), output_dir, samples,
                                         tag_by_sample=tag_by_sample)

def add_project_files_to_jbrowse(cnf, bcbio_structure):
    genome = cnf.genome.name
    jbrowse_data_path, _, _ = set_folders(genome)

    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, bcbio_structure.project_name)

    safe_mkdir(jbrowse_project_dirpath)
    jbrowse_tracks_fpath = join(jbrowse_data_path, 'tracks.conf')

    vcf_fpath_by_sample = None
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')
    if caller:
        vcf_fpath_by_sample = caller.get_filt_vcf_by_sample()

    for sample in bcbio_structure.samples:
        if sample.bam:
            index_bam(cnf, sample.bam, use_grid=True)

    # tabix binary, used below to index VCFs that are missing a .tbi
    tabix = get_system_path(cnf, 'tabix')

    for sample in bcbio_structure.samples:
        if all(isfile(join(jbrowse_project_dirpath, sample.name + ext))
               for ext in ['.bam', '.bam.bai', '.vcf.gz', '.vcf.gz.tbi', '.bigwig']) \
                and check_tracks_in_configs(sample.name, bcbio_structure.project_name,
                                            jbrowse_tracks_fpath, vcf_fpath_by_sample):
            info(sample.name + ' was exported to jBrowse previously.')
            continue

        vcf_link = None
        if vcf_fpath_by_sample:
            vcf_fpath = vcf_fpath_by_sample[sample.name] if sample.name in vcf_fpath_by_sample else None
            if vcf_fpath and verify_file(vcf_fpath):
                vcf_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                                  sample.name, vcf_fpath)
                if not verify_file(vcf_fpath + '.tbi'):
                    cmdline = '{tabix} {vcf_fpath}'.format(**locals())
                    call(cnf, cmdline, exit_on_error=False)
                create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                       sample.name, vcf_fpath + '.tbi')

        if sample.bam:
            bam_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                              sample.name, sample.bam)
            create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                   sample.name, sample.bam + '.bai')
            bigwig_link = create_jbrowse_symlink(genome, bcbio_structure.project_name,
                                                 sample.name, splitext(sample.bam)[0] + '.bigwig')

        print_sample_tracks_info(sample.name, bcbio_structure.project_name,
                                 trunc_symlink(bam_link), trunc_symlink(bigwig_link),
                                 trunc_symlink(vcf_link), jbrowse_tracks_fpath)

def main(args):
    if len(args) < 2:
        critical('Usage: ' + __file__ + ' InputRootDirectory OutputRootDirectory [Build=hg38]')
        sys.exit(1)

    inp_root = adjust_path(args[0])
    out_root = adjust_path(args[1])

    build = 'hg38'
    if len(args) >= 3:
        build = args[2]
    chain_fpath = chains[build.lower()]

    for inp_dirpath, subdirs, files in os.walk(inp_root):
        for fname in files:
            if fname.endswith('.bed'):
                inp_fpath = adjust_path(join(inp_dirpath, fname))
                print inp_fpath + ': ' + str(count_bed_cols(inp_fpath)) + ' columns'

                out_dirpath = adjust_path(join(out_root, relpath(inp_dirpath, inp_root)))
                safe_mkdir(out_dirpath)
                out_fpath = adjust_path(join(out_dirpath, fname))
                unlifted_fpath = adjust_path(join(out_dirpath, fname + '.unlifted'))

                cmdline = ''
                liftover_inp_fpath = inp_fpath
                with open(inp_fpath) as f:
                    fs = f.readline().split('\t')
                try:
                    int(fs[6])
                    int(fs[7])
                except (IndexError, ValueError):
                    info('Cutting ' + inp_fpath)
                    cmdline += 'cut -f1,2,3,4 "{inp_fpath}" > __cut; '
                    # liftOver should read the cut file only when a cut was
                    # actually performed; otherwise use the original BED
                    liftover_inp_fpath = '__cut'

                cmdline += liftover_fpath + ' "{liftover_inp_fpath}" {chain_fpath} "{out_fpath}" "{unlifted_fpath}"'
                cmdline = cmdline.format(**locals())
                info(cmdline)
                os.system(cmdline)

                verify_file(out_fpath)
                if isfile(unlifted_fpath):
                    if getsize(unlifted_fpath) <= 0:
                        os.remove(unlifted_fpath)
                    else:
                        err('Some records were unlifted and saved to ' + unlifted_fpath)

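# For a BED file whose columns 7-8 are not integers, the loop above emits a
# shell command along these lines (illustrative paths; `liftover_fpath` and the
# chain file come from module-level globals):
#
#   cut -f1,2,3,4 "/inp/panel/a.bed" > __cut; liftOver "__cut" hg19ToHg38.over.chain.gz "/out/panel/a.bed" "/out/panel/a.bed.unlifted"
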
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('--suffix', dest='suffix', default='subset', help='Output files suffix')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=5e5, type='int',
                      help='Downsample reads to avoid excessive processing times with large files. '
                           'Default is 500,000. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True) \
        if opts.right_reads_fpath else None
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir \
        else critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)
    safe_mkdir(output_dirpath)

    with workdir(cnf):
        info('Downsampling to ' + str(cnf.downsample_to))
        downsample(cnf, cnf.sample_name, left_reads_fpath, right_reads_fpath,
                   cnf.downsample_to, output_dir=cnf.output_dir, suffix=cnf.suffix)

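# Example invocation (illustrative file names; the script name is hypothetical):
#
#   python downsample_fastq.py -1 sample1_R1.fq.gz -2 sample1_R2.fq.gz \
#       --sample sample1 --downsample-to 500000 -o preproc/
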
def main():
    info(' '.join(sys.argv))
    info()

    cnf, bcbio_structure = bcbio_summary_script_proc_params('expression', BCBioStructure.expression_dir)

    step_greetings('Gene expression heatmaps summary for all samples')
    report_caption_names = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)
    for counts_fname, report_caption_name in zip(bcbio_structure.counts_names, report_caption_names):
        counts_fpath = join(bcbio_structure.expression_dirpath, counts_fname)
        if not verify_file(counts_fpath, silent=True):
            raw_counts_fpath = join(bcbio_structure.expression_dirpath, 'raw',
                                    'combined.' + counts_fname.replace('.tsv', ''))
            info('Annotating ' + report_caption_name + ' from ' + raw_counts_fpath)
            annotate_gene_counts(cnf, raw_counts_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=counts_fname)

        isoforms_found = counts_fname == 'isoform.sf.tpm' and counts_fpath
        used_dict = transcripts_dict if isoforms_found else genes_dict
        report_fpath = join(safe_mkdir(join(bcbio_structure.expression_dirpath, 'html')),
                            counts_fname.replace('.tsv', '') + '.html')
        make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, used_dict,
                                      report_fpath, report_caption_name,
                                      keep_gene_names=isoforms_found)
    info('Done')

def concat_fastqs(self, get_fastq_regexp, cnf):
    # Parenthesized: string concatenation binds tighter than `or`, so without
    # the parentheses the fallback name would never be used
    info('Preparing fastq files for the project named ' + (self.name or self.az_project_name))
    if self.mergred_dir_found:
        info('  found already merged fastq dir, skipping.')
        return
    if not self.sample_by_name:
        err('  no samples found.')
        return
    safe_mkdir(self.fastq_dirpath)
    for s in self.sample_by_name.values():
        _concat_fastq(cnf, s.find_raw_fastq(get_fastq_regexp, 'R1'), s.l_fpath)
        _concat_fastq(cnf, s.find_raw_fastq(get_fastq_regexp, 'R2'), s.r_fpath)
    info()

def _summarize_varqc(cnf, output_dir, samples, caption, post_filter=False):
    name = source.varqc_name
    if post_filter:
        name = source.varqc_after_name
    varqc_dir = join(output_dir, name)
    safe_mkdir(varqc_dir)

    info('VarQC ' + ('(post-filtering) ' if post_filter else '') + 'summary, saving to ' + output_dir)

    jsons_by_sample = dict()
    for s in samples:
        fpath = join((s.varannotate_dirpath if not post_filter else s.varfilter_dirpath), 'qc',
                     s.name + (('-' + cnf.caller) if cnf.caller else '') + '.' + name + '.json')
        if verify_file(fpath):
            jsons_by_sample[s.name] = fpath

    htmls_by_sample = dict()
    for s in samples:
        fpath = join((s.varannotate_dirpath if not post_filter else s.varfilter_dirpath), 'qc',
                     s.name + (('-' + cnf.caller) if cnf.caller else '') + '.' + name + '.html')
        if verify_file(fpath):
            htmls_by_sample[s.name] = fpath

    report = FullReport.construct_from_sample_report_jsons(
        samples, output_dir, jsons_by_sample=jsons_by_sample, htmls_by_sample=htmls_by_sample)
    full_summary_fpaths = report.save_into_files(
        cnf, join(varqc_dir, name),
        caption='Variant QC' + (' post-varfilter' if post_filter else '') +
                ((', ' + caption) if caption else ''))

    info()
    info('*' * 70)
    for fpath in full_summary_fpaths:
        if fpath:
            info(fpath)
    return full_summary_fpaths

def picard_ins_size_hist(cnf, sample, bam_fpath, output_dir):
    picard = get_system_path(cnf, 'java', 'picard')
    if picard:
        safe_mkdir(dirname(sample.picard_ins_size_hist_txt_fpath))
        safe_mkdir(dirname(sample.picard_ins_size_hist_pdf_fpath))
        info('Picard ins size hist for "' + basename(bam_fpath) + '"')
        cmdline = '{picard} CollectInsertSizeMetrics' \
                  ' I={bam_fpath}' \
                  ' O={sample.picard_ins_size_hist_txt_fpath}' \
                  ' H={sample.picard_ins_size_hist_pdf_fpath}' \
                  ' VALIDATION_STRINGENCY=LENIENT'
        cmdline = cmdline.format(**locals())
        call(cnf, cmdline, output_fpath=sample.picard_ins_size_hist_txt_fpath,
             stdout_to_outputfile=False, exit_on_error=False)

def _symlink_to_dir(fpath, dirpath):
    if not isdir(dirpath):
        safe_mkdir(dirpath)

    dst_path = join(dirpath, basename(fpath))

    if islink(dst_path) or isfile(dst_path):
        try:
            os.remove(dst_path)
        except OSError:
            err('Cannot symlink ' + fpath + ' -> ' + dst_path + ': cannot remove ' + dst_path)
            return

    try:
        symlink_plus(fpath, dst_path)
    except OSError:
        err('Cannot symlink ' + fpath + ' -> ' + dst_path)

def run_fastqc(cnf, fastq_fpath, output_basename, fastqc_dirpath, need_downsample=True):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + output_basename + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline_l = '{fastqc} --dir {tmp_dirpath} --extract -o {fastqc_dirpath} -f fastq -j {java} {fastq_fpath}'.format(**locals())
    j = submit_job(cnf, cmdline_l, 'FastQC_' + output_basename,
                   run_on_chara=True, stdout_to_outputfile=False)
    # output_fpath=join(fastqc_dirpath, output_basename + '_fastqc', 'fastqc_report.html'))
    return j

def create_jbrowse_symlink(genome, project_name, sample, file_fpath):
    jbrowse_data_path, _, _ = set_folders(genome)
    jbrowse_dirpath = join(jbrowse_data_path, 'tracks')
    jbrowse_project_dirpath = join(jbrowse_dirpath, project_name)

    base, ext = splitext_plus(file_fpath)
    if ext in ['.tbi', '.bai']:
        base, ext2 = splitext_plus(base)
        ext = ext2 + ext
    sym_link = join(jbrowse_project_dirpath, sample + ext)

    if not verify_dir(jbrowse_project_dirpath):
        safe_mkdir(jbrowse_project_dirpath)
    if isfile(file_fpath) and not isfile(sym_link):
        try:
            os.symlink(file_fpath, sym_link)
        except OSError:
            warn(traceback.format_exc())
    if isfile(sym_link):
        change_permissions(sym_link)
    return sym_link

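# Worked example of the extension handling above (assuming splitext_plus splits
# off known compound extensions such as '.vcf.gz'):
#
#   create_jbrowse_symlink('hg38', 'proj1', 'sample1', '/data/proj1/sample1-vardict.vcf.gz.tbi')
#   # first split: ext='.tbi' -> re-split the base, so ext becomes '.vcf.gz.tbi'
#   # -> link <jbrowse_data>/tracks/proj1/sample1.vcf.gz.tbi
#
# i.e. index files end up next to their data files under a uniform
# sample-based name, regardless of the original file naming.
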
def set_up_dirs(cnf, log_dir_name='log'):
    """ Creates output_dir, work_dir; sets up log
    """
    if cnf.output_dir:
        cnf.output_dir = adjust_path(cnf.output_dir)
        safe_mkdir(cnf.output_dir, 'output_dir')
        info('Saving into ' + cnf.output_dir)

    set_up_work_dir(cnf)

    if cnf.log_dir == '-':
        cnf.log_dir = None
    else:
        if not cnf.log_dir:
            cnf.log_dir = join(cnf.work_dir, log_dir_name)
        safe_mkdir(cnf.log_dir)
        info('Created log dir ' + cnf.log_dir)

    set_up_log(cnf)

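# Resulting layout for a typical call (illustrative; assumes cnf.output_dir is
# 'results', no explicit work_dir or log_dir, and no cnf.sample set):
#
#   results/           <- cnf.output_dir
#   results/work/      <- cnf.work_dir (created by set_up_work_dir, below)
#   results/work/log/  <- cnf.log_dir
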
def run_targqc(cnf, bam_by_sample, bed_fpath, output_dirpath):
    info('Running TargQC for downsampled BAMs')
    targqc = get_script_cmdline(cnf, 'python', 'targqc.py', is_critical=True)
    targqc_work_dir = join(cnf.work_dir, 'TargQC')
    targqc_log_dir = join(cnf.log_dir, 'TargQC')
    safe_mkdir(targqc_work_dir)
    safe_mkdir(targqc_log_dir)
    bed_cmdl = ''
    if bed_fpath:
        bed_cmdl = '--bed ' + bed_fpath
    bam_cmdl = ' '.join(bam_fpath + ',' + sname for sname, bam_fpath in bam_by_sample.items())
    cmdl = '{targqc} --sys-cnf {cnf.sys_cnf} {bam_cmdl} {bed_cmdl} ' \
           '--work-dir {targqc_work_dir} --log-dir {targqc_log_dir} --project-name {cnf.project_name} ' \
           '-o {output_dirpath} --genome {cnf.genome.name}'.format(**locals())
    if cnf.reuse_intermediate:
        cmdl += ' --reuse'
    call(cnf, cmdl)

def merge_bcbio_yamls(cnf, bcbio_structures):
    today_date = datetime.datetime.now()
    today_bcbio_date = today_date.strftime("%Y-%m-%d")
    bcbio_cnfs = [bs.bcbio_cnf for bs in bcbio_structures]
    merged_yaml_fpath = join(cnf.output_dir, 'config', 'bcbio.yaml')
    # the config subdirectory must exist before the merged YAML is written
    safe_mkdir(dirname(merged_yaml_fpath))
    merged_bcbio_cnf = dict()
    merged_bcbio_cnf['fc_date'] = today_bcbio_date
    merged_bcbio_cnf['fc_name'] = 'bcbio'
    merged_bcbio_cnf['upload'] = bcbio_cnfs[0]['upload']
    merged_bcbio_cnf['details'] = []
    for bs_cnf in bcbio_cnfs:
        bs_cnf['fc_date'] = today_bcbio_date
        bs_cnf['fc_name'] = 'bcbio'
        merged_bcbio_cnf['details'].extend(bs_cnf['details'])
    with open(merged_yaml_fpath, 'w') as yaml_file:
        yaml_file.write(save_yaml(merged_bcbio_cnf))
    return merged_bcbio_cnf

def combine_varqc(cnf, bcbio_structures, tag_by_sample, varqc_dirname, varqc_name, caption):
    callers = []
    samples = []
    for bc in bcbio_structures:
        for vc in bc.variant_callers.values():
            if vc.name not in [c.name for c in callers]:
                callers.append(vc)

    jsons_by_sample_by_caller = defaultdict(dict)
    htmls_by_sample_by_caller = defaultdict(dict)
    for bc in bcbio_structures:
        for vc in bc.variant_callers.values():
            fpath_by_sample = vc.find_fpaths_by_sample(varqc_dirname, varqc_name, 'json', bc.final_dirpath)
            for sname, fpath in fpath_by_sample.items():
                jsons_by_sample_by_caller[vc.name][sname] = fpath
            fpath_by_sample = vc.find_fpaths_by_sample(varqc_dirname, varqc_name, 'html', bc.final_dirpath)
            for sname, fpath in fpath_by_sample.items():
                htmls_by_sample_by_caller[vc.name][sname] = fpath
            samples.extend(vc.samples)

    output_dir = join(cnf.output_dir, varqc_dirname)
    safe_mkdir(output_dir)
    if jsons_by_sample_by_caller and htmls_by_sample_by_caller:
        summarize_qc.make_summary_reports(cnf, 1, output_dir, callers, samples,
                                          jsons_by_sample_by_caller, htmls_by_sample_by_caller,
                                          tag_by_sample, varqc_name=varqc_name, caption=caption)
    else:
        err('No JSON or HTML found, cannot generate summary reports.')

def run_fastq(cnf, sample_name, l_r_fpath, r_r_fpath, output_dirpath, downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        # Use the downsampled pair for the combined FastQC input below
        l_r_fpath, r_r_fpath = downsample(cnf, sample_name, l_r_fpath, r_r_fpath,
                                          downsample_to, output_dir=cnf.work_dir)

    # Joining fastq files to run on the combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_r_fpath).read())
        out.write(open_gzipsafe(r_r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(**locals())
    call(cnf, cmdline)

    # Cleaning and getting the report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)
    return sample_fastqc_dirpath

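# A minimal usage sketch (illustrative paths; assumes a configured `cnf` with
# work_dir set and fastqc/java resolvable from the system config):
#
#   fastqc_dir = run_fastq(cnf, 'sample1', 'sample1_R1.fq.gz', 'sample1_R2.fq.gz',
#                          output_dirpath='/results/fastqc', downsample_to=1e6)
#   # -> /results/fastqc/sample1.fq_fastqc, containing fastqc_report.html
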
def set_up_work_dir(cnf):
    # timestamp = str(datetime.datetime.now())
    # user_prid = getpass.getuser()
    # hasher = hashlib.sha1( + timestamp)
    # path_hash = base64.urlsafe_b64encode(hasher.digest()[0:4])[:-1]

    if not cnf.work_dir:
        if cnf.output_dir:
            work_dir_name = 'work' + ('_' + cnf.sample if cnf.sample else '')
            cnf.work_dir = join(cnf.output_dir, work_dir_name)
            info('Work dir: ' + cnf.work_dir)
            # if not cnf.reuse_intermediate and isdir(cnf.work_dir):
            #     rmtree(cnf.work_dir)
        else:
            cnf.work_dir = tempfile.mkdtemp()
            info('Creating temporary directory for work dir: ' + cnf.work_dir)
    else:
        cnf.work_dir = adjust_path(cnf.work_dir)
        info('Work dir: ' + cnf.work_dir)

    safe_mkdir(cnf.work_dir, 'working directory')

def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script converts Vardict TXT file to VCF.'
    parser = OptionParser(
        description=description,
        usage='Usage: ' + basename(__file__) + ' [-o Output_directory -c Var_caller_name] Project_directory')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir', default='-')
    parser.add_option('-c', '--caller', dest='caller_name', default='vardict')
    parser.add_option('-o', dest='output_dir', help='Output directory.')

    cnf, bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths, tags, is_wgs_in_bcbio, is_rnaseq \
        = process_post_bcbio_args(parser)

    if not bcbio_project_dirpaths:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    bcbio_structures = []
    for bcbio_project_dirpath, bcbio_cnf, final_dirpath in zip(
            bcbio_project_dirpaths, bcbio_cnfs, final_dirpaths):
        bs = BCBioStructure(cnf, bcbio_project_dirpath, bcbio_cnf, final_dirpath)
        bcbio_structures.append(bs)

    cnf.work_dir = cnf.work_dir or adjust_path(join(cnf.output_dir, 'work'))
    safe_mkdir(cnf.work_dir)

    info('')
    info('*' * 70)
    for bs in bcbio_structures:
        for sample in bs.samples:
            if sample.phenotype != 'normal':
                convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample)

def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_sam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None

def generate_flagged_regions_report(cnf, output_dir, sample, ave_depth, gene_by_key):
    depth_threshs = cnf.coverage_reports.depth_thresholds
    report = PerRegionSampleReport(sample=sample,
                                   metric_storage=get_detailed_metric_storage(depth_threshs))
    report.add_record('Sample', sample.name)
    safe_mkdir(sample.flagged_regions_dirpath)
    '''
    1. Detect the depth threshold (average sample coverage * DEPTH_THRESH_FROM_AVE_COV)
    2. Select regions covered at less than MIN_DEPTH_PERCENT_AT_THRESH at that threshold
    3. Sort by % at threshold
    4. Select those parts of those regions where % = 0, save them to BED
    5. Find HotSpots in those regions
    6. Intersect HotSpots with tracks

    For each gene that has regions with parts where % = 0: sort by the part where % = 0
    '''
    # vcf_dbs = ['oncomine', 'dbsnp', 'cosmic']
    vcf_dbs = ['oncomine']

    from source._deprecated_clinical_reporting.clinical_parser import get_key_or_target_bed_genes
    key_genes, _ = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    depth_cutoff = get_depth_cutoff(ave_depth, depth_threshs)
    genes_sorted = sorted(gene_by_key.values())
    min_cov, max_cov = min_and_max_based_on_outliers(genes_sorted)

    for coverage_type in ['low', 'high']:
        info('Selecting and saving ' + coverage_type + ' covered genes')
        selected_genes = []
        if coverage_type == 'low':
            selected_genes = [
                g for g in genes_sorted if g.gene_name in key_genes and
                (any(e.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                     for e in g.get_exons()) or
                 any(a.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                     for a in g.get_amplicons()))]
        else:
            if max_cov:
                selected_genes = [
                    g for g in genes_sorted if g.gene_name in key_genes and
                    (any(e.avg_depth > max_cov for e in g.get_exons()) or
                     any(a.avg_depth > max_cov for a in g.get_amplicons()))]

        for region_type in ['exons', 'target']:
            selected_regions = []
            for gene in selected_genes:
                if coverage_type == 'low':
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type == 'target' else gene.get_exons())
                        if a.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                        and 'Multi' not in a.feature]
                else:
                    cur_regions = [
                        a for a in (gene.get_amplicons() if region_type == 'target' else gene.get_exons())
                        if a.avg_depth > max_cov and 'Multi' not in a.feature]
                selected_regions.extend(cur_regions)

            if selected_regions:
                selected_regions_bed_fpath = join(sample.flagged_regions_dirpath,
                                                  coverage_type + '_cov_' + region_type + '.bed')
                save_regions_to_bed(cnf, selected_regions, selected_regions_bed_fpath)

                # Report coverage for HotSpots
                for db in vcf_dbs:
                    res = _report_normalize_coverage_for_variant_sites(
                        cnf, sample, ave_depth, db, selected_regions_bed_fpath,
                        selected_regions, depth_cutoff, region_type, coverage_type)
                    if not res:
                        return None

            report = make_flat_region_report(sample, selected_regions, depth_threshs)
            flagged_txt_fpath = add_suffix(add_suffix(sample.flagged_txt, region_type), coverage_type)
            flagged_tsv_fpath = add_suffix(add_suffix(sample.flagged_tsv, region_type), coverage_type)
            report.save_txt(flagged_txt_fpath)
            report.save_tsv(flagged_tsv_fpath)
            info('')
            info(coverage_type + ' covered ' + region_type + ' (total ' + str(len(selected_regions)) +
                 ') for sample ' + sample.name + ' saved into:')
            info('  ' + flagged_txt_fpath + ', ' + flagged_tsv_fpath)

    return report

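# Worked example of the 'low' selection rule above (numbers are illustrative:
# suppose get_depth_cutoff(ave_depth, depth_threshs) returns 10 and
# MIN_DEPTH_PERCENT_AT_THRESH is 0.5): a key gene is flagged as low-covered
# when any of its exons or amplicons has rates_within_threshs[10] < 0.5, i.e.
# less than half of the region's bases reach 10x coverage. The 'high' rule
# instead flags genes with any region whose avg_depth exceeds the
# outlier-based max_cov returned by min_and_max_based_on_outliers().
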
def __get_mapped_reads(cnf, samples, bam_by_sample, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath, samples

    mapped_reads_by_sample = OrderedDict()

    job_by_sample = dict()
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for s in not_submitted_samples:
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)

                # if verify_file(s.targetcov_json_fpath, silent=True):
                #     info('Parsing targetSeq output ' + s.targetcov_json_fpath)
                #     with open(s.targetcov_json_fpath) as f:
                #         data = load(f, object_pairs_hook=OrderedDict)
                #     cov_report = SampleReport.load(data, s)
                #     mapped_reads = next(rec.value for rec in cov_report.records if rec.metric.name == 'Mapped reads')
                #     info(s.name + ': ')
                #     info('  Mapped reads: ' + str(mapped_reads))
                #     mapped_reads_by_sample[s.name] = mapped_reads
                #     reused_samples.append(s)
                #     continue
                #
                # else:
                if s.name not in bam_by_sample:
                    err('No BAM for ' + s.name + ', not running Seq2C')
                    return None, None
                info('Submitting a sambamba job to get mapped read numbers')
                bam_fpath = bam_by_sample[s.name]
                j = number_of_mapped_reads(cnf, bam_fpath, dedup=True, use_grid=True, sample_name=s.name)
                job_by_sample[s.name] = j
                submitted_samples.append(s)
                if not j.is_done:
                    jobs_to_wait.append(j)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [_s for _s in not_submitted_samples
                                         if _s not in submitted_samples and _s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished with ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    # wait_for_jobs(cnf, job_by_sample.values())
    for s_name, j in job_by_sample.items():
        if j and j.is_done and not j.is_failed:
            with open(j.output_fpath) as f:
                mapped_reads = int(f.read().strip())
            info(s_name + ': ')
            info('  Mapped reads: ' + str(mapped_reads))
            mapped_reads_by_sample[s_name] = mapped_reads
        else:
            err('ERROR: ' + s_name + ' could not get mapped reads, log saved to ' + j.log_fpath)

    with open(output_fpath, 'w') as f:
        for sample_name, mapped_reads in mapped_reads_by_sample.items():
            f.write(sample_name + '\t' + str(mapped_reads) + '\n')

    verify_file(output_fpath, is_critical=True)

    successful_samples = [s for s in samples if s.name in mapped_reads_by_sample]
    info('Samples processed: ' + str(len(samples)) + ', successfully: ' + str(len(successful_samples)))

    return output_fpath, successful_samples

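# The resulting output_fpath is a two-column TSV, one line per successfully
# counted sample, e.g.:
#
#   sample1	23456789
#   sample2	19873210
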
def __seq2c_coverage(cnf, samples, bams_by_sample, bed_fpath, is_wgs, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    jobs_by_sample = dict()
    depth_output_by_sample = dict()
    seq2cov_output_by_sample = dict()
    seq2c_work_dirpath = join(cnf.work_dir, source.seq2c_name)
    safe_mkdir(seq2c_work_dirpath)
    info()

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for s in not_submitted_samples:
            info('*' * 50)
            info(s.name + ':')
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                seq2cov_output_by_sample[s.name] = join(seq2c_work_dirpath, s.name + '.seq2cov.txt')

                if not cnf.reuse_intermediate and isfile(seq2cov_output_by_sample[s.name]):
                    os.remove(seq2cov_output_by_sample[s.name])

                if cnf.reuse_intermediate and verify_file(seq2cov_output_by_sample[s.name], silent=True):
                    info(seq2cov_output_by_sample[s.name] + ' exists, reusing')
                    reused_samples.append(s)
                    continue

                elif verify_file(s.targetcov_detailed_tsv, silent=True):
                    info('Using targetcov detailed output for Seq2C coverage.')
                    info(s.name + ': using targetseq output')
                    targetcov_details_to_seq2cov(cnf, s.targetcov_detailed_tsv,
                                                 seq2cov_output_by_sample[s.name], s.name, is_wgs=is_wgs)
                    reused_samples.append(s)
                    continue

                else:
                    info(s.name + ': ' + s.targetcov_detailed_tsv +
                         ' does not exist: submitting sambamba depth')
                    bam_fpath = bams_by_sample[s.name]
                    depth_output = join(seq2c_work_dirpath, s.name + '_depth' + '.txt')
                    depth_output_by_sample[s.name] = depth_output
                    if cnf.reuse_intermediate and verify_file(depth_output, silent=True):
                        info(depth_output + ' exists, reusing')
                        reused_samples.append(s)
                        continue
                    else:
                        j = sambamba_depth(cnf, bed_fpath, bam_fpath, depth_output,
                                           use_grid=True, sample_name=s.name)
                        jobs_by_sample[s.name] = j
                        submitted_samples.append(s)
                        if not j.is_done:
                            jobs_to_wait.append(j)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [_s for _s in not_submitted_samples
                                         if _s not in submitted_samples and _s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished with ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info()
    info('*' * 50)
    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))

    # wait_for_jobs(cnf, jobs_by_sample.values())
    for s_name, seq2cov_output_fpath in seq2cov_output_by_sample.items():
        if not isfile(seq2cov_output_fpath):
            if verify_file(depth_output_by_sample[s_name], is_critical=True,
                           description='depth_output_by_sample for ' + s_name):
                info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
                bed_col_num = count_bed_cols(bed_fpath)
                sambamba_depth_to_seq2cov(cnf, depth_output_by_sample[s_name],
                                          seq2cov_output_by_sample[s_name], s_name, bed_col_num)

    # script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
    #                             is_critical=True)
    # bedcov_hist_fpath = depth_output_by_sample[s_name]
    # cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
    # j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
    # sum_jobs_by_sample[s_name] = j

    # sum_jobs_by_sample = dict()
    # info('* Submitting seq2cov output *')
    # for s_name, j in jobs_by_sample.items():
    #     if not verify_file(seq2cov_output_by_sample[s_name], silent=True):
    #         info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
    #
    #         script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
    #                                     is_critical=True)
    #         bedcov_hist_fpath = depth_output_by_sample[s_name]
    #         bed_col_num = count_bed_cols(seq2c_bed)
    #         cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
    #         j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
    #         sum_jobs_by_sample[s_name] = j
    #
    # wait_for_jobs(cnf, sum_jobs_by_sample.values())

    info()
    info('Done')
    info('*' * 50)
    info()
    info('Combining seq2cov output')
    with open(output_fpath, 'w') as out:
        for i, s in enumerate(samples):
            verify_file(seq2cov_output_by_sample[s.name],
                        description='seq2cov_output for ' + s.name, is_critical=True)
            with open(seq2cov_output_by_sample[s.name]) as inp:
                for l in inp:
                    out.write(l)
    verify_file(output_fpath, description='__seq2c_coverage output_fpath', is_critical=True)
    info('Saved combined seq2cov output to ' + output_fpath)
    info()
    return output_fpath

def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants, mutations, vcf2txt_res_fpath):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' + var_sample.pass_filt_vcf_fpath + ' exist; reusing.')
    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))
        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
                file_transaction(work_dir, ungz) as filt_tx, \
                file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath + ', writing to ' +
                     ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write('##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n')
                            filt_f.write('##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n')
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val + ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

    if False:  # TSV conversion intentionally disabled
        info()
        info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered TSVs')
        # Converting to TSV - saving .anno.filt.tsv
        if 'tsv_fields' in cnf.annotation and cnf.tsv:
            tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
            if not tmp_tsv_fpath:
                err('TSV conversion didn\'t work')
            else:
                if isfile(var_sample.filt_tsv_fpath):
                    os.remove(var_sample.filt_tsv_fpath)
                shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz

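# Worked example of the FILTER rewriting above (illustrative records): a record
# keyed (chrom, pos, alt) present in `mutations` gets FILTER forced to PASS and
# is written to both the .filt and .pass files. A record absent from
# `mutations` whose FILTER was '', '.' or 'PASS' is rewritten and goes to the
# .filt file only:
#
#   variants.get(key) is None    -> FILTER becomes 'vcf2txt'     (hard-filtered by vcf2txt.pl)
#   variants.get(key) == 'TRUE'  -> FILTER becomes 'vardict2mut' (hard-filtered by vardict2mut.pl)
#   otherwise                    -> FILTER becomes the stored filter value
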