def get_chr_len_fpath(cnf):
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')
    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath
    else:
        if not cnf.genome.seq:
            critical('There is no "seq" key in ' + cnf.sys_cnf + ' for "' + cnf.genome.name + '" section')
            return None

        chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))
        with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
            with open(tx, 'w') as handle:
                for c, l in chr_lengths:
                    handle.write(c + '\t' + str(l) + '\n')
    return chr_len_fpath

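# Illustrative note (not part of the pipeline code): the chr_lengths.txt file produced
# above is a two-column, tab-separated list of chromosome names and lengths, e.g. for hg19:
#   chr1    249250621
#   chr2    243199373
# Downstream bedtools calls consume it via the -g option (see get_bedgraph_coverage below).
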
def _concat_fastq(cnf, fastq_fpaths, output_fpath):
    if len(fastq_fpaths) == 1:
        if not isfile(output_fpath):
            info(' no need to merge - symlinking ' + fastq_fpaths[0] + ' -> ' + output_fpath)
            if not isdir(dirname(output_fpath)):
                critical('Dir for the symlink ' + dirname(output_fpath) + ' does not exist')
            os.symlink(fastq_fpaths[0], output_fpath)
        return output_fpath
    else:
        info(' merging ' + ', '.join(fastq_fpaths))
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
        else:
            with file_transaction(cnf.work_dir, output_fpath) as tx:
                with open(tx, 'w') as out:
                    for fq_fpath in fastq_fpaths:
                        with open(fq_fpath, 'r') as inp:
                            shutil.copyfileobj(inp, out)
        return output_fpath

def sort_bed_by_alphabet(cnf, input_bed_fpath, output_bed_fpath=None, chr_len_fpath=None):
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    chromosomes = set([c for (c, l) in chr_lengths])
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath else add_suffix(input_bed_fpath, 'sorted')

    regions = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    chunk_counter = 0
    with open(input_bed_fpath) as f:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue
                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    if chrom not in chromosomes:
                        continue
                    if chunk_counter == chunk_size or not regions[chrom]:
                        chunk_counter = 0
                        regions[chrom].append('')
                    regions[chrom][-1] += l
                    chunk_counter += 1
                for chr in sorted(regions.keys()):
                    for region in regions[chr]:
                        out.write(region)
    return output_bed_fpath

def get_bedgraph_coverage(cnf, bam_fpath, chr_len_fpath=None, output_fpath=None, bed_fpath=None, exit_on_error=True):
    chr_len_fpath = chr_len_fpath or get_chr_len_fpath(cnf)
    dedup_bam = intermediate_fname(cnf, bam_fpath, source.dedup_bam)
    if not verify_bam(dedup_bam, silent=True):
        info('Deduplicating bam file ' + bam_fpath)
        remove_dups(cnf, bam_fpath, dedup_bam)
    else:
        info(dedup_bam + ' exists')
    index_bam(cnf, dedup_bam)

    bam_bed_fpath = bam_to_bed(cnf, dedup_bam, to_gzip=False)
    if getsize(bam_bed_fpath) <= 0:
        info('No coverage for ' + bam_fpath + ', skipping.')
        return None

    sorted_bed_fpath = sort_bed_by_alphabet(cnf, bam_bed_fpath, chr_len_fpath=chr_len_fpath)
    if bed_fpath:
        in_bed_fpath = intersect_bed(cnf, sorted_bed_fpath, bed_fpath)
    else:
        in_bed_fpath = sorted_bed_fpath

    if not verify_file(in_bed_fpath, silent=True):
        info('No coverage in ' + in_bed_fpath)
        return None

    bedgraph_fpath = output_fpath or '%s.bedgraph' % splitext(bam_fpath)[0]
    with file_transaction(cnf.work_dir, bedgraph_fpath) as tx_fpath:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdl = '{bedtools} genomecov -bg -split -g {chr_len_fpath} -i {in_bed_fpath}'.format(**locals())
        call(cnf, cmdl, exit_on_error=exit_on_error, output_fpath=tx_fpath)
    return bedgraph_fpath

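# Sketch of the command assembled above (paths are illustrative, not taken from a real run):
#   bedtools genomecov -bg -split -g chr_lengths.txt -i sample.dedup.sorted.bed > sample.bedgraph
# i.e. per-region read depth in BedGraph format, computed from the deduplicated, sorted BED.
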
def annotate_gene_counts(cnf, counts_fpath, ann_counts_fpath, genes_dict):
    unannotated_fpath = counts_fpath
    if not verify_file(unannotated_fpath):
        critical('Not found counts ' + unannotated_fpath)
    with file_transaction(cnf.work_dir, ann_counts_fpath) as tx:
        with open(tx, 'w') as annotated_f:
            with open(unannotated_fpath) as f:
                for i, l in enumerate(f):
                    if i == 0:
                        header = l.replace('\n', '').split('\t')
                        l = '\t'.join(header + ['HUGO'])
                        annotated_f.write(l + '\n')
                        continue
                    fs = l.replace('\n', '').split('\t')
                    gene_and_exon = fs[0].split(':')
                    gene_id = gene_and_exon[0]
                    if gene_id not in genes_dict:
                        continue
                    gene_symbol = genes_dict[gene_id]
                    l = '\t'.join(fs + [gene_symbol])
                    annotated_f.write(l + '\n')
    if not verify_file(ann_counts_fpath):
        critical('Could not annotate counts ' + unannotated_fpath)

def parse_svs(cnf, sv_file, out_bed_fpath):
    """ Parse sv vcf into a bed file """
    bp_dict = {}
    vcf_reader = vcf.Reader(filename=sv_file)

    with file_transaction(cnf.work_dir, out_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for record in vcf_reader:
                if record.FILTER is None:
                    record.FILTER = []
                try:  # if there is no SVTYPE or MATEID, ignore for now
                    if record.INFO['SVTYPE'] == "BND":
                        if record.INFO['MATEID'][0] not in bp_dict:
                            # Store the record in the dict until its pair is found
                            bp_dict[record.ID] = record
                        else:
                            # If the other BND is in the dict, annotate
                            record2 = bp_dict[record.INFO['MATEID'][0]]
                            try:
                                if record.samples[0]["PR"][1] + record.samples[0]["SR"][1] >= 5:
                                    out.write('\t'.join([str(record.CHROM), str(record.POS),
                                                         str(record2.CHROM), str(record2.POS) + '\n']))
                            except AttributeError:
                                pass
                            # remove used record from the dict
                            del bp_dict[record.INFO['MATEID'][0]]
                    else:
                        # first check if 'END' is specified
                        if 'END' in record.INFO:
                            try:
                                # require 1Mb difference in coordinates and evidence from 10 reads or more
                                if record.samples[0]["PR"][1] + record.samples[0]["SR"][1] >= 10 \
                                        and abs(record.INFO['END'] - record.POS) > 1000000:
                                    out.write('\t'.join([str(record.CHROM), str(record.POS),
                                                         str(record.CHROM), str(record.INFO['END']) + '\n']))
                            except AttributeError:
                                pass
                except KeyError:
                    pass

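# The BED written by parse_svs() has four tab-separated columns per structural variant:
# CHROM1, POS1, CHROM2, POS2. BND mates are written once both ends have been seen and the
# PR + SR alt-read support is >= 5; other SV types require an END INFO field, a span over
# 1 Mb and PR + SR alt-read support >= 10 (see the thresholds in the code above).
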
def write_combined_results(cnf, variants_fpath, samples, vcf2txt_fpaths,
                           freq_in_cohort_by_vark, count_in_cohort_by_vark,
                           suffix=variant_filtering.mut_pass_suffix, do_cohort_filtering=True):
    artefacts_samples = OrderedDefaultDict(list)
    artefacts_data = OrderedDict()
    variants_count = defaultdict(int)
    written_lines_count = 0

    status_col, reason_col, n_samples_col, n_var_col, pcnt_sample_col, ave_af_col, incidentalome_col = \
        None, None, None, None, None, None, None

    with file_transaction(cnf.work_dir, variants_fpath) as tx:
        with open(tx, 'w') as out:
            for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
                mut_fpath = add_suffix(vcf2txt_fpath, suffix)
                with file_transaction(cnf.work_dir, mut_fpath) as fixed_mut_fpath_tx:
                    with open(mut_fpath) as f, open(fixed_mut_fpath_tx, 'w') as fixed_f_out:
                        for line_i, l in enumerate(f):
                            fs = l.replace('\n', '').split('\t')
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i == 0:
                                fixed_f_out.write(l)
                                if status_col is not None and status_col != fs.index('Significance'):
                                    critical('Different format in ' + mut_fpath + ': status_col=' +
                                             str(fs.index('Significance')) + ', but the first sample was ' +
                                             str(status_col) + ', please rerun VarFilter from the beginning')
                                status_col = fs.index('Significance')
                                reason_col = status_col + 1
                                n_samples_col = fs.index('N_samples')
                                n_var_col = fs.index('N_Var')
                                pcnt_sample_col = fs.index('Pcnt_sample')
                                ave_af_col = fs.index('Ave_AF')
                                if 'Incidentalome' in fs:
                                    incidentalome_col = fs.index('Incidentalome')
                            if line_i > 0:
                                fs = l.replace('\n', '').split('\t')
                                chrom, pos, db_id, ref, alt = fs[1:6]
                                vark = ':'.join([chrom, pos, ref, alt])
                                assert len(fs) > reason_col, 'len(fs)=' + str(len(fs)) + ' > reason_col=' + str(reason_col) + \
                                    ' in ' + sample.name + ', ' + vcf2txt_fpath + ' for line\n' + l

                                freq = freq_in_cohort_by_vark[vark]
                                cnt = count_in_cohort_by_vark[vark]
                                fs[n_samples_col] = str(len(samples))
                                fs[n_var_col] = str(cnt)
                                fs[pcnt_sample_col] = str(freq)
                                fs[ave_af_col] = ''
                                l = '\t'.join(fs) + '\n'

                                if do_cohort_filtering:
                                    if fs[status_col] in ['known', 'likely']:
                                        variants_count['not_filtered'] += 1
                                    elif freq >= cnf.variant_filtering.max_ratio and cnt > cnf.variant_filtering.max_sample_cnt:
                                        artefacts_samples[vark].append(sample.name)
                                        # if incidentalome_col:
                                        #     fs.remove(fs[incidentalome_col])
                                        artefacts_data[vark] = fs
                                        continue
                                    variants_count['good_freq'] += 1
                                fixed_f_out.write(l)
                                out.write(l)
                                written_lines_count += 1
    return artefacts_samples, artefacts_data, variants_count, written_lines_count

def combine_results(cnf, samples, vcf2txt_fpaths, variants_fpath, pass_variants_fpath=None, reject_variants_fpath=None):
    info('Combining vcf2txt variants')
    not_existing_snames = []
    if cnf.reuse_intermediate and isfile(variants_fpath) and verify_file(variants_fpath):
        info('Combined filtered results ' + variants_fpath + ' exist, reusing.')
    else:
        for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
            if not verify_file(vcf2txt_fpath, description='variants file'):
                not_existing_snames.append(sample.name)
        if not_existing_snames:
            critical('For some samples, the variants file was not found: ' + ', '.join(not_existing_snames))
        with file_transaction(cnf.work_dir, variants_fpath) as tx:
            with open(tx, 'w') as out:
                for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
                    with open(vcf2txt_fpath) as f:
                        for line_i, l in enumerate(f):
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i > 0:
                                out.write(l)
        verify_file(variants_fpath, is_critical=True, description='combined mutation calls')
        info('Saved vcf2txt variants to ' + variants_fpath)

    info()
    info('Combining PASSed mutations')
    pass_variants_fpath = pass_variants_fpath or add_suffix(variants_fpath, variant_filtering.mut_pass_suffix)
    reject_variants_fpath = reject_variants_fpath or add_suffix(variants_fpath, variant_filtering.mut_reject_suffix)
    not_existing_pass_snames = []
    if cnf.reuse_intermediate and isfile(pass_variants_fpath) and verify_file(pass_variants_fpath) \
            and isfile(reject_variants_fpath) and verify_file(reject_variants_fpath):
        info('Combined PASSed filtered results ' + pass_variants_fpath + ' exist, reusing.')
    else:
        for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
            if not verify_file(add_suffix(vcf2txt_fpath, variant_filtering.mut_pass_suffix), description='PASS variants file'):
                not_existing_pass_snames.append(sample.name)
        if not_existing_pass_snames:
            critical('For some samples, the PASS variants file was not found: ' + ', '.join(not_existing_pass_snames))

        info('*' * 70)
        if cnf.variant_filtering.max_ratio < 1.0:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio))
        else:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio) + ', i.e. no filter')

        info('Calculating frequencies of variants in the cohort')
        info('*' * 70)
        freq_in_cohort_by_vark, count_in_cohort_by_vark = count_mutations_freq(cnf, samples, vcf2txt_fpaths)
        reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark = count_mutations_freq(
            cnf, samples, vcf2txt_fpaths, suffix=variant_filtering.mut_reject_suffix)
        info()

        if cnf.variant_filtering.max_ratio < 1.0:
            info('Saving passing threshold if cohort freq < ' + str(cnf.variant_filtering.max_ratio) +
                 ' to ' + pass_variants_fpath)

        artefacts_samples, artefacts_data, variants_count, written_lines_count = write_combined_results(
            cnf, pass_variants_fpath, samples, vcf2txt_fpaths,
            freq_in_cohort_by_vark, count_in_cohort_by_vark,
            suffix=variant_filtering.mut_pass_suffix, do_cohort_filtering=True)

        _, _, _, reject_written_lines_count = write_combined_results(
            cnf, reject_variants_fpath, samples, vcf2txt_fpaths,
            reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark,
            suffix=variant_filtering.mut_reject_suffix, do_cohort_filtering=False)

        if len(artefacts_samples.keys()) > 0:
            reason = 'cohort freq > ' + str(cnf.variant_filtering.max_ratio)
            with open(reject_variants_fpath) as f:
                line = f.readline().split()
                reason_col = line.index('Reason') if 'Reason' in line else None
            with open(reject_variants_fpath, 'a') as f:
                for vark, samples in artefacts_samples.items():
                    fs = artefacts_data[vark]
                    if reason_col:
                        fs[reason_col] = reason
                    else:
                        fs.append(reason)
                    f.write('\t'.join(fs) + '\n')
            info('Skipped artefacts with cohort freq > ' + str(cnf.variant_filtering.max_ratio) +
                 ' and sample count > ' + str(cnf.variant_filtering.max_sample_cnt) +
                 ': ' + str(len(artefacts_samples.keys())))
            info('Added artefacts into ' + reject_variants_fpath)

        info('All variants not under filtering: ' + str(variants_count['not_filtered']))
        if len(artefacts_samples.keys()) > 0:
            info('Variants not under filtering with freq > ' + str(cnf.variant_filtering.max_ratio) +
                 ': ' + str(variants_count['good_freq']))

        verify_file(pass_variants_fpath, 'PASS variants file', is_critical=True)
        info('Written ' + str(written_lines_count) + ' records to ' + pass_variants_fpath)
        info('Written ' + str(reject_written_lines_count + len(artefacts_samples.keys())) +
             ' rejected records to ' + reject_variants_fpath)

    variants_fpath = verify_file(variants_fpath, is_critical=True)
    pass_variants_fpath = verify_file(pass_variants_fpath, is_critical=True)

    if not_existing_snames or not_existing_pass_snames:
        return None, None

    return variants_fpath, pass_variants_fpath

def postprocess_vcf(cnf, work_dir, var_sample, caller_name, variants, mutations, vcf2txt_res_fpath):
    if cnf is None:
        global glob_cnf
        cnf = glob_cnf

    info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered VCFs')

    filter_values = set(variants.values())

    # Saving .anno.filt.vcf.gz and .anno.filt.pass.vcf
    ungz, gz = None, None
    if var_sample.filt_vcf_fpath.endswith('.gz'):
        ungz = splitext(var_sample.filt_vcf_fpath)[0]
        gz = var_sample.filt_vcf_fpath
    else:
        ungz = var_sample.filt_vcf_fpath
        gz = var_sample.filt_vcf_fpath + '.gz'
    if not var_sample.filt_tsv_fpath:
        var_sample.filt_tsv_fpath = splitext(ungz)[0] + '.tsv'

    if cnf.reuse_intermediate \
            and verify_file(var_sample.filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.pass_filt_vcf_fpath, silent=True) \
            and verify_file(var_sample.filt_tsv_fpath, silent=True):
        info(var_sample.filt_vcf_fpath + ' and ' + var_sample.pass_filt_vcf_fpath + ' exist; reusing.')
    else:
        safe_mkdir(dirname(var_sample.filt_vcf_fpath))
        safe_mkdir(dirname(var_sample.pass_filt_vcf_fpath))

        with open_gzipsafe(var_sample.anno_vcf_fpath) as vcf_f, \
                file_transaction(work_dir, ungz) as filt_tx, \
                file_transaction(work_dir, var_sample.pass_filt_vcf_fpath) as pass_tx:
            with open(filt_tx, 'w') as filt_f, open(pass_tx, 'w') as pass_f:
                info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
                     ': opened ' + var_sample.anno_vcf_fpath + ', writing to ' +
                     ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

                for l in vcf_f:
                    if l.startswith('#'):
                        if l.startswith('#CHROM'):
                            filt_f.write('##FILTER=<ID=vcf2txt,Description="Hard-filtered by vcf2txt.pl">\n')
                            filt_f.write('##FILTER=<ID=vardict2mut,Description="Hard-filtered by vardict2mut.pl">\n')
                            for filt_val in filter_values:
                                if filt_val != 'PASS':
                                    filt_f.write('##FILTER=<ID=' + filt_val + ',Description="">\n')
                        filt_f.write(l)
                        pass_f.write(l)
                    else:
                        ts = l.split('\t')
                        chrom, pos, alt = ts[0], ts[1], ts[4]
                        if (chrom, pos, alt) in mutations:
                            ts[6] = 'PASS'
                            filt_f.write('\t'.join(ts))
                            pass_f.write('\t'.join(ts))
                        else:
                            if ts[6] in ['', '.', 'PASS']:
                                ts[6] = ''
                                filter_value = variants.get((chrom, pos, alt))
                                if filter_value is None:
                                    ts[6] += 'vcf2txt'
                                elif filter_value == 'TRUE':
                                    ts[6] += 'vardict2mut'
                                else:
                                    ts[6] += filter_value
                            filt_f.write('\t'.join(ts))

        info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
             ': saved filtered VCFs to ' + ungz + ' and ' + var_sample.pass_filt_vcf_fpath)

    if False:
        info()
        info(var_sample.name + ((', ' + caller_name) if caller_name else '') + ': writing filtered TSVs')
        # Converting to TSV - saving .anno.filt.tsv
        if 'tsv_fields' in cnf.annotation and cnf.tsv:
            tmp_tsv_fpath = make_tsv(cnf, ungz, var_sample.name)
            if not tmp_tsv_fpath:
                err('TSV conversion didn\'t work')
            else:
                if isfile(var_sample.filt_tsv_fpath):
                    os.remove(var_sample.filt_tsv_fpath)
                shutil.copy(tmp_tsv_fpath, var_sample.filt_tsv_fpath)

            info(var_sample.name + ((', ' + caller_name) if caller_name else '') +
                 ': saved filtered TSV to ' + var_sample.filt_tsv_fpath)

    info('Done postprocessing filtered VCF.')
    return ungz

def run_vardict2mut(cnf, vcf2txt_res_fpath, vardict2mut_res_fpath=None, vardict2mut_executable=None):
    cmdline = None
    if vardict2mut_res_fpath is None:
        vardict2mut_res_fpath = add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix)
    vardict2mut_reject_fpath = add_suffix(vcf2txt_res_fpath, variant_filtering.mut_reject_suffix)

    check_filtering_results(vardict2mut_res_fpath)

    if not vardict2mut_executable:
        # vardict2mut_executable = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
        vardict2mut_executable = 'vardict2mut'

    c = cnf.variant_filtering

    cmdline = '{vardict2mut_executable} {vcf2txt_res_fpath} '
    if vardict2mut_executable.endswith('.pl'):
        cmdline += ' --report_reason '
        if c.min_hotspot_freq is not None and c.min_hotspot_freq != 'default':
            cmdline += ' -F ' + str(c.min_hotspot_freq)
        if c.max_ratio_vardict2mut is not None:
            cmdline += ' -R ' + str(c.max_ratio_vardict2mut)
        if cnf.genome.filter_common_snp:
            cmdline += ' --filter_common_snp {cnf.genome.filter_common_snp} '
        if cnf.genome.filter_common_artifacts:
            cmdline += ' --filter_common_artifacts {cnf.genome.filter_common_artifacts} '
        if cnf.genome.actionable:
            cmdline += ' --actionable {cnf.genome.actionable} '
        if cnf.genome.compendia_ms7_hotspot:
            cmdline += ' --compendia_ms7_hotspot {cnf.genome.compendia_ms7_hotspot} '
        if cnf.snpeffect_export_polymorphic:
            cmdline += ' --snpeffect_export_polymorphic {cnf.snpeffect_export_polymorphic} '
        if cnf.actionable_hotspot:
            cmdline += ' --actionable_hotspot {cnf.actionable_hotspot} '
        if cnf.ruledir:
            cmdline += ' --ruledir {cnf.ruledir} '
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, vardict2mut_res_fpath, exit_on_error=False)
    else:
        filt_yaml_fpath = join(cnf.work_dir, 'filt_cnf.yaml')
        info('Writing filtering yaml into ' + filt_yaml_fpath)
        # write the variant_filtering section of the run config into the transaction file,
        # so the final yaml only appears once fully written
        with file_transaction(cnf.work_dir, filt_yaml_fpath) as tx, open(tx, 'w') as out:
            with open(cnf.run_cnf) as run_cnf:
                lines = []
                met_variant_filtering = False
                for l in run_cnf:
                    if l.startswith('variant_filtering:'):
                        met_variant_filtering = True
                        continue
                    if met_variant_filtering:
                        if l.startswith(' '):
                            out.write(l.lstrip())
                        else:
                            break

        cmdline += ' --filt-cnf ' + filt_yaml_fpath
        cmdline += ' --work-dir ' + cnf.work_dir
        cmdline += (' --debug ' if cnf.debug else '')
        cmdline += ' --genome ' + cnf.genome.name
        cmdline += ' -o ' + vardict2mut_res_fpath
        cmdline += ' --o-reject ' + vardict2mut_reject_fpath
        if cnf.cohort_freqs_fpath:
            cmdline += ' --cohort-freqs ' + cnf.cohort_freqs_fpath
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, output_fpath=vardict2mut_res_fpath, stdout_to_outputfile=False)

    if not res:
        return None
    else:
        return res

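# Roughly, the non-Perl branch above ends up calling something like (file names are
# hypothetical; the actual output paths derive from mut_pass_suffix / mut_reject_suffix):
#   vardict2mut sample.vcf2txt.txt --filt-cnf work/filt_cnf.yaml --work-dir work \
#       --genome hg19 -o sample.PASS.txt --o-reject sample.REJECT.txt
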
def downsample(cnf, sample_name, fastq_L_fpath, fastq_R_fpath, N, output_dir, suffix=None, quick=False):
    """ get N random headers from a fastq file without reading the whole thing into memory
        modified from: http://www.biostars.org/p/6544/
        quick=True will just grab the first N reads rather than do a true downsampling
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else '' for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir, add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = join(output_dir, add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(l_out_fpath, silent=True) and verify_file(r_out_fpath, silent=True):
        info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) / 4
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
                 ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) + ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath)

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1 = tx_out_files
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        info('Opening ' + str(tx_out_f2) + ' to write')
        sub2 = open_gzipsafe(tx_out_f2, "w") if r_out_fpath else None
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) + ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + r_out_fpath +
         ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath

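# Usage sketch for downsample() (hypothetical paths; cnf is the same config object used
# throughout this module):
#   l_fq, r_fq = downsample(cnf, 'sample1',
#                           '/data/sample1_R1.fastq.gz', '/data/sample1_R2.fastq.gz',
#                           N=1000000, output_dir=cnf.work_dir)
# With quick=True the first N read pairs are taken instead of a random subset.
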
def write_to_csv_file(work_dir, jira_case, project_list_fpath, country_id, project_name,
                      samples_num=None, analysis_dirpath=None, html_report_url=None):
    info('Reading project list ' + project_list_fpath)
    with open(project_list_fpath) as f:
        lines = f.readlines()
    uncom_lines = [l.strip() for l in lines if not l.strip().startswith('#')]

    header = uncom_lines[0].strip()
    info('header: ' + header)
    header_keys = header.split(',')  # 'Updated By,PID,Name,JIRA URL,HTML report path,Datestamp,Data Hub,Analyses directory UK,Analyses directory US,Type,Division,Department,Sample Number,Reporter,Assignee,Description,IGV,Notes'
    index_of_pid = header_keys.index('PID')
    if index_of_pid == -1:
        index_of_pid = 1

    values_by_keys_by_pid = OrderedDict()
    for l in uncom_lines[1:]:
        if l:
            values = map(__unquote, l.split(','))
            pid = values[index_of_pid]
            values_by_keys_by_pid[pid] = OrderedDict(zip(header_keys, values))

    pid = project_name
    with file_transaction(work_dir, project_list_fpath) as tx_fpath:
        if pid not in values_by_keys_by_pid.keys():
            # info(pid + ' not in ' + str(values_by_keys_by_pid.keys()))
            info('Adding new record for ' + pid)
            values_by_keys_by_pid[pid] = OrderedDict(zip(header_keys, [''] * len(header_keys)))
        else:
            info('Updating existing record for ' + pid)
        d = values_by_keys_by_pid[pid]

        for k in header_keys:
            if k not in d:
                err('Error: ' + k + ' not in ' + project_list_fpath + ' for ' + pid)

        d['PID'] = pid
        if analysis_dirpath:
            d['Analyses directory ' + (country_id if not is_local() else 'US')] = analysis_dirpath
        if project_name and (analysis_dirpath or not __unquote(d['Name'])):
            # update only if running after bcbio, or no value there at all
            d['Name'] = project_name
        if html_report_url and (analysis_dirpath or not __unquote(d['HTML report path'])):
            # update only if running after bcbio, or no value there at all
            d['HTML report path'] = html_report_url

        if jira_case:
            d['JIRA URL'] = jira_case.url
            # if 'Updated By' in d and __unquote(d['Updated By']):
            d['Updated By'] = getpass.getuser()
            if jira_case.description:
                d['Description'] = jira_case.summary
            if jira_case.data_hub:
                d['Data Hub'] = jira_case.data_hub
            if jira_case.type:
                d['Type'] = jira_case.type
            if jira_case.department:
                d['Department'] = jira_case.department
            if jira_case.division:
                d['Division'] = jira_case.division
            if jira_case.assignee:
                d['Assignee'] = jira_case.assignee
            if jira_case.reporter:
                d['Reporter'] = jira_case.reporter

        if samples_num:
            d['Sample Number'] = str(samples_num)

        d['Datestamp'] = timestamp()

        new_line = ','.join(__requote(d.get(k, '').replace(',', ';').replace('\n', ' | ')) or ''
                            for k in header_keys)

        with open(tx_fpath, 'w') as f:
            os.umask(0002)
            try:
                os.chmod(tx_fpath, 0774)
            except OSError:
                err(format_exc())
            for l in lines:
                if not l:
                    pass
                if l.startswith('#'):
                    f.write(l)
                else:
                    l = unicode(l, 'utf-8')
                    l_ascii = l.encode('ascii', 'ignore')
                    if ',' + project_name + ',' in l_ascii or ',"' + project_name + '",' in l_ascii:
                        info('Old csv line: ' + l_ascii)
                        # f.write('#' + l)
                    else:
                        f.write(l)
            f.write(new_line + '\n')

    info()
    info('New line: ' + new_line)
    info()

            else:
                if slept >= limit:
                    return None
                else:
                    if not silent:
                        err('Waiting ' + str(timeout) + ' seconds...')
                    time.sleep(timeout)
                    slept += timeout
                    if not silent:
                        err('Retrying...')
                        err()
        return res_

    res = None  # = proc or output_fpath

    if output_fpath and not output_is_dir:
        with file_transaction(cnf.work_dir, output_fpath) as tx_out_fpath:
            res = do_handle_oserror(cmdline, tx_out_fpath,
                                    stderr_dump=stderr_dump, max_number_of_tries=max_number_of_tries)
    else:
        res = do_handle_oserror(cmdline,
                                stderr_dump=stderr_dump, max_number_of_tries=max_number_of_tries)
        if res is not None:
            clean()
            return res

    clean()

    if res:
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'

    lines = []

    with open(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if len(reader.samples) > 0:
                    if _f in reader.formats:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                    else:
                        warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        d = OrderedDict()
        for rec in reader:
            for f in basic_fields:
                d[f] = rec.__dict__[f]
            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''
            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                eff = rec.INFO.get(eff_fields[0][:3])
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    for val, header in zip(eff_fs, ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE',
                                                    'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P',
                                                    'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN', 'DISTANCE', 'LOG']):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    # ANN=GA|3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    # Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID |
                    # Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length |
                    # AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d[f]
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')

            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath

def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath,
                                add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

    var_s = source.VarSample(cnf.sample, cnf.output_dir)
    var_s.anno_vcf_fpath = cnf.vcf
    var_s.varfilter_dirpath = var_s.dirpath

    ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
    ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
    var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

    var_s.variants_fpath = vcf2txt_res_fpath
    var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

    ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
    var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

    filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
    index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
    index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

    if cnf.qc:
        report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

    info()
    info('*' * 70)
    info('Done filtering ' + var_s.name)

def convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample, output_dir=None, pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath + '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath + '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
            file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [filter_descriptions_dict[reason] if reason in filter_descriptions_dict else reason
                                         for reason in mut.reason.split(' and ')]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath + ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' + output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' + output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath