def markdup_bam(cnf, in_bam_fpath, bammarkduplicates=None):
    """Perform non-stream-based deduplication of BAM input files using biobambam.
    """
    if not bammarkduplicates:
        bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
        if not bammarkduplicates:
            warn('No biobambam bammarkduplicates, can\'t mark duplicates.')
            return None

    out_bam_fpath = add_suffix(in_bam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_bam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_bam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_bam_fpath
    else:
        return None

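# Illustrative sketch, not part of the original module: markdup_bam() above
# builds the biobambam command by str.format over locals(). The same pattern
# in isolation, with made-up (hypothetical) paths:
def _example_markdup_cmdline():
    bammarkduplicates = 'bammarkduplicates'  # hypothetical executable name
    tmp_fpath = '/work/sample_markdup'       # hypothetical temp prefix
    in_bam_fpath = '/data/sample.bam'        # hypothetical input
    out_bam_fpath = '/data/sample.markdup.bam'
    return '{bammarkduplicates} tmpfile={tmp_fpath} I={in_bam_fpath} O={out_bam_fpath}'.format(**locals())
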
def join_vcf2txt_results(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath):
    info('WGS; running vcf2txt separately for each sample to save memory.')
    vcf2txt_outputs_by_vcf_fpath = OrderedDict()
    for vcf_fpath in vcf_fpath_by_sample.values():
        sample_output_fpath = add_suffix(vcf2txt_out_fpath, splitext(basename(vcf_fpath))[0])
        vcf2txt_outputs_by_vcf_fpath[vcf_fpath] = sample_output_fpath
    info()

    info('Joining vcf2txt outputs... (' + str(len(vcf2txt_outputs_by_vcf_fpath)) +
         ' out of ' + str(len(vcf_fpath_by_sample)) + ' successful), ' +
         'writing to ' + vcf2txt_out_fpath)
    with file_transaction(cnf.work_dir, vcf2txt_out_fpath) as tx:
        with open(tx, 'w') as out:
            for i, (vcf_fpath, sample_output_fpath) in enumerate(vcf2txt_outputs_by_vcf_fpath.items()):
                info('   Reading ' + sample_output_fpath)
                with open(sample_output_fpath) as inp:
                    for j, l in enumerate(inp):
                        if j == 0 and i != 0:
                            continue  # skip the header in all files but the first
                        out.write(l)

    if verify_file(vcf2txt_out_fpath):
        info('Saved ' + vcf2txt_out_fpath)
        return vcf2txt_out_fpath
    else:
        return None

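# Illustrative sketch, not part of the original module: join_vcf2txt_results()
# concatenates per-sample vcf2txt tables while keeping only the first file's
# header. The same header-skipping merge on in-memory strings:
def _example_join_tables_skip_headers():
    tables = [
        'Sample\tChr\tStart\nS1\tchr1\t100\n',
        'Sample\tChr\tStart\nS2\tchr2\t200\n',
    ]
    merged_lines = []
    for i, table in enumerate(tables):
        for j, line in enumerate(table.splitlines(True)):
            if j == 0 and i != 0:
                continue  # skip the header of every table except the first
            merged_lines.append(line)
    return ''.join(merged_lines)
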
def run_vcf2txt_vardict2mut_for_samples(cnf, var_samples, output_dirpath,
                                        vcf2txt_out_fpath, caller_name=None, threads_num=1):
    threads_num = min(len(var_samples), cnf.threads)
    info('Number of threads for filtering: ' + str(threads_num))

    safe_mkdir(output_dirpath)

    vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in var_samples}
    res = run_vcf2txt(cnf, vcf_fpath_by_sample, vcf2txt_out_fpath)
    if not res:
        err('vcf2txt run returned non-0')
        return None

    # vardict2mut_py = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
    # if not vardict2mut_py:
    #     critical('vardict2mut_py not found')

    info('Running vardict2mut')
    res = run_vardict2mut(cnf, vcf2txt_out_fpath,
                          add_suffix(vcf2txt_out_fpath, variant_filtering.mut_pass_suffix))
    if not res:
        critical('vardict2mut.py run returned non-0')
    mut_fpath = res
    mut_fpath = convert_gpfs_path_to_url(mut_fpath)
    info()
    info('Done filtering with vcf2txt/vardict2mut, saved to ' + str(mut_fpath))
    return mut_fpath

def _mutations_records(general_section, bcbio_structure, base_dirpath):
    records = []

    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')

    _base_mut_fname = variant_filtering.mut_fname_template.format(caller_name=caller.name)
    _base_mut_fpath = join(bcbio_structure.date_dirpath, _base_mut_fname)
    mut_fpath = add_suffix(_base_mut_fpath, variant_filtering.mut_pass_suffix)
    single_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_single_suffix),
        variant_filtering.mut_pass_suffix)
    paired_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_paired_suffix),
        variant_filtering.mut_pass_suffix)

    mut_fpath = verify_file(mut_fpath, silent=True)
    single_mut_fpath = verify_file(single_mut_fpath, silent=True)
    paired_mut_fpath = verify_file(paired_mut_fpath, silent=True)

    for fpath, metric_name in ((mut_fpath, MUTATIONS_NAME),
                               (single_mut_fpath, MUTATIONS_SINGLE_NAME),
                               (paired_mut_fpath, MUTATIONS_PAIRED_NAME)):
        if fpath:
            metric = Metric(metric_name, common=True)
            rec = Record(metric=metric, value=basename(fpath),
                         url=relpath(fpath, base_dirpath))
            general_section.add_metric(metric)
            records.append(rec)

    if bcbio_structure.seq2c_fpath and isfile(bcbio_structure.seq2c_fpath):
        metric = Metric(CNV_NAME, common=True)
        fpath = bcbio_structure.seq2c_fpath
        rec = Record(metric=metric, value=basename(fpath),
                     url=relpath(fpath, base_dirpath))
        general_section.add_metric(metric)
        records.append(rec)

    return records

def count_mutations_freq(cnf, samples, vcf2txt_fpaths, suffix=variant_filtering.mut_pass_suffix):
    count_in_cohort_by_vark = defaultdict(int)
    total_varks = 0
    total_duplicated_count = 0
    total_records_count = 0
    for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
        met_in_this_sample = set()
        processed_fpath = add_suffix(vcf2txt_fpath, suffix)
        if not isfile(processed_fpath):
            critical(processed_fpath + ' does not exist; please rerun VarFilter.')
        with open(processed_fpath) as f:
            for line_i, l in enumerate(f):
                if line_i > 0:
                    fs = l.replace('\n', '').split()
                    if not fs:
                        continue
                    chrom, pos, db_id, ref, alt = fs[1:6]
                    vark = ':'.join([chrom, pos, ref, alt])
                    if vark in met_in_this_sample:
                        if suffix == variant_filtering.mut_pass_suffix:
                            total_duplicated_count += 1
                    else:
                        count_in_cohort_by_vark[vark] += 1
                        if suffix == variant_filtering.mut_pass_suffix:
                            met_in_this_sample.add(vark)
                            total_varks += 1
                    total_records_count += 1

    if suffix == variant_filtering.mut_pass_suffix:
        info('Counted ' + str(len(count_in_cohort_by_vark)) + ' different variants ' +
             'in ' + str(len(samples)) + ' samples with total ' + str(total_varks) + ' records')
        info('Duplicated varks for this sample: ' + str(total_duplicated_count) +
             ' out of total ' + str(total_records_count) + ' records. ' +
             'Duplicates were not counted into cohort frequencies.')

    freq_in_cohort_by_vark = dict()
    max_freq = 0
    for vark, count in count_in_cohort_by_vark.items():
        f = float(count) / len(samples)
        freq_in_cohort_by_vark[vark] = f
        if f > max_freq:
            max_freq = f
    if suffix == variant_filtering.mut_pass_suffix:
        info('Maximum frequency in cohort is ' + str(max_freq))

    return freq_in_cohort_by_vark, count_in_cohort_by_vark

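# Illustrative sketch, not part of the original module: count_mutations_freq()
# keys variants by 'chrom:pos:ref:alt' (a "vark") and divides the per-variant
# sample count by the cohort size. The same arithmetic on made-up counts:
def _example_cohort_freq():
    count_in_cohort_by_vark = {'chr1:100:A:T': 3, 'chr2:200:G:C': 1}  # made-up
    n_samples = 4
    freq_in_cohort_by_vark = dict(
        (vark, float(count) / n_samples)
        for vark, count in count_in_cohort_by_vark.items())
    return freq_in_cohort_by_vark  # {'chr1:100:A:T': 0.75, 'chr2:200:G:C': 0.25}
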
def get_mutations_fpaths(bcbio_structure):
    caller = bcbio_structure.variant_callers.get('vardict') or \
             bcbio_structure.variant_callers.get('vardict-java')

    _base_mut_fname = variant_filtering.mut_fname_template.format(caller_name=caller.name)
    _base_mut_fpath = join(bcbio_structure.date_dirpath, _base_mut_fname)
    mut_fpath = add_suffix(_base_mut_fpath, variant_filtering.mut_pass_suffix)
    single_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_single_suffix),
        variant_filtering.mut_pass_suffix)
    paired_mut_fpath = add_suffix(
        add_suffix(_base_mut_fpath, variant_filtering.mut_paired_suffix),
        variant_filtering.mut_pass_suffix)

    mut_fpath = verify_file(mut_fpath, silent=True)
    single_mut_fpath = verify_file(single_mut_fpath, silent=True)
    paired_mut_fpath = verify_file(paired_mut_fpath, silent=True)

    mutations_fpaths = [f for f in [mut_fpath, single_mut_fpath, paired_mut_fpath] if f]
    return mutations_fpaths

def evaluate_capture(cnf, bcbio_structures):
    samples = [s for bs in bcbio_structures for s in bs.samples]
    min_samples = math.ceil(cnf.min_ratio * len(samples))

    info('Filtering regions by depth')
    regions = check_regions_depth(cnf, bcbio_structures, min_samples)
    if not regions:
        err('No regions were filtered.')
        return None

    if cnf.bed or cnf.tricky_regions:
        regions = intersect_regions(cnf, bcbio_structures, regions, min_samples)

    regions_fname = 'filtered_regions.txt'
    regions_fpath = join(
        cnf.output_dir,
        add_suffix(regions_fname, str(cnf.min_depth)) if cnf.min_depth else regions_fname)
    with open(regions_fpath, 'w') as out:
        out.write('## Minimal percent of region with low coverage: ' +
                  str((1 - cnf.min_percent) * 100) + '%\n')
        out.write('## Minimal percent of samples that share the same feature: ' +
                  str(cnf.min_ratio * 100) + '%\n')
        if not cnf.min_depth:
            out.write('## Coverage threshold Nx is 10x for cell line and 100x for plasma\n')
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) + 'x\n')
        out.write('\t'.join(['#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx',
                             'SamplesSharingSameFeature', 'Annotation']) + '\n')
        for region in sorted(regions, key=lambda x: (x[0], int(x[1]))):
            out.write('\t'.join([str(val) for val in region]) + '\n')

    info()
    info(str(len(regions)) + ' regions were saved into ' + regions_fpath)
    bgzip_and_tabix(cnf, regions_fpath, tabix_parameters='-p bed')
    return regions_fpath

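# Illustrative sketch, not part of the original module: evaluate_capture()
# keeps a region only if at least ceil(min_ratio * n_samples) samples share
# the feature. The threshold arithmetic with made-up numbers:
def _example_min_samples():
    import math
    min_ratio, n_samples = 0.8, 10  # hypothetical values
    return int(math.ceil(min_ratio * n_samples))  # -> 8 samples required
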
def align(cnf, sample, l_fpath, r_fpath, sambamba, bwa, bammarkduplicates,
          bwa_prefix, is_pcr=False):
    sam_fpath = join(cnf.work_dir, sample.name + '_downsampled.sam')
    bam_fpath = splitext(sam_fpath)[0] + '.bam'
    sorted_bam_fpath = add_suffix(bam_fpath, 'sorted')

    bwa_cmdline = '{bwa} mem {bwa_prefix} {l_fpath} {r_fpath} '.format(**locals())
    res = call(cnf, bwa_cmdline, output_fpath=sam_fpath, exit_on_error=False)
    if not res:
        return None

    cmdline = '{sambamba} view -t {cnf.threads} -S -f bam {sam_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath=bam_fpath)

    prefix = splitext(sorted_bam_fpath)[0]
    cmdline = '{sambamba} sort -t {cnf.threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False)

    if not is_pcr:
        markdup_bam_fpath = markdup_bam(cnf, sorted_bam_fpath, bammarkduplicates)
        if markdup_bam_fpath:
            sorted_bam_fpath = markdup_bam_fpath

    index_bam(cnf, sorted_bam_fpath, sambamba=sambamba)
    return sorted_bam_fpath

def sort_bed_by_alphabet(cnf, input_bed_fpath, output_bed_fpath=None, chr_len_fpath=None):
    chr_lengths = get_chr_lengths(cnf, chr_len_fpath)
    chromosomes = set([c for (c, l) in chr_lengths])
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else add_suffix(input_bed_fpath, 'sorted')

    regions = defaultdict(list)

    info('Sorting regions...')
    chunk_size = 10
    chunk_counter = 0
    with open(input_bed_fpath) as f:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue
                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    if chrom not in chromosomes:
                        continue
                    if chunk_counter == chunk_size or not regions[chrom]:
                        # start a new chunk every `chunk_size` lines to avoid
                        # appending onto one ever-growing string
                        chunk_counter = 0
                        regions[chrom].append('')
                    regions[chrom][-1] += l
                    chunk_counter += 1
                for chrom in sorted(regions.keys()):
                    for region in regions[chrom]:
                        out.write(region)
    return output_bed_fpath

def markdup_sam(cnf, in_sam_fpath, samblaster=None):
    """Perform non-stream-based deduplication of SAM input files using samblaster.
    """
    if not samblaster:
        samblaster = get_system_path(cnf, 'samblaster')
        if not samblaster:
            warn('No samblaster, can\'t mark duplicates.')
            return None

    out_sam_fpath = add_suffix(in_sam_fpath, 'markdup')
    tmp_fpath = join(cnf.work_dir, splitext_plus(basename(in_sam_fpath))[0] + '_markdup')
    safe_mkdir(dirname(tmp_fpath))
    cmdline = '{samblaster} -i {in_sam_fpath} -o {out_sam_fpath}'.format(**locals())
    res = call(cnf, cmdline, output_fpath=out_sam_fpath,
               stdout_to_outputfile=False, exit_on_error=False)
    if res:
        return out_sam_fpath
    else:
        return None

def downsample(cnf, sample_name, fastq_L_fpath, fastq_R_fpath, N,
               output_dir, suffix=None, quick=False):
    """Get N random headers from a FASTQ file without reading the whole thing
    into memory. Modified from: http://www.biostars.org/p/6544/
    quick=True will just grab the first N reads rather than do a true downsampling.
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else '' for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]

    l_out_fpath = join(output_dir, add_suffix(basename(fastq_L_fpath), suffix or 'subset'))
    r_out_fpath = join(output_dir, add_suffix(basename(fastq_R_fpath), suffix or 'subset'))
    if cnf.reuse_intermediate and verify_file(l_out_fpath, silent=True) \
            and verify_file(r_out_fpath, silent=True):
        info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        records_num = sum(1 for _ in open_gzipsafe(fastq_L_fpath)) / 4
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
                 ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) + ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        else:
            info(sample_name + ': downsampling to ' + str(N))
            rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    info('Opening ' + fastq_R_fpath)
    fh2 = open_gzipsafe(fastq_R_fpath) if fastq_R_fpath else None

    # a single path (not a tuple) when there is no R file; matched by the
    # basestring check below
    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else l_out_fpath

    written_records = 0
    with file_transaction(cnf.work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, basestring):
            tx_out_f1 = tx_out_files
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        info('Opening ' + str(tx_out_f2) + ' to write')
        sub2 = open_gzipsafe(tx_out_f2, "w") if r_out_fpath else None
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            rec_no += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) +
                     ', rec_no ' + str(rec_no))
            if rec_no > records_num:
                info(sample_name + ' reached the limit of ' + str(records_num) +
                     ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) +
             ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_R_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' +
         r_out_fpath + ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath

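# Illustrative sketch, not part of the original module: downsample() draws N
# sorted record indices up front and then streams the FASTQ once, copying
# only the records whose index was drawn. The sampling step in isolation:
def _example_pick_record_indices(records_num=1000, n=5, seed=42):
    import random
    rng = random.Random(seed)  # seeded only to make the example reproducible
    return sorted(rng.sample(range(records_num), n))
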
def write_combined_results(cnf, variants_fpath, samples, vcf2txt_fpaths,
                           freq_in_cohort_by_vark, count_in_cohort_by_vark,
                           suffix=variant_filtering.mut_pass_suffix,
                           do_cohort_filtering=True):
    artefacts_samples = OrderedDefaultDict(list)
    artefacts_data = OrderedDict()
    variants_count = defaultdict(int)
    written_lines_count = 0

    status_col, reason_col, n_samples_col, n_var_col, pcnt_sample_col, ave_af_col, incidentalome_col = \
        None, None, None, None, None, None, None

    with file_transaction(cnf.work_dir, variants_fpath) as tx:
        with open(tx, 'w') as out:
            for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
                mut_fpath = add_suffix(vcf2txt_fpath, suffix)
                with file_transaction(cnf.work_dir, mut_fpath) as fixed_mut_fpath_tx:
                    with open(mut_fpath) as f, open(fixed_mut_fpath_tx, 'w') as fixed_f_out:
                        for line_i, l in enumerate(f):
                            fs = l.replace('\n', '').split('\t')
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i == 0:
                                fixed_f_out.write(l)
                                if status_col is not None and status_col != fs.index('Significance'):
                                    critical('Different format in ' + mut_fpath +
                                             ': status_col=' + str(fs.index('Significance')) +
                                             ', but the first sample was ' + str(status_col) +
                                             ', please rerun VarFilter from the beginning')
                                status_col = fs.index('Significance')
                                reason_col = status_col + 1
                                n_samples_col = fs.index('N_samples')
                                n_var_col = fs.index('N_Var')
                                pcnt_sample_col = fs.index('Pcnt_sample')
                                ave_af_col = fs.index('Ave_AF')
                                if 'Incidentalome' in fs:
                                    incidentalome_col = fs.index('Incidentalome')
                            if line_i > 0:
                                chrom, pos, db_id, ref, alt = fs[1:6]
                                vark = ':'.join([chrom, pos, ref, alt])
                                assert len(fs) > reason_col, \
                                    'len(fs)=' + str(len(fs)) + ' > reason_col=' + str(reason_col) + \
                                    ' in ' + sample.name + ', ' + vcf2txt_fpath + ' for line\n' + l
                                freq = freq_in_cohort_by_vark[vark]
                                cnt = count_in_cohort_by_vark[vark]
                                fs[n_samples_col] = str(len(samples))
                                fs[n_var_col] = str(cnt)
                                fs[pcnt_sample_col] = str(freq)
                                fs[ave_af_col] = ''
                                l = '\t'.join(fs) + '\n'
                                if do_cohort_filtering:
                                    if fs[status_col] in ['known', 'likely']:
                                        variants_count['not_filtered'] += 1
                                    elif freq >= cnf.variant_filtering.max_ratio \
                                            and cnt > cnf.variant_filtering.max_sample_cnt:
                                        artefacts_samples[vark].append(sample.name)
                                        # if incidentalome_col:
                                        #     fs.remove(fs[incidentalome_col])
                                        artefacts_data[vark] = fs
                                        continue
                                variants_count['good_freq'] += 1
                                fixed_f_out.write(l)
                                out.write(l)
                                written_lines_count += 1

    return artefacts_samples, artefacts_data, variants_count, written_lines_count

def convert_vardict_txts_to_bcbio_vcfs(cnf, bs, sample, output_dir=None, pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(output_vcf_fpath + '.gz') \
            and verify_vcf(pass_output_vcf_fpath + '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath + '.gz exist, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(cnf, bs, sample,
                                                                       pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
            file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' + output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' + output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath

def combine_results(cnf, samples, vcf2txt_fpaths, variants_fpath,
                    pass_variants_fpath=None, reject_variants_fpath=None):
    info('Combining vcf2txt variants')
    not_existing_snames = []
    if cnf.reuse_intermediate and isfile(variants_fpath) and verify_file(variants_fpath):
        info('Combined filtered results ' + variants_fpath + ' exist, reusing.')
    else:
        for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
            if not verify_file(vcf2txt_fpath, description='variants file'):
                not_existing_snames.append(sample.name)
        if not_existing_snames:
            critical('Variants files were not found for some samples: ' +
                     ', '.join(not_existing_snames))
        with file_transaction(cnf.work_dir, variants_fpath) as tx:
            with open(tx, 'w') as out:
                for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
                    with open(vcf2txt_fpath) as f:
                        for line_i, l in enumerate(f):
                            if line_i == 0 and sample_i == 0:
                                out.write(l)
                            if line_i > 0:
                                out.write(l)
        verify_file(variants_fpath, is_critical=True, description='combined mutation calls')
        info('Saved vcf2txt variants to ' + variants_fpath)

    info()
    info('Combining PASSed mutations')
    pass_variants_fpath = pass_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_pass_suffix)
    reject_variants_fpath = reject_variants_fpath or add_suffix(
        variants_fpath, variant_filtering.mut_reject_suffix)
    not_existing_pass_snames = []
    if cnf.reuse_intermediate and isfile(pass_variants_fpath) and verify_file(pass_variants_fpath) \
            and isfile(reject_variants_fpath) and verify_file(reject_variants_fpath):
        info('Combined PASSed filtered results ' + pass_variants_fpath + ' exist, reusing.')
    else:
        for sample_i, (sample, vcf2txt_fpath) in enumerate(zip(samples, vcf2txt_fpaths)):
            if not verify_file(add_suffix(vcf2txt_fpath, variant_filtering.mut_pass_suffix),
                               description='PASS variants file'):
                not_existing_pass_snames.append(sample.name)
        if not_existing_pass_snames:
            critical('PASS variants files were not found for some samples: ' +
                     ', '.join(not_existing_pass_snames))

        info('*' * 70)
        if cnf.variant_filtering.max_ratio < 1.0:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio))
        else:
            info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio) + ', i.e. no filter')

        info('Calculating frequencies of variants in the cohort')
        info('*' * 70)
        freq_in_cohort_by_vark, count_in_cohort_by_vark = count_mutations_freq(
            cnf, samples, vcf2txt_fpaths)
        reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark = count_mutations_freq(
            cnf, samples, vcf2txt_fpaths, suffix=variant_filtering.mut_reject_suffix)
        info()

        if cnf.variant_filtering.max_ratio < 1.0:
            info('Saving passing threshold if cohort freq < ' +
                 str(cnf.variant_filtering.max_ratio) + ' to ' + pass_variants_fpath)

        artefacts_samples, artefacts_data, variants_count, written_lines_count = write_combined_results(
            cnf, pass_variants_fpath, samples, vcf2txt_fpaths,
            freq_in_cohort_by_vark, count_in_cohort_by_vark,
            suffix=variant_filtering.mut_pass_suffix, do_cohort_filtering=True)
        _, _, _, reject_written_lines_count = write_combined_results(
            cnf, reject_variants_fpath, samples, vcf2txt_fpaths,
            reject_freq_in_cohort_by_vark, reject_count_in_cohort_by_vark,
            suffix=variant_filtering.mut_reject_suffix, do_cohort_filtering=False)

        if len(artefacts_samples.keys()) > 0:
            reason = 'cohort freq > ' + str(cnf.variant_filtering.max_ratio)
            with open(reject_variants_fpath) as f:
                line = f.readline().split()
                reason_col = line.index('Reason') if 'Reason' in line else None
            with open(reject_variants_fpath, 'a') as f:
                for vark, sample_names in artefacts_samples.items():
                    fs = artefacts_data[vark]
                    if reason_col:
                        fs[reason_col] = reason
                    else:
                        fs.append(reason)
                    f.write('\t'.join(fs) + '\n')
            info('Skipped artefacts with cohort freq > ' + str(cnf.variant_filtering.max_ratio) +
                 ' and sample count > ' + str(cnf.variant_filtering.max_sample_cnt) +
                 ': ' + str(len(artefacts_samples.keys())))
            info('Added artefacts into ' + reject_variants_fpath)

        info('All variants not under filtering: ' + str(variants_count['not_filtered']))
        if len(artefacts_samples.keys()) > 0:
            info('Variants not under filtering with freq > ' +
                 str(cnf.variant_filtering.max_ratio) + ': ' + str(variants_count['good_freq']))

        verify_file(pass_variants_fpath, 'PASS variants file', is_critical=True)
        info('Written ' + str(written_lines_count) + ' records to ' + pass_variants_fpath)
        info('Written ' + str(reject_written_lines_count + len(artefacts_samples.keys())) +
             ' rejected records to ' + reject_variants_fpath)

    variants_fpath = verify_file(variants_fpath, is_critical=True)
    pass_variants_fpath = verify_file(pass_variants_fpath, is_critical=True)

    if not_existing_snames or not_existing_pass_snames:
        return None, None
    return variants_fpath, pass_variants_fpath

def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #                           add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #                           vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath,
                                add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

    var_s = source.VarSample(cnf.sample, cnf.output_dir)
    var_s.anno_vcf_fpath = cnf.vcf
    var_s.varfilter_dirpath = var_s.dirpath

    ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') \
        else splitext(var_s.anno_vcf_fpath)[0]
    ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
    var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

    var_s.variants_fpath = vcf2txt_res_fpath
    var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

    ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
    var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

    filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
    index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
    index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

    if cnf.qc:
        report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
        qc_dirpath = join(cnf.output_dir, 'qc')
        safe_mkdir(qc_dirpath)
        qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
        info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
        info('-' * 70)
        info()

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])

    info()
    info('*' * 70)
    info('Done filtering ' + var_s.name)

def run_vardict2mut(cnf, vcf2txt_res_fpath, vardict2mut_res_fpath=None,
                    vardict2mut_executable=None):
    cmdline = None
    if vardict2mut_res_fpath is None:
        vardict2mut_res_fpath = add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix)
    vardict2mut_reject_fpath = add_suffix(vcf2txt_res_fpath, variant_filtering.mut_reject_suffix)

    check_filtering_results(vardict2mut_res_fpath)

    if not vardict2mut_executable:
        # vardict2mut_executable = get_script_cmdline(cnf, 'python', join('scripts', 'post', 'vardict2mut.py'))
        vardict2mut_executable = 'vardict2mut'

    c = cnf.variant_filtering

    cmdline = '{vardict2mut_executable} {vcf2txt_res_fpath} '
    if vardict2mut_executable.endswith('.pl'):
        cmdline += ' --report_reason '
        if c.min_hotspot_freq is not None and c.min_hotspot_freq != 'default':
            cmdline += ' -F ' + str(c.min_hotspot_freq)
        if c.max_ratio_vardict2mut is not None:
            cmdline += ' -R ' + str(c.max_ratio_vardict2mut)
        if cnf.genome.filter_common_snp:
            cmdline += ' --filter_common_snp {cnf.genome.filter_common_snp} '
        if cnf.genome.filter_common_artifacts:
            cmdline += ' --filter_common_artifacts {cnf.genome.filter_common_artifacts} '
        if cnf.genome.actionable:
            cmdline += ' --actionable {cnf.genome.actionable} '
        if cnf.genome.compendia_ms7_hotspot:
            cmdline += ' --compendia_ms7_hotspot {cnf.genome.compendia_ms7_hotspot} '
        if cnf.snpeffect_export_polymorphic:
            cmdline += ' --snpeffect_export_polymorphic {cnf.snpeffect_export_polymorphic} '
        if cnf.actionable_hotspot:
            cmdline += ' --actionable_hotspot {cnf.actionable_hotspot} '
        if cnf.ruledir:
            cmdline += ' --ruledir {cnf.ruledir} '
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, vardict2mut_res_fpath, exit_on_error=False)
    else:
        filt_yaml_fpath = join(cnf.work_dir, 'filt_cnf.yaml')
        info('Writing filtering yaml into ' + filt_yaml_fpath)
        # write into the transaction path (the original opened filt_yaml_fpath
        # directly, which bypassed the transaction)
        with file_transaction(cnf.work_dir, filt_yaml_fpath) as tx, open(tx, 'w') as out:
            # copy the indented body of the `variant_filtering:` section
            # out of the run config
            with open(cnf.run_cnf) as run_cnf:
                met_variant_filtering = False
                for l in run_cnf:
                    if l.startswith('variant_filtering:'):
                        met_variant_filtering = True
                        continue
                    if met_variant_filtering:
                        if l.startswith(' '):
                            out.write(l.lstrip())
                        else:
                            break

        cmdline += ' --filt-cnf ' + filt_yaml_fpath
        cmdline += ' --work-dir ' + cnf.work_dir
        cmdline += (' --debug ' if cnf.debug else '')
        cmdline += ' --genome ' + cnf.genome.name
        cmdline += ' -o ' + vardict2mut_res_fpath
        cmdline += ' --o-reject ' + vardict2mut_reject_fpath
        if cnf.cohort_freqs_fpath:
            cmdline += ' --cohort-freqs ' + cnf.cohort_freqs_fpath
        cmdline = cmdline.format(**locals())
        res = call(cnf, cmdline, output_fpath=vardict2mut_res_fpath, stdout_to_outputfile=False)

    if not res:
        return None
    else:
        return res

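# Illustrative sketch, not part of the original module: run_vardict2mut()
# copies the indented body of the 'variant_filtering:' section out of the run
# config into a standalone YAML file. The extraction logic in isolation, on a
# made-up in-memory config:
def _example_extract_yaml_section():
    run_cnf_text = (
        'other_section:\n'
        '  key: 1\n'
        'variant_filtering:\n'
        '  min_freq: 0.01\n'
        '  max_ratio: 0.4\n'
        'next_section:\n'
        '  key: 2\n')
    met, extracted = False, []
    for l in run_cnf_text.splitlines(True):
        if l.startswith('variant_filtering:'):
            met = True
            continue
        if met:
            if l.startswith(' '):
                extracted.append(l.lstrip())  # de-indent the section body
            else:
                break  # the section ended
    return ''.join(extracted)  # 'min_freq: 0.01\nmax_ratio: 0.4\n'
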
def _filter(cnf, samples, variants_fpath, variants_fname):
    # if cohort_mode:
    #     info('Running vcf2txt.pl in cohort mode')
    #     vcf2txt = get_script_cmdline(cnf, 'perl', 'vcf2txt', is_critical=True)
    #     vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in samples}
    #     cmdline = vcf2txt + ' ' + make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample)
    #     res = run_vcf2txt_with_retries(cnf, cmdline, variants_fpath)
    #     if not res:
    #         critical('Error: vcf2txt.pl crashed')

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    cohort_freqs_fpath = None
    # if cnf.variant_filtering.max_ratio_vardict2mut < 1.0:
    #     cohort_freqs_fpath = join(cnf.work_dir, 'cohort_freqs.tsv')
    #     info('*' * 70)
    #     info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio_vardict2mut) + ', counting freqs in cohort')
    #     # cnf.variant_filtering.max_ratio < 1.0 or \
    #     # cnf.fraction < 1.0
    #     cohort_freqs_fpath = count_cohort_freqs(cnf, samples, cohort_freqs_fpath,
    #                                             max_ratio=cnf.variant_filtering.max_ratio_vardict2mut)
    #     info('*' * 70)
    #     info()

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []

        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath, variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' + pass_output_fpath +
                     ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)

            if not cnf.genome.dbsnp_multi_mafs:
                critical('Error: dbsnp_multi_mafs is not specified in the config ' + cnf.sys_cnf)

            cmdl = ('{varfilter_py}' +
                    ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                    ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                    ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                    ' --vcf {sample.anno_vcf_fpath}' +
                    ' --sample {sample.name}' +
                    ' -o {output_dirpath}' +
                    ' --output-file {sample.variants_fpath}' +
                    ' --project-name ' + cnf.project_name +
                    ' --genome {cnf.genome.name}' +
                    ' --work-dir {work_dir}' +
                    ' --debug ' +
                    (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath else '') +
                    (' --reuse ' if cnf.reuse_intermediate else '') +
                    ((' --caller ' + cnf.caller) if cnf.caller else '') +
                    (' --qc' if cnf.qc else ' --no-qc') +
                    (' --no-tsv' if not cnf.tsv else '') +
                    ' --dbsnp-multi-mafs ' + adjust_path(cnf.genome.dbsnp_multi_mafs)
                    ).format(**locals())

            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf, cmdl, job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath, stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)

            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [s for s in not_submitted_samples
                                         if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to '
                         'finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished filtering ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf, cnf.output_dir, samples, cnf.project_name, post_filter=True)

    return variants_fpath, pass_variants_fpath

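# Illustrative sketch, not part of the original module: _filter() above (and
# _annotate() below) submit at most cnf.threads jobs at a time, wait for the
# batch, and loop until every sample is either reused or processed. A
# simplified version of that batching skeleton, with the submission and
# waiting steps stubbed out:
def _example_batched_submission(items, batch_size):
    done = []
    pending = list(items)
    while pending:
        batch, pending = pending[:batch_size], pending[batch_size:]
        # ... submit every item in `batch` and wait for the jobs to finish ...
        done.extend(batch)
    return done
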
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
                        ' --sys-cnf ' + cnf.sys_cnf +
                        ' --run-cnf ' + cnf.run_cnf +
                        ' --project-name ' + cnf.project_name +
                        (' --reuse ' if cnf.reuse_intermediate else '') +
                        ' --log-dir -' +
                        ' --genome ' + cnf.genome.name +
                        (' --no-check ' if cnf.no_check else '') +
                        (' --qc' if cnf.qc else ' --no-qc') +
                        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath, source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(sample.varannotate_dirpath,
                                             add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir, source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl +
                        ' --vcf ' + sample.vcf +
                        ' -o ' + sample.varannotate_dirpath +
                        ' -s ' + sample.name +
                        ' --work-dir ' + work_dir +
                        ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [s for s in not_submitted_samples
                                         if s not in submitted_samples and s not in reused_samples]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to '
                         'finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples
                                 if s not in submitted_samples and s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

def __intermediate_fname(work_dir, fname, suf):
    output_fname = add_suffix(fname, suf)
    return join(work_dir, basename(output_fname))

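# Illustrative sketch, not part of the original module, and an assumption
# about the helper: __intermediate_fname() relies on add_suffix() inserting
# the suffix before the file extension (e.g. 'sample.bam' + 'sorted' ->
# 'sample.sorted.bam'), which matches how its results are used elsewhere in
# this file. A hypothetical standalone equivalent, for reference only:
def _example_add_suffix(fname, suf):
    import os
    base, ext = os.path.splitext(fname)  # 'sample.bam' -> ('sample', '.bam')
    return base + '.' + suf + ext        # -> 'sample.sorted.bam'
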
def generate_flagged_regions_report(cnf, output_dir, sample, ave_depth, gene_by_key):
    depth_threshs = cnf.coverage_reports.depth_thresholds
    report = PerRegionSampleReport(sample=sample,
                                   metric_storage=get_detailed_metric_storage(depth_threshs))
    report.add_record('Sample', sample.name)
    safe_mkdir(sample.flagged_regions_dirpath)
    '''
    1. Detect depth threshold (ave sample coverage * DEPTH_THRESH_FROM_AVE_COV)
    2. Select regions covered in less than MIN_DEPTH_PERCENT_AT_THRESH at threshold
    3. Sort by % at threshold
    4. Select those parts of those regions where % = 0, save to BED
    5. Find HotSpots at those regions
    6. Intersect HotSpots with tracks

    For each gene that has regions with parts where % = 0, sort them by the part where % = 0.
    '''
    # vcf_dbs = ['oncomine', 'dbsnp', 'cosmic']
    vcf_dbs = ['oncomine']

    from source._deprecated_clinical_reporting.clinical_parser import get_key_or_target_bed_genes
    key_genes, _ = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    depth_cutoff = get_depth_cutoff(ave_depth, depth_threshs)
    genes_sorted = sorted(gene_by_key.values())
    min_cov, max_cov = min_and_max_based_on_outliers(genes_sorted)

    for coverage_type in ['low', 'high']:
        info('Selecting and saving ' + coverage_type + ' covered genes')
        selected_genes = []
        if coverage_type == 'low':
            selected_genes = [g for g in genes_sorted if g.gene_name in key_genes and
                              (any(e.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                                   for e in g.get_exons()) or
                               any(a.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                                   for a in g.get_amplicons()))]
        else:
            if max_cov:
                selected_genes = [g for g in genes_sorted if g.gene_name in key_genes and
                                  (any(e.avg_depth > max_cov for e in g.get_exons()) or
                                   any(a.avg_depth > max_cov for a in g.get_amplicons()))]
        for region_type in ['exons', 'target']:
            selected_regions = []
            for gene in selected_genes:
                if coverage_type == 'low':
                    cur_regions = [a for a in (gene.get_amplicons() if region_type == 'target'
                                               else gene.get_exons())
                                   if a.rates_within_threshs[depth_cutoff] < MIN_DEPTH_PERCENT_AT_THRESH
                                   and 'Multi' not in a.feature]
                else:
                    cur_regions = [a for a in (gene.get_amplicons() if region_type == 'target'
                                               else gene.get_exons())
                                   if a.avg_depth > max_cov and 'Multi' not in a.feature]
                selected_regions.extend(cur_regions)

            if selected_regions:
                selected_regions_bed_fpath = join(sample.flagged_regions_dirpath,
                                                  coverage_type + '_cov_' + region_type + '.bed')
                save_regions_to_bed(cnf, selected_regions, selected_regions_bed_fpath)

                # Report coverage for hotspots
                for db in vcf_dbs:
                    res = _report_normalize_coverage_for_variant_sites(
                        cnf, sample, ave_depth, db, selected_regions_bed_fpath,
                        selected_regions, depth_cutoff, region_type, coverage_type)
                    if not res:
                        return None

            report = make_flat_region_report(sample, selected_regions, depth_threshs)
            flagged_txt_fpath = add_suffix(add_suffix(sample.flagged_txt, region_type), coverage_type)
            flagged_tsv_fpath = add_suffix(add_suffix(sample.flagged_tsv, region_type), coverage_type)
            report.save_txt(flagged_txt_fpath)
            report.save_tsv(flagged_tsv_fpath)

            info('')
            info(coverage_type + ' covered ' + region_type + ' (total ' +
                 str(len(selected_regions)) + ') for sample ' + sample.name + ' saved into:')
            info('  ' + flagged_txt_fpath + ', ' + flagged_tsv_fpath)

    return report

def intersect_regions(cnf, bcbio_structures, all_regions, min_samples):
    all_regions_fname = 'all_regions.bed'
    all_regions_bed_fpath = join(
        cnf.output_dir,
        add_suffix(all_regions_fname, str(cnf.min_depth)) if cnf.min_depth else all_regions_fname)

    with open(all_regions_bed_fpath, 'w') as out:
        if not cnf.min_depth:
            out.write('## Coverage threshold Nx is 10x for cell line and 100x for plasma\n')
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) + 'x\n')
        out.write('\t'.join(['#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx',
                             'SamplesSharingSameFeature']) + '\n')
        for region in all_regions:
            out.write('\t'.join([str(val) for val in region]) + '\n')

    regions_overlaps = defaultdict(lambda: defaultdict(list))
    regions = []
    if cnf.tricky_regions:
        intersection_fpath = _intersect_with_tricky_regions(cnf, all_regions_bed_fpath, 'samples')
    else:
        bed_fpath = cnf.bed
        intersection_fpath = join(cnf.work_dir,
                                  splitext(basename(all_regions_bed_fpath))[0] + '_bed.intersect')
        bedtools = get_system_path(cnf, 'bedtools')
        if not cnf.reuse_intermediate or not verify_file(intersection_fpath, silent=True,
                                                         is_critical=False):
            cmdline = '{bedtools} intersect -header -a {all_regions_bed_fpath} -b {bed_fpath} -wo'.format(**locals())
            res = call(cnf, cmdline, output_fpath=intersection_fpath,
                       max_number_of_tries=1, exit_on_error=False)
            if not res:
                return None

    with open(intersection_fpath) as f:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                continue
            fs = l.split('\t')
            chrom, start, end, size, symbol, pct_depth, num_samples = fs[:7]
            overlap_bps = int(fs[-1])
            r = (chrom, start, end, size, symbol, pct_depth, num_samples)
            if cnf.tricky_regions:
                filename = tricky_regions_fnames_d[basename(fs[7]).split('.')[0]]
                regions_overlaps[r][filename].append(overlap_bps)
            else:
                regions_overlaps[r][basename(cnf.bed)].append(overlap_bps)

    for r in all_regions:
        if r in regions_overlaps:
            chrom, start, end, size, symbol, pct_depth, num_samples = r
            overlaps_txt = ', '.join(
                fname + ': %.0f' % (sum(regions_overlaps[r][fname]) / float(size) * 100) + '%'
                for fname in regions_overlaps[r])
            r = list(r)
            r.append(overlaps_txt)
        else:
            r = list(r)
            r.append('')
        regions.append(r)

    os.remove(intersection_fpath)
    return regions

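# Illustrative sketch, not part of the original module: intersect_regions()
# reports, per region, what fraction of its size is covered by each
# overlapping track (`bedtools intersect -wo` puts the overlap length in base
# pairs into the last column). The percentage arithmetic on made-up numbers:
def _example_overlap_pct():
    region_size = 500
    overlap_bps = [120, 80]  # two intersection hits against the same track
    return '%.0f%%' % (sum(overlap_bps) / float(region_size) * 100)  # '40%'
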
def combine_projects(cnf, bcbio_structures, tags=None):
    tag_by_sample = dict()
    if tags:
        for bs, tag in zip(bcbio_structures, tags):
            for s in bs.samples:
                tag_by_sample[s.name] = tag or bs.project_name
    # else:
    #     for bs in bcbio_structures:
    #         for s in bs.samples:
    #             tag_by_sample[s.name] = bs.project_name

    final_dirpath = adjust_path(join(cnf.output_dir, 'final'))
    safe_mkdir(final_dirpath)
    merged_bcbio_cnf = merge_bcbio_yamls(cnf, bcbio_structures)
    samples = [s for bs in bcbio_structures for s in bs.samples]
    dirs_to_reprocess = [source.clinreport_dir, BCBioStructure.var_dir,
                         source.varannotate_name, source.varfilter_name]
    for s in samples:
        sample_dir = join(final_dirpath, s.name)
        sample_var_dirpath = join(sample_dir, BCBioStructure.var_dir)
        safe_mkdir(sample_var_dirpath)
        for file_or_dir in os.listdir(s.dirpath):
            if file_or_dir not in dirs_to_reprocess:
                safe_symlink_to(join(s.dirpath, file_or_dir), sample_dir)
        for fname in os.listdir(s.var_dirpath):
            safe_symlink_to(join(s.var_dirpath, fname), sample_var_dirpath)

    merged_date_dir = join(final_dirpath,
                           merged_bcbio_cnf['fc_date'] + '_' + merged_bcbio_cnf['fc_name'])
    merged_bs_var_dirpath = join(merged_date_dir, BCBioStructure.var_dir)
    merged_bs_raw_var_dirpath = join(merged_bs_var_dirpath, 'raw')
    safe_mkdir(merged_bs_raw_var_dirpath)
    for bs in bcbio_structures:
        for fname in os.listdir(bs.raw_var_dirpath):
            safe_symlink_to(join(bs.raw_var_dirpath, fname), merged_bs_raw_var_dirpath)

    variants_fpaths = []
    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name='vardict')
    variants_fpath = join(merged_bs_var_dirpath, vardict_txt_fname)
    pass_variants_fpath = add_suffix(variants_fpath, variant_filtering.mut_pass_suffix)
    reject_variants_fpath = add_suffix(variants_fpath, variant_filtering.mut_reject_suffix)

    cnf.steps = ['Variants']
    # Re-filtering: perform cohort-based filtering only within sub-projects
    for bs_i, bs in enumerate(bcbio_structures):
        correct_bs = BCBioStructure(cnf, cnf.output_dir, bs.bcbio_cnf, final_dirpath)
        bcbio_runner = BCBioRunner(cnf, correct_bs, bs.bcbio_cnf)
        bcbio_runner.post_jobs()
        bs_raw_variants_fpath = add_suffix(variants_fpath, str(bs_i))
        pass_bs_variants_fpath = add_suffix(bs_raw_variants_fpath, variant_filtering.mut_pass_suffix)
        reject_bs_variants_fpath = add_suffix(bs_raw_variants_fpath,
                                              variant_filtering.mut_reject_suffix)
        shutil.move(variants_fpath, bs_raw_variants_fpath)
        shutil.move(pass_variants_fpath, pass_bs_variants_fpath)
        shutil.move(reject_variants_fpath, reject_bs_variants_fpath)
        variants_fpaths.append(bs_raw_variants_fpath)

    merged_bs = BCBioStructure(cnf, cnf.output_dir, merged_bcbio_cnf, final_dirpath)
    merged_samples = [s for s in merged_bs.samples]
    cnf.variant_filtering.max_ratio = 1
    combine_results(cnf, merged_samples, variants_fpaths, variants_fpath,
                    pass_variants_fpath=pass_variants_fpath)

    for variants_fpath in variants_fpaths:
        safe_remove(variants_fpath)
        pass_fpath = add_suffix(variants_fpath, variant_filtering.mut_pass_suffix)
        safe_remove(pass_fpath)
        reject_fpath = add_suffix(variants_fpath, variant_filtering.mut_reject_suffix)
        safe_remove(reject_fpath)

    cnf.reuse_intermediate = True
    cnf.steps = ['Seq2C', 'Summary']
    BCBioRunner(cnf, merged_bs, merged_bs.bcbio_cnf).post_jobs()

def main():
    if len(sys.argv) < 4:
        info('The script writes all CDS, stop codon, and ncRNA exon regions for all known '
             'Ensembl genes, with associated gene symbols.')
        # info('When the gene name is found in HGNC, it gets replaced with an approved name.')
        # info('If the gene is not characterized (like LOC729737), this symbol is just kept as is.')
        info('')
        info('Usage:')
        info('    ' + __file__ + ' hg19 db.gtf output.bed [HGNC_gene_synonyms.txt=' +
             us_syn_path + '] [additional_feature_list]')
        info('')
        info('    where HGNC_gene_synonyms.txt (from http://www.genenames.org/cgi-bin/download) is:')
        info('    #Approved Symbol  Previous Symbols                    Synonyms                          Chromosome  Ensembl Gene ID  UCSC ID(supplied by UCSC)')
        info('    OR7E26P           OR7E67P, OR7E69P, OR7E70P, OR7E68P  OR1-51, OR1-72, OR1-73, OR912-95  19q13.43    ENSG00000121410  uc002qsg.3')
        info('    ...')
        info('')
        info('    or DB is Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz')
        info('    1  pseudogene            gene        11869  14412  .  +  .  gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";')
        info('    1  processed_transcript  transcript  11869  14409  .  +  .  gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";')
        info('    ...')
        info('')
        info('    or DB is RefSeq GFF3 ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz')
        info('    NC_000001.10  RefSeq      region      1      249250621  .  +  .  ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA')
        info('    NC_000001.10  BestRefSeq  gene        11874  14409      .  +  .  ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true')
        info('    NC_000001.10  BestRefSeq  transcript  11874  14409      .  +  .  ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('    NC_000001.10  BestRefSeq  exon        11874  12227      .  +  .  ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2')
        info('    ...')
        info('')
        info('    or either RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables) is:')
        info('    #hg19.knownGene.name  hg19.knownGene.chrom  hg19.knownGene.strand  hg19.knownGene.txStart  hg19.knownGene.txEnd  hg19.knownGene.exonCount  hg19.knownGene.exonStarts  hg19.knownGene.exonEnds  hg19.kgXref.geneSymbol')
        info('    uc001aaa.3            chr1                  +                      11873                   14409                 3                         11873,12612,13220,         12227,12721,14409,       DDX11L1')
        info('    ...')
        info('')
        info('    Writes to Exons.bed')
        info('')
        info('See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols')
        sys.exit(1)

    genome_name = sys.argv[1]
    seq_fpath = hg19_seq_fpath if genome_name == 'hg19' else hg38_seq_fpath
    canonical_transcripts_fpath = canonical_hg19_transcripts_fpath if genome_name == 'hg19' \
        else canonical_hg38_transcripts_fpath
    chr_lengths = get_chr_lengths_from_seq(seq_fpath)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    input_fpath = verify_file(sys.argv[2])
    output_fpath = adjust_path(sys.argv[3])

    synonyms_fpath = None
    if len(sys.argv) > 4:
        synonyms_fpath = verify_file(sys.argv[4])
        info('Synonyms file provided ' + synonyms_fpath)
    else:
        info('No synonyms file provided, skipping approving')

    not_approved_fpath = None
    if len(sys.argv) > 5:
        not_approved_fpath = adjust_path(sys.argv[5])

    with open(verify_file(canonical_transcripts_fpath)) as f:
        canonical_transcripts_ids = set(l.strip().split('.')[0] for l in f)

    info('Reading the features...')
    with open_gzipsafe(input_fpath) as inp:
        l = inp.readline()
        # Pick the parser by the *input* file extension (the original checked
        # output_fpath here, which looks like a bug: per the usage above, the
        # output is always a BED file)
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_fpath, chr_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_fpath, chr_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_fpath, chr_order)

    if synonyms_fpath and synonyms_fpath != "''":
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom,
                                                                   synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    # with open('serialized_genes.txt', 'w') as f:
    #     for g in gene_by_name.values():
    #         f.write(str(g) + '\t' + str(g.db_id) + '\n')
    #         for e in g.exons:
    #             f.write('\t' + str(e) + '\n')

    info('Found:')
    info('  ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    coding_and_mirna_genes = [g for g in genes
                              if any(t.biotype in ['protein_coding', 'miRNA']
                                     for t in g.transcripts)]
    coding_genes = [g for g in coding_and_mirna_genes
                    if any(t.biotype == 'protein_coding' for t in g.transcripts)]
    coding_transcripts = [t for g in coding_and_mirna_genes for t in g.transcripts
                          if t.biotype == 'protein_coding']
    mirna_genes = [g for g in coding_and_mirna_genes
                   if any(t.biotype == 'miRNA' for t in g.transcripts)]
    mirna_transcripts = [t for g in coding_and_mirna_genes for t in g.transcripts
                         if t.biotype == 'miRNA']
    codingmiRNA_genes = [g for g in coding_and_mirna_genes
                         if any(t.biotype == 'miRNA' for t in g.transcripts)
                         and any(t.biotype == 'protein_coding' for t in g.transcripts)]
    info('  ' + str(len(coding_genes)) + ' coding genes')
    info('  ' + str(len(coding_transcripts)) + ' coding transcripts')
    info('  ' + str(len(mirna_genes)) + ' miRNA genes')
    info('  ' + str(len(mirna_transcripts)) + ' miRNA transcripts')
    info('  ' + str(len(codingmiRNA_genes)) + ' genes with both coding and miRNA transcripts')
    info()

    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)
    info()

    info('Sorting and printing all regions...')
    print_genes(genes, output_fpath, canon_only=False)
    info()

    info('Sorting and printing canonical regions...')
    canon_output_fpath = add_suffix(output_fpath, 'canon')
    print_genes(canon_genes, canon_output_fpath, canon_only=True)
    info()

    info('Saved all regions to\n   ' + output_fpath + '\n   ' + canon_output_fpath)

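# Illustrative sketch, not part of the original module: the canonical
# transcript list above is read with versions stripped
# ('ENST00000456328.2' -> 'ENST00000456328'), making the later match against
# transcript IDs version-insensitive. The normalization in isolation:
def _example_strip_transcript_version():
    lines = ['ENST00000456328.2\n', 'ENST00000450305.2\n']  # made-up file content
    return set(l.strip().split('.')[0] for l in lines)
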
def _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples, mutations, key_or_target_genes):
    """Build per-region HTML reports of under- and over-covered ("flagged") regions across samples.
    """
    region_types = ['exons', 'target']
    coverage_types = ['low', 'high']

    flagged_regions_metrics = [
        Metric('Gene', min_width=50, max_width=70),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
        Metric('Position', td_class='td_position', min_width=70, max_width=120),
        Metric('Ave depth', td_class='long_expanded_line right_aligned', max_width=100, with_heatmap=False),
        Metric('#HS', quality='Less is better', align='right', max_width=30),
        Metric('Hotspots & Deleterious', td_class='long_expanded_line', min_width=100, max_width=150),
        Metric('Found mutations', td_class='long_expanded_line', min_width=150, max_width=200),
        Metric('Samples', td_class='long_expanded_line', min_width=100, max_width=120),
        Metric('Possible reasons', td_class='long_expanded_line', max_width=120),
    ]
    flagged_regions_metric_storage = MetricStorage(sections=[ReportSection(metrics=flagged_regions_metrics)])
    flagged_regions_report_dirpath = bcbio_structure.flagged_regions_dirpath
    safe_mkdir(flagged_regions_report_dirpath)

    if key_or_target_genes == 'target':
        genes_description = 'genes'
    else:
        genes_description = 'genes that have been previously implicated in various cancers'

    for region_type in region_types:
        regions_dict = {}
        total_regions = 0
        info()
        info('Preparing report for ' + region_type)

        for coverage_type in coverage_types:
            regions_by_gene = {}
            for sample in samples:
                selected_regions_bed_fpath = join(sample.flagged_regions_dirpath,
                                                  coverage_type + '_cov_' + region_type + '.bed')
                # regions_by_reasons maps (start, end) spans of "tricky" regions
                # to the lists of reasons they were flagged for
                regions_by_reasons = {}
                if verify_file(selected_regions_bed_fpath, is_critical=False):
                    intersection_fpath = _intersect_with_tricky_regions(cnf, selected_regions_bed_fpath, sample.name)
                    regions_by_reasons = _parse_intersection_with_tricky_regions(cnf, intersection_fpath)

                total_report_fpath = add_suffix(add_suffix(sample.flagged_tsv, region_type), coverage_type)
                if verify_file(total_report_fpath, is_critical=False):
                    with open(total_report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            (chrom, start, end, size, gene, strand, feature, biotype,
                             min_depth, avg_depth) = fs[:10]
                            start, end = int(start), int(end)
                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name], avg_depth=[avg_depth],
                                                gene_name=gene, strand=strand, feature=feature,
                                                biotype=biotype, chrom=chrom, start=start, end=end)
                            for r in regions_by_reasons:
                                if r[0] <= start and end <= r[1]:
                                    cur_region.extra_fields = regions_by_reasons[r]
                            cur_region.missed_by_db = []
                            # fold a region fully contained in an already collected
                            # region into it, instead of adding a duplicate row
                            was_added = False
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    was_added = True
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append(avg_depth)
                            if not was_added:
                                regions_by_gene[gene].append(cur_region)

                report_fpath = join(sample.flagged_regions_dirpath,
                                    coverage_type + '_cov_' + region_type + '.oncomine.tsv')
                if verify_file(report_fpath, is_critical=False):
                    with open(report_fpath) as f:
                        for l in f:
                            l = l.strip()
                            if not l or l.startswith('#'):
                                continue
                            fs = l.split('\t')
                            hotspots = []
                            (gene, chrom, start, end, strand, feature, biotype,
                             id_, num_hotspots) = fs[:9]
                            start, end = int(start), int(end)
                            if int(num_hotspots) != 0:
                                hotspots = fs[9].split()
                            regions_by_gene.setdefault(gene, [])
                            cur_region = Region(sample_name=[sample.name], gene_name=gene,
                                                strand=strand, feature=feature, biotype=biotype,
                                                chrom=chrom, start=start, end=end)
                            # oncomine rows only enrich already collected regions;
                            # rows that do not fall into any of them are skipped
                            for r in regions_by_gene[gene]:
                                if r.start <= cur_region.start <= r.end and r.start <= cur_region.end <= r.end:
                                    if sample.name not in r.sample_name:
                                        r.sample_name.append(sample.name)
                                        r.avg_depth.append('.')
                                    new_hotspots = [hs for hs in hotspots if hs not in r.missed_by_db]
                                    r.missed_by_db.extend(new_hotspots)

            flagged_regions_report = PerRegionSampleReport(name='Flagged regions',
                                                           metric_storage=flagged_regions_metric_storage)
            num_regions = 0
            non_hs_class = ' no_hotspots'
            slash_with_zero_space = '/​'  # '/' followed by a zero-width space (U+200B), so long cells can wrap

            for gene in regions_by_gene.keys():
                if regions_by_gene[gene]:
                    num_regions += len(regions_by_gene[gene])
                    row_class = ' expandable_row collapsed'
                    if len(regions_by_gene[gene]) > 1:
                        # a summary row for the whole gene, expandable into per-region rows
                        reg = flagged_regions_report.add_row()
                        reg.class_ = ' expandable_gene_row collapsed'
                        chrom = regions_by_gene[gene][0].chrom
                        num_hotspots = [len(r.missed_by_db) for r in regions_by_gene[gene]]
                        all_samples = [sample for r in regions_by_gene[gene] for sample in r.sample_name]
                        # deduplicate sample names, preserving order of first appearance
                        all_unique_samples = []
                        for sample in all_samples:
                            if sample not in all_unique_samples:
                                all_unique_samples.append(sample)
                        all_tricky_regions = sorted(set(tricky_region
                                                        for r in regions_by_gene[gene]
                                                        for tricky_region in r.extra_fields))
                        all_depths = [[] for _ in all_unique_samples]
                        for r in regions_by_gene[gene]:
                            for sample_num, sample in enumerate(all_unique_samples):
                                if sample in r.sample_name:
                                    cur_sample_index = r.sample_name.index(sample)
                                    if r.avg_depth[cur_sample_index] != '.':
                                        all_depths[sample_num].append(float(r.avg_depth[cur_sample_index]))
                        avg_depth_per_samples = [sum(depths) / len(depths) if depths else 0
                                                 for depths in all_depths]
                        reg.add_record('Gene', gene)
                        reg.add_record('Chr', chrom.replace('chr', ''))
                        reg.add_record('#HS', sum(num_hotspots))
                        reg.add_record('Position', str(len(regions_by_gene[gene])) + ' regions')
                        reg.add_record('Ave depth',
                                       slash_with_zero_space.join(format(depth, '.2f')
                                                                  for depth in avg_depth_per_samples),
                                       num=sum(avg_depth_per_samples) / len(avg_depth_per_samples))
                        reg.add_record('Hotspots & Deleterious', '')
                        reg.add_record('Possible reasons', ', '.join(all_tricky_regions))
                        reg.add_record('Samples', ',\n'.join(all_unique_samples))
                        reg.add_record('Found mutations', '')
                        if sum(num_hotspots) == 0:
                            reg.class_ += non_hs_class
                            row_class += ' row_to_hide row_hidden'
                        else:
                            row_class += ' not_to_hide'

                    for r in regions_by_gene[gene]:
                        reg = flagged_regions_report.add_row()
                        reg.class_ = row_class
                        reg.add_record('Gene', r.gene_name)
                        reg.add_record('Chr', r.chrom.replace('chr', ''))
                        avg_depths = [float(depth) for depth in r.avg_depth if depth != '.']
                        # guard against regions whose only recorded depths are '.'
                        reg.add_record('Ave depth',
                                       slash_with_zero_space.join(format(depth, '.2f')
                                                                  for depth in avg_depths),
                                       num=sum(avg_depths) / len(avg_depths) if avg_depths else None)
                        reg.add_record('Position',
                                       Metric.format_value(r.start, human_readable=True, is_html=True) + '-' +
                                       Metric.format_value(r.end, human_readable=True, is_html=True))
                        reg.add_record('#HS', len(r.missed_by_db))
                        if len(r.missed_by_db) == 0:
                            reg.class_ += non_hs_class
                        # group hotspots by position: 'pos: change1,change2,...'
                        uniq_hs_positions = sorted(set(hotspot.split(':')[0] for hotspot in r.missed_by_db))
                        hs_by_pos = {pos: [h.split(':')[1] for h in r.missed_by_db
                                           if h.split(':')[0] == pos]
                                     for pos in uniq_hs_positions}
                        hs_breakable = [gray(Metric.format_value(int(pos.replace(',', '')),
                                                                 human_readable=True, is_html=True)) +
                                        ': ' + ','.join(h.replace('/', slash_with_zero_space)
                                                        for h in hs_by_pos[pos])
                                        for pos in uniq_hs_positions]
                        reg.add_record('Hotspots & Deleterious', '\n'.join(hs_breakable))
                        reg.add_record('Possible reasons', ', '.join(r.extra_fields))
                        reg.add_record('Samples', ',\n'.join(r.sample_name))
                        found_mutations = []
                        for sample in samples:
                            if sample.name in r.sample_name:
                                for mut in mutations[sample.name]:
                                    if mut.gene.name == r.gene_name and r.start <= mut.pos <= r.end:
                                        found_mutations.append(
                                            gray(Metric.format_value(mut.pos, human_readable=True, is_html=True)) +
                                            ':' + mut.ref + '>' + mut.alt + ' (' + sample.name + ')')
                        reg.add_record('Found mutations', '\n'.join(found_mutations))

            flagged_regions_report.expandable = True
            flagged_regions_report.unique = True

            regions_dict[coverage_type] = create_section(flagged_regions_report, num_regions,
                                                         regions_by_gene.keys(), region_type)
            total_regions += num_regions

        flagged_report_fpath = join(flagged_regions_report_dirpath, 'flagged_' + region_type + '.html')
        write_static_html_report(cnf, {
            'key_or_target': key_or_target_genes,
            'region_type': region_type,
            'genes_description': genes_description,
            'flagged_low': regions_dict['low'],
            'flagged_high': regions_dict['high'],
        }, flagged_report_fpath,
            tmpl_fpath=join(dirname(abspath(__file__)), 'template_flagged_regions.html'),
            extra_js_fpaths=[join(dirname(abspath(__file__)), 'static', 'flagged_regions.js')],
            extra_css_fpaths=[join(dirname(abspath(__file__)), 'static', 'flagged_regions.css')])
        #BaseReport.save_html(flagged_regions_report, cnf, flagged_report_fpath, caption='Flagged regions')
        info('')
        info('Flagged regions (total ' + str(total_regions) + ' ' + region_type + ') saved into:')
        info('  ' + flagged_report_fpath)
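
# To illustrate the merging rule above (hypothetical values): a region coming
# from a second sample that is fully contained in an already collected region
# is folded into it rather than added as a new row, e.g.:
#
#     r = Region(sample_name=['sample_1'], avg_depth=['12.40'], gene_name='TP53',
#                chrom='chr17', start=7577000, end=7578000, strand='-',
#                feature='CDS', biotype='protein_coding')
#     # after seeing an enclosed region from sample_2 with average depth 9.10:
#     #   r.sample_name == ['sample_1', 'sample_2']
#     #   r.avg_depth   == ['12.40', '9.10']
#
# and the 'Ave depth' cell then shows one value per sample, joined with a
# breakable slash.
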
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    """Export mutation and Seq2C data to the Data Query Tool and return an Oncoprints URL.
    """
    if is_us():
        loc = exposing.us
    # elif is_uk():
    #     loc = exposing.uk
    else:
        loc = exposing.local
        return None

    if not bcbio_structure.variant_callers:
        info('No variant calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exist.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)
    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub(r'[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath +
            ', not generating Oncoprints.')
        return None
    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', 'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'.format(**locals()))

    info()
    info('Information about the study was added to the Data Query Tool, URL: ' + data_query_url)
    return data_query_url
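
# For illustration, with study_name='proj_1' and altered_genes=['TP53', 'EGFR']
# (hypothetical values; loc.website_url_base depends on the deployment), the
# returned link would look like:
#
#     <loc.website_url_base>/DataQueryTool/DataQuery.pl?analysis=oncoprint&study=proj_1&gene=TP53%0D%0AEGFR&order=on&freq=50&nocheckgenes=true&submit=Submit
#
# '%0D%0A' is a URL-encoded CRLF, which the Data Query Tool apparently accepts
# as the separator between gene names.
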
def run_annotators(cnf, vcf_fpath, bam_fpath):
    """Annotate a VCF with the configured databases via bcftools, then dbNSFP, SnpEff,
    track files, and optional intersections; return the path of the final VCF.
    """
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    # strip existing IDs before re-annotating
    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf, cmdl.format(**locals()), output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get('custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf, cmdl.format(**locals()), output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            # move the SnpEff summary and gene reports into the output directory
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath):
                os.remove(final_summary_fpath)
            if isfile(final_genes_fpath):
                os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if cnf.annotation.get('tracks'):
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            else:
                # fall back to the track registered for the genome build
                if 'tracks' in cnf['genome'] and cnf['genome']['tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf, input_fpath=vcf_fpath, db_fpath=db_fpath, key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath
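
# A sketch of the bcftools commands this function builds for one database,
# assuming a dbsnp section with annotations ['SAO', 'CAF'] (illustrative values;
# the real ones come from cnf.annotation and get_db_path):
#
#     bcftools annotate --remove ID input.vcf.gz
#     bcftools annotate -a dbsnp.vcf.gz -c INFO/SAO,INFO/CAF,=ID input.rmid.vcf.gz
#
# Each successful step replaces vcf_fpath with its bgzipped and tabixed output,
# so annotations accumulate in a single VCF that is verified and returned.
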