def main():
    """Entry point: build gene-expression heatmap reports for a bcbio project.

    For each counts table listed in ``bcbio_structure.counts_names`` the
    annotated table is generated from the raw combined counts when it is
    missing, and an HTML heatmap report is written into the ``html``
    sub-directory of the expression directory.
    """
    info(' '.join(sys.argv))
    info()

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        'expression', BCBioStructure.expression_dir)

    step_greetings('Gene expression heatmaps summary for all samples')

    captions = ['Gene counts', 'Exon counts', 'Gene TPM', 'Isoform TPM']
    genes_dict, transcripts_dict = _get_gene_transcripts_id(cnf)

    for fname, caption in zip(bcbio_structure.counts_names, captions):
        base_name = fname.replace('.tsv', '')
        counts_fpath = join(bcbio_structure.expression_dirpath, fname)

        # Annotate the raw combined table only when the final one is absent.
        if not verify_file(counts_fpath, silent=True):
            raw_fpath = join(bcbio_structure.expression_dirpath, 'raw',
                             'combined.' + base_name)
            info('Annotating ' + caption + ' from ' + raw_fpath)
            annotate_gene_counts(cnf, raw_fpath, counts_fpath, genes_dict)
        verify_file(counts_fpath, is_critical=True, description=fname)

        # Truthy (the counts path itself) only for the isoform-level table.
        isoforms_found = fname == 'isoform.sf.tpm' and counts_fpath
        if isoforms_found:
            used_dict = transcripts_dict
        else:
            used_dict = genes_dict

        html_dirpath = safe_mkdir(
            join(bcbio_structure.expression_dirpath, 'html'))
        report_fpath = join(html_dirpath, base_name + '.html')

        make_gene_expression_heatmaps(
            cnf, bcbio_structure, counts_fpath, used_dict, report_fpath,
            caption, keep_gene_names=isoforms_found)

    info('Done')
def _rename_fields(cnf, inp_tsv_fpath, field_map):
    """Rewrite the header line of a TSV file, renaming fields via ``field_map``.

    Header names found in ``field_map`` (with a truthy replacement) are
    substituted; all other names and all data lines are copied unchanged.

    When ``cnf`` has ``keep_intermediate`` set, the result goes to a new
    intermediate file (suffix ``renamed``); otherwise the input file is
    replaced in place through ``file_transaction``.

    Returns the path of the file that holds the renamed table.
    """
    keep_intermediate = cnf.get('keep_intermediate')
    if keep_intermediate:
        step_greetings('Renaming fields.')

    with open(inp_tsv_fpath) as inp:
        header = inp.readline()
    new_header = '\t'.join(
        field_map.get(name) or name for name in header.split())

    if keep_intermediate:
        out_tsv_fpath = intermediate_fname(cnf, inp_tsv_fpath, 'renamed')
    else:
        out_tsv_fpath = inp_tsv_fpath

    with file_transaction(cnf.work_dir, out_tsv_fpath) as tx_out_fpath:
        with open(tx_out_fpath, 'w') as out, open(inp_tsv_fpath) as inp:
            out.write(new_header + '\n')
            next(inp, None)  # skip the old header line
            for line in inp:
                out.write(line)

    # In the in-place case out_tsv_fpath *is* inp_tsv_fpath, so the former
    # shutil.move(out, inp) moved a file onto itself; it is not needed.
    return out_tsv_fpath
def _tracks(cnf, track_fpath, input_fpath):
    """Annotate a VCF with a track file using ``vcfannotate``.

    The INFO key is the track file's base name.  After annotation the value
    of that key is normalised to ``TRUE`` (non-empty value) or ``FALSE``
    (empty value) on every record line.

    Returns the resulting VCF path.  Returns ``None`` when the track file is
    not valid or ``vcfannotate`` is not found.  If the tool output fails
    ``verify_vcf``, whatever ``call_subprocess`` returned is passed back
    unchanged.
    """
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify path in system_config, or '
            'run load bcbio: . /group/ngs/bin/bcbio-prod.sh"')
        return None

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        toolpath=toolpath, track_fpath=track_fpath,
        field_name=field_name, input_fpath=input_fpath)

    assert input_fpath
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' + track_fpath)
        return output_fpath

    def proc_line(line, i):
        """Set the track's INFO value to TRUE or FALSE on record lines."""
        if field_name not in line or line.startswith('#'):
            return line

        fields = line.split('\t')
        new_attrs = []
        for attr in fields[7].split(';'):
            # Split on the first '=' only: a value may itself contain '=',
            # and splitting on every '=' used to drop such values entirely.
            key, sep, value = attr.partition('=')
            if sep and key == field_name:
                attr = key + '=' + ('TRUE' if value else 'FALSE')
            new_attrs.append(attr)
        fields = fields[:7] + [';'.join(new_attrs)] + fields[8:]
        return '\t'.join(fields)

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
def process_one(cnf):
    """Annotate the single VCF described by ``cnf``.

    Steps: fix the sample name in the VCF, drop rejected records, run all
    configured annotators, then finalize the annotated file.

    Returns whatever ``finialize_annotate_file`` returns.
    """
    sample = VarSample(cnf.sample, cnf.output_dir,
                       vcf=cnf.vcf, bam=cnf.bam, genome=cnf.genome)

    step_greetings('Fixing "SAMPLE" INFO annotation and SAMPLE header...')
    # Per the original note, this call also gunzips the VCF file.
    fixed_vcf_fpath = fix_vcf_sample_name(cnf, sample.name, cnf.vcf)

    step_greetings('Removing rejeted records...')
    pass_vcf_fpath = remove_rejected(cnf, fixed_vcf_fpath)
    info()

    # Previously disabled steps (chromosome renaming, explicit gunzip,
    # dropping the mutect "none" sample column, moving the main sample
    # column first) are intentionally not performed here.

    annotated_vcf_fpath = run_annotators(cnf, pass_vcf_fpath, sample.bam)
    return finialize_annotate_file(cnf, annotated_vcf_fpath, sample, cnf.caller)
def make_tsv(cnf, vcf_fpath, samplename, main_sample_index=None):
    """Export a VCF to TSV.

    The VCF is first converted by ``vcf_one_per_line``.  When
    ``main_sample_index`` is not given it is looked up by ``samplename``,
    falling back to 0.

    Returns the value produced by ``_extract_fields`` (possibly falsy).
    """
    step_greetings('Exporting to TSV...')

    one_per_line_vcf = vcf_one_per_line(cnf, vcf_fpath)

    if main_sample_index is None:
        found_index = get_sample_column_index(one_per_line_vcf, samplename)
        main_sample_index = found_index or 0

    return _extract_fields(cnf, one_per_line_vcf, samplename, main_sample_index)
def run_seq2c(cnf, output_dirpath, samples, seq2c_bed, is_wgs):
    """Run the Seq2C stages and return the path of the final report.

    Stages: collect mapped-read counts, compute the combined coverage
    table, run the final Seq2C scripts.  Samples without a BAM are reported
    with ``err`` and left out of the BAM mapping.

    Returns the report path, or ``None`` as soon as any stage yields a
    falsy result.
    """
    step_greetings('Running Seq2C')

    bams_by_sample = dict()
    for s in samples:
        if s.bam:
            bams_by_sample[s.name] = s.bam
        else:
            err('No BAM file for ' + s.name)

    # (A BAM de-duplication pre-step used to live here; it is disabled.)

    info('Getting reads and cov stats')
    mapped_read_fpath, samples = __get_mapped_reads(
        cnf, samples, bams_by_sample,
        join(output_dirpath, 'mapped_reads_by_sample.tsv'))
    info()
    if not mapped_read_fpath:
        return None

    combined_gene_depths_fpath = __seq2c_coverage(
        cnf, samples, bams_by_sample, seq2c_bed, is_wgs,
        join(output_dirpath, 'cov.tsv'))
    info()
    if not combined_gene_depths_fpath:
        return None

    seq2c_report_fpath = __final_seq2c_scripts(
        cnf, mapped_read_fpath, combined_gene_depths_fpath,
        join(output_dirpath, source.seq2c_name + '.tsv'))
    if not seq2c_report_fpath:
        return None

    info('Done. The results is ' + seq2c_report_fpath)
    return seq2c_report_fpath
def _filter_malformed_fields(cnf, input_fpath):
    """Clean up malformed INFO values and header lines of a VCF.

    Two passes are made:

    1. per record: a leading and/or trailing ``'.'`` element is stripped
       from every list-valued INFO field;
    2. per line: in header lines the sequence ``' ">`` is replaced with
       ``'">`` (needed for vcf-merge).

    Returns the path produced by the second pass.
    """
    step_greetings('Correcting malformed fields...')

    def proc_rec(rec):
        # Snapshot the items so reassigning values is safe while looping.
        for key, val in list(rec.INFO.items()):
            # An empty list has nothing to strip; indexing it would raise.
            if not isinstance(val, list) or not val:
                continue
            if val[-1] == '.':
                rec.INFO[key] = rec.INFO[key][:-1]
            if val[0] == '.':
                rec.INFO[key] = rec.INFO[key][1:]
        return rec

    def proc_line(line, i):
        if line.startswith('#'):
            return line.replace("\' \">", "\'\">")  # For vcf-merge
        return line

    info('Correcting INFO fields...')
    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec, suffix='corr')
    info('')
    info('Correcting headers for vcf-merge...')
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='corr_headr')
    return output_fpath
def main():
    """Entry point: write the combined FastQC HTML report for all samples."""
    info(' '.join(sys.argv))
    info()

    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.fastqc_name, BCBioStructure.fastqc_dir)

    step_greetings('FastQC summary for all samples')

    report_fname = source.fastqc_name + '.html'
    final_summary_report_fpath = join(cnf.output_dir, report_fname)

    write_fastqc_combo_report(
        cnf, final_summary_report_fpath, bcbio_structure.samples)

    # Final banner pointing at the generated report.
    info()
    info('*' * 70)
    info('Fastqc summary:')
    info(' ' + final_summary_report_fpath)
def fix_chromosome_names(cnf, vcf_fpath):
    """Prefix chromosome names with ``chr`` unless the VCF already uses them.

    If any non-header line starts with ``chr`` the input path is returned
    untouched.  Otherwise every record whose CHROM lacks the prefix gets it
    added, and the path of the rewritten file is returned.
    """
    with open(vcf_fpath) as vcf:
        data_lines = (l for l in vcf if not l.startswith('#'))
        already_prefixed = any(l.startswith('chr') for l in data_lines)

    if already_prefixed:
        info('Chomosome names are hg19, no need to fix.')
        return vcf_fpath

    step_greetings('Fixing chromosome names')

    def _add_chr_prefix(rec):
        if not rec.CHROM.startswith('chr'):
            rec.CHROM = 'chr' + rec.CHROM
        return rec

    out_fpath = iterate_vcf(cnf, vcf_fpath, _add_chr_prefix, 'chr')
    if not verify_file(out_fpath):
        err('Could not run fix_chromosome_names')
    return out_fpath
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    """Set ``INFO[key] = value`` on every record and declare it in the header.

    A matching ``##INFO`` meta line (built from ``key``, ``number``,
    ``type_`` and ``description``) is inserted right before the ``#CHROM``
    line.

    Returns the result of the final critical ``verify_vcf`` call.
    """
    step_greetings('Adding annotation...')

    def _set_info_value(rec):
        rec.INFO[key] = value
        return rec

    annotated_fpath = iterate_vcf(cnf, input_fpath, _set_info_value)

    info('Adding header meta info...')

    def _insert_info_header(line, i):
        if not line.startswith('#CHROM'):
            return line
        meta_line = '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
            key=key, number=number, type_=type_, desc=description)
        return meta_line + line

    with_header_fpath = iterate_file(cnf, annotated_fpath, _insert_info_header)
    return verify_vcf(with_header_fpath, is_critical=True)
def draw_plots(cnf, vcf_fpath):
    """Draw the quality-control plots for a VCF.

    Produces the substitutions, indel-lengths and variant-distribution
    plots.

    Returns the list of plot paths that are not ``None``, ordered as
    variant distribution, substitutions, indels.
    """
    step_greetings('Quality control plots')

    chr_lengths = get_chr_lengths(cnf)
    variants_per_kbp = cnf['quality_control'].get('variant_distribution_scale')
    plot_scale = 1000 * variants_per_kbp

    info()
    info('Subsitutions and indel stats...')
    stats = _get_subs_and_indel_stats(vcf_fpath, chr_lengths, plot_scale)
    variants_distribution, substituitions, indel_lengths = stats

    substs_plot_fpath = _draw_substitutions(cnf, substituitions)
    indels_plot_fpath = _draw_indel_lengths(cnf, indel_lengths)
    if substs_plot_fpath:
        info(' Substitutions: ' + substs_plot_fpath)
    if indels_plot_fpath:
        info(' Indels: ' + indels_plot_fpath)

    distr_plot_fpath = _draw_variants_distribution(
        cnf, variants_distribution, chr_lengths, variants_per_kbp)
    if distr_plot_fpath:
        info(' Variant distr: ' + distr_plot_fpath)

    plot_fpaths = []
    for fpath in (distr_plot_fpath, substs_plot_fpath, indels_plot_fpath):
        if fpath is not None:
            plot_fpaths.append(fpath)
    return plot_fpaths
def run_seq2c_bcbio_structure(cnf, bcbio_structure):
    """Run Seq2C for all samples of a bcbio project.

    The BED file for Seq2C is chosen as follows: when ``cnf.prep_bed`` is
    ``False`` the configured ``cnf.bed`` is verified and used as is;
    otherwise BEDs are prepared from the project's target/SV BEDs, or the
    genome CDS BED is used when no target BED is configured.

    Returns a one-element list holding the Seq2C report path (which may be
    ``None`` when Seq2C did not produce a report).
    """
    step_greetings('Coverage statistics for each gene for all samples')

    if cnf.prep_bed is False:
        seq2c_bed = verify_bed(cnf.bed)
    else:
        info('Preparing BED files')
        features_bed_fpath = cnf.features or cnf.genome.features  # only for annotation
        if cnf.bed or bcbio_structure.bed:
            prepared = prepare_beds(cnf,
                                    features_bed=features_bed_fpath,
                                    target_bed=bcbio_structure.bed,
                                    seq2c_bed=bcbio_structure.sv_bed)
            _, _, _, seq2c_bed = prepared
        else:
            seq2c_bed = verify_bed(cnf.genome.cds)

    info('Calculating normalized coverages for CNV...')
    cnv_dirpath = join(bcbio_structure.date_dirpath, BCBioStructure.cnv_dir)
    cnv_report_fpath = run_seq2c(
        cnf, cnv_dirpath, bcbio_structure.samples, seq2c_bed,
        is_wgs=cnf.is_wgs)

    # (Per-sample Seq2C plotting used to follow here; it is disabled.)

    info()
    info('*' * 70)
    if cnv_report_fpath:
        info('Seq2C:')
        info(' ' + cnv_report_fpath)

    return [cnv_report_fpath]
def _snpsift_db_nsfp(cnf, input_fpath):
    """Annotate a VCF with dbNSFP through ``snpsift dbnsfp``.

    Returns the verified output VCF path, or ``None`` when dbNSFP is not
    configured (in ``cnf.annotation`` and ``cnf.genome``), the database file
    is invalid, or the subprocess call yields a falsy result.
    """
    dbnsfp_configured = 'dbnsfp' in cnf.annotation and 'dbnsfp' in cnf.genome
    if not dbnsfp_configured:
        return None

    step_greetings('DB SNFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    snpsift = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []
    if annotations:
        ann_line = '-f ' + ','.join(annotations)
    else:
        ann_line = ''

    cmdline = ('{executable} dbnsfp {ann_line} -v -db {db_path} '
               '{input_fpath}').format(
        executable=snpsift, ann_line=ann_line,
        db_path=db_path, input_fpath=input_fpath)

    succeeded = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                stdout_to_outputfile=True,
                                exit_on_error=False, overwrite=True)
    if not succeeded:
        return None
    return verify_vcf(output_fpath, is_critical=True)
def _mongo(cnf, input_fpath):
    """Annotate a VCF using the ``VCFStore.jar`` Mongo loader.

    Returns the output path when the subprocess call yields a truthy
    result, otherwise ``None``.  Also returns ``None`` when ``mongo`` is not
    in ``cnf.annotation``.
    """
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    jar_path = join('ext_tools', 'mongo_loader', 'VCFStore.jar')
    executable = get_java_tool_cmdline(cnf, jar_path)
    output_fpath = intermediate_fname(cnf, input_fpath, 'mongo')

    template = ('{executable} -module annotation -inputFile {input_fpath} '
                '-outputFile {output_fpath} -project {project_name} ')
    cmdline = template.format(
        executable=executable, input_fpath=input_fpath,
        output_fpath=output_fpath, project_name=cnf.project_name)

    succeeded = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                stdout_to_outputfile=False,
                                exit_on_error=False)
    return output_fpath if succeeded else None
def _report_normalize_coverage_for_variant_sites(cnf, summary_threads, output_dir, samples, vcf_key, bed_fpath):
    """Placeholder for the combined normalized hotspot-coverage report.

    Only the step banner is printed.  The former implementation (per-sample
    depth collection for the variants of ``vcf_key`` and the "Best"/"Comb"
    combined reports) is disabled, so no report is produced.

    Returns ``(None, None)``.
    """
    banner = 'Combined normalized coverage for ' + vcf_key + ' hotspots'
    step_greetings(banner)

    # NOTE(review): the report generation that used to follow is disabled;
    # both report paths are therefore reported as missing.
    best_report_fpath, comb_report_fpath = None, None
    return best_report_fpath, comb_report_fpath
def summarize_targqc(cnf, summary_threads, output_dir, samples, bed_fpath=None, features_fpath=None, tag_by_sample=None):
    """Build the project-level TargQC summary for all samples.

    Writes the combined TargQC report and, when at least one sample has a
    detailed coverage TSV, the per-region "best" details.  With
    ``cnf.extended`` set, the capture and features BED files are also
    prepared (both are required, otherwise an error is logged).

    Returns the path of the HTML summary (may be falsy when nothing was
    generated).
    """
    step_greetings('TargQC coverage statistics for all samples')

    # Per-sample completeness checks are disabled: every sample is kept.
    samples = list(samples)

    txt_fpath, tsv_fpath, html_fpath = _make_tarqc_html_report(
        cnf, output_dir, samples, bed_fpath, tag_by_sample=tag_by_sample)

    has_detailed_tsv = any(
        verify_file(s.targetcov_detailed_tsv, silent=True) for s in samples)
    if has_detailed_tsv:
        best_for_regions_fpath = _save_best_details_for_each_gene(
            cnf.coverage_reports.depth_thresholds, samples, output_dir)
    else:
        best_for_regions_fpath = None

    # Planned but not implemented:
    #   1. best_regions = get_best_regions()
    #   2. best_for_regions_fpath = save_per_region_report()
    #   3. calc median coverage across best regions
    #   4. flagged_regions_report_fpath = _generate_flagged_regions_report(
    #          output_dir, 'Best', average_coverage, genes, depth_threshs)

    if cnf.extended:
        if features_fpath and bed_fpath:
            # Results are currently unused; the call is kept as is.
            features_bed, features_no_genes_cut_bed, target_bed, _ = prepare_beds(
                cnf, features_fpath, bed_fpath)
        else:
            err('For the extended analysis, capture and features BED files are required!')

    info()
    info('*' * 70)
    if html_fpath or txt_fpath:
        info('TargQC summary saved in: ')
        for fpath in (txt_fpath, html_fpath):
            if fpath:
                info(' ' + fpath)
    else:
        info('TargQC summary was not generated, because there were no reports generated for individual samples.')

    if best_for_regions_fpath:
        info()
        info('Best stats for regions saved in:')
        info(' ' + best_for_regions_fpath)

    return html_fpath
def make_report(cnf, vcf_fpath, sample):
    """Collect variant QC statistics from a VCF into a ``SampleReport``.

    Every record is counted in "Total with rejected".  All other counters
    only see records whose FILTER is empty or equal to ``'PASS'``: SNPs
    (with transitions/transversions), insertions, deletions, novel /
    dbSNP / Cosmic membership by record ID, and het/hom calls of the first
    sample.

    Returns the populated ``SampleReport``.
    """
    set_db_versions(cnf)

    step_greetings('Quality control reports')

    total_with_rejected = total = 0
    snps = inss = dels = 0
    dbsnps = cosmics = novels = 0
    hets = homs = 0
    transitions = transversions = 0

    with open_gzipsafe(vcf_fpath) as vcf:
        for idx, raw_rec in enumerate(vcf_parser.Reader(vcf)):
            rec = vcf_processing.Record(raw_rec, vcf_fpath, idx)
            total_with_rejected += 1

            passed = not rec.FILTER or rec.FILTER == 'PASS'
            if not passed:
                continue
            if rec.FILTER:
                warn('Warn: ' + rec.get_variant() + ' FILTER=' + str(rec.FILTER))
            total += 1

            # Variant class; multi-allelic sites only count where noted.
            if rec.is_snp:
                snps += 1
                if rec.is_transition:
                    transitions += 1
                elif len(rec.ALT) == 1:
                    transversions += 1
            elif rec.is_indel:
                if rec.is_deletion:
                    dels += 1
                elif len(rec.ALT) == 1:
                    inss += 1

            # Database membership is derived from the record ID prefixes.
            if rec.ID:
                rec_ids = rec.ID
                if isinstance(rec_ids, basestring):
                    rec_ids = [rec_ids]
                if any(rec_id.startswith('COS') for rec_id in rec_ids):
                    cosmics += 1
                if any(rec_id.startswith('rs') for rec_id in rec_ids):
                    dbsnps += 1
            else:
                novels += 1

            # Zygosity of the first sample column.
            first_call = rec.samples[0]
            if first_call.called:
                if first_call.gt_type == 1:
                    hets += 1
                elif first_call.gt_type == 2:
                    homs += 1

    records = [
        ('Total variants', total),
        ('SNPs', snps),
        ('Insertions', inss),
        ('Deletions', dels),
        ('Novel', novels),
        ('Novel, %', 1.0 * novels / total if total else None),
        ('In dbSNP', dbsnps),
        ('In dbSNP, %', 1.0 * dbsnps / total if total else None),
        ('In Cosmic', cosmics),
        ('In Cosmic, %', 1.0 * cosmics / total if total else None),
        ('Het/hom', float(hets) / homs if homs != 0 else None),
        ('Ti/tv', float(transitions) / transversions if transversions != 0 else None),
        ('Total with rejected', total_with_rejected),
    ]

    report = SampleReport(sample, metric_storage=metric_storage)
    for metric_name, metric_value in records:
        report.add_record(metric_name, metric_value)
    return report
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    """Annotate a VCF from a database VCF using ``snpsift annotate``.

    The database path is taken from ``cnf['genome'][dbname]`` or, failing
    that, from ``vcf_conf['path']``.  Unless ``cnf.no_check`` is set,
    previously present annotations with the same names are removed first
    and the SnpSift output is cleaned up afterwards (spaces in record lines
    become underscores, ``##INFO`` lines are normalised, record lines seen
    before ``#CHROM`` are dropped).

    Returns the verified output path, ``None`` when no database is
    configured or no path can be found, or the falsy result of the
    subprocess call when it fails.
    """
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    snpsift = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please, provide a path to ' + dbname + ' in the "genomes" section in the system config. The config is: ' + str(cnf['genome']))
            return None
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def _drop_old_annotations(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, _drop_old_annotations, suffix='d')

    anno_line = ('-info ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        executable=snpsift, anno_line=anno_line,
        db_path=db_path, input_fpath=input_fpath)
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False, overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
                ID=(?P<id>[^,]+),\s*
                Number=(?P<number>-?\d+|\.|[AG]),\s*
                Type=(?P<type>Integer|Float|Flag|Character|String),\s*
                Description="(?P<desc>[^"]*)"
                >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                # Record lines before the #CHROM header are dropped.
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line
                return line

            if not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
                return line

            if line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'),
                        m.group('type'), m.group('desc'))
            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift,
                                    suffix='fx', ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
def make_report_metadata(cnf, bcbio_structure, oncoprints_link=None, circos_link=None):
    """Write the MultiQC metadata for the project-level report.

    The metadata is built by ``_report_to_multiqc_metadata`` and dumped
    twice into ``bcbio_structure.work_dir``: as
    ``az_multiqc_metadata.yaml`` and as ``az_multiqc_metadata.json``.  When
    the project has normal samples, a JavaScript snippet that highlights a
    sample's matched normal on hover is attached as additional data.

    Returns the path of the JSON file.
    """
    report_kind = 'preproc' if bcbio_structure is None else 'postproc'
    step_greetings('Making the %s project-level report' % report_kind)

    general_records = _add_summary_reports(
        cnf, metric_storage.general_section, bcbio_structure)

    full_report = FullReport(cnf.project_name, [],
                             metric_storage=metric_storage,
                             general_records=general_records)

    project_report_html_fpath = bcbio_structure.multiqc_fpath
    project_name = bcbio_structure.project_name

    additional_data = dict()
    has_normals = any(s.phenotype == 'normal' for s in bcbio_structure.samples)
    if has_normals:
        def _hover_handler(s, event, color):
            # One JS line recolouring the matched normal's element.
            return ('\tdocument.getElementById("' + s.name + '_match").' + event +
                    ' = function() { document.getElementById("' + s.normal_match.name +
                    '").style.backgroundColor = "' + color + '"; };\n')

        js_parts = ['<script type="text/javascript">\n']
        for s in bcbio_structure.samples:
            if s.phenotype != 'normal' and s.normal_match:
                js_parts.append(_hover_handler(s, 'onmouseover', '#EEE'))
                js_parts.append(_hover_handler(s, 'onmouseleave', 'white'))
        js_parts.append('</script>\n')
        additional_data['sample_match_on_hover_js'] = ''.join(js_parts)

    metadata = _report_to_multiqc_metadata(
        cnf, full_report, project_report_html_fpath, project_name,
        bcbio_structure, additional_data=additional_data,
        oncoprints_link=oncoprints_link, circos_link=circos_link)

    yaml_fpath = join(bcbio_structure.work_dir, 'az_multiqc_metadata.yaml')
    json_fpath = yaml_fpath.replace('.yaml', '.json')

    import yaml
    with open(yaml_fpath, 'w') as yaml_out:
        yaml.dump(metadata, yaml_out, default_flow_style=False)

    import json
    with open(json_fpath, 'w') as json_out:
        json.dump(metadata, json_out)

    return json_fpath
def _snpeff(cnf, input_fpath):
    """Annotate a VCF with snpEff.

    Returns a ``(vcf_fpath, stats_fpath, genes_fpath)`` triple, or
    ``(None, None, None)`` when snpEff is not configured, previous
    annotations cannot be removed, or the snpEff run does not succeed.

    The snpEff call is retried (waiting one minute between attempts) when
    launching it raises ``OSError``.
    """
    import time
    import traceback

    failure = None, None, None

    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return failure

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    caller_suffix = ('-' + cnf.caller) if cnf.caller else ''
    stats_fpath = join(cnf.work_dir,
                       cnf.sample + caller_suffix + '.snpEff_summary.csv')
    genes_fpath = splitext(stats_fpath)[0] + '.genes.txt'

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, genes_fpath

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    # Map genome build names onto snpEff database names.
    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath, 'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' + str(cnf.resources.snpeff.config))

    # NOTE(review): cnf.annotation.snpeff.extra_options was read here but
    # only an empty string was ever appended, so the configured options were
    # never passed to snpEff.  Left unapplied; confirm the intent.

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        cleaned_fpath = remove_prev_eff_annotation(cnf, input_fpath)
        if not cleaned_fpath:
            err('Could not remove preivous snpEff annotations')
            return failure
        input_fpath = cleaned_fpath

    if get_snpeff_type(snpeff) == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        snpeff=snpeff, opts=opts, ref_name=ref_name, input_fpath=input_fpath)

    # Start from a clean result: previously a leftover value from the
    # annotation-removal step (or no value at all) could be inspected when
    # every attempt raised OSError.
    res = None
    for attempt in range(1, 20):
        try:
            res = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True, overwrite=True)
        except OSError:
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(attempt))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    # snpEff drops an HTML summary into the cwd; clean it up (best effort).
    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath + ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, genes_fpath
    return failure
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    """Publish study data to the Data Query Tool and return an Oncoprints URL.

    Writes ``<study>.data.txt`` and ``<study>.info.txt`` into the Data Query
    directory, symlinks them into the exposed ``DataQueryTool`` directory,
    registers the study in ``DataQuery.properties`` and builds the query
    URL.  Also sets ``cnf.mutations_fpath``, ``cnf.seq2c_tsv_fpath`` and
    ``cnf.project_name``.

    Returns the URL, or ``None`` when not running in the US location, no
    variant calling was done, vardict is not among the callers, the Data
    Query directory is missing, or there are no altered genes.
    """
    on_us_server = is_us()
    if not on_us_server:
        # Only the US location is supported; elsewhere nothing is generated.
        return None
    loc = exposing.us

    if not bcbio_structure.variant_callers:
        info('No varianting calling performed, not generating Oncoprints')
        return None

    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, this not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')

    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exists.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(
        caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)
    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    # Raw string: same pattern as before, without invalid string escapes.
    study_name = re.sub(r'[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath,
                                   samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath +
            ', not generating Oncoptints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional: expose the files next to the Data Query Tool
    make_symlink = symlink_to_ngs if on_us_server else local_symlink
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    make_symlink(data_ext_fpath, data_symlink)
    make_symlink(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath,
                              data_ext_fpath, info_ext_fpath)

    # Genes are separated by URL-encoded CRLF.
    genes = '%0D%0A'.join(altered_genes)
    query = ('DataQuery.pl?'
             'analysis=oncoprint&'
             'study={study_name}&'
             'gene={genes}&'
             'order=on&'
             'freq=50&'
             'nocheckgenes=true&'
             'submit=Submit').format(study_name=study_name, genes=genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', query)

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
def run_annotators(cnf, vcf_fpath, bam_fpath):
    """Run every configured annotation step on a VCF and return the final path.

    Steps, in order (each one is applied only when its section is present in
    ``cnf.annotation`` and the step returns a truthy output path):

    1. bgzip + tabix the input if needed, then strip the ID column with
       ``bcftools annotate --remove ID``.
    2. ``bcftools annotate`` against dbsnp, clinvar, cosmic, oncomine (unless
       the section sets ``skip-annotation``) and against any ``custom_vcfs``.
    3. ``_snpsift_db_nsfp`` (``dbnsfp`` section).
    4. ``_snpeff`` (``snpeff`` section); its summary and genes files are moved
       into ``cnf.output_dir``.
    5. ``_tracks`` for every track listed in ``tracks``.
    6. ``intersect_vcf`` for every entry of ``intersect_with``.
    7. ``_mongo`` (``mongo`` section).

    :param cnf: run configuration; ``annotation``, ``genome`` and
        ``output_dir`` are read.
    :param vcf_fpath: path to the input VCF (plain or bgzipped).
    :param bam_fpath: not used by this function; kept for the callers'
        interface.
    :return: path to the VCF produced by the last successful step.
    """
    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    # Drop the existing ID column up front so that the dbsnp/cosmic steps
    # below are the only source of IDs.
    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'.format(
        bcftools=bcftools, vcf_fpath=vcf_fpath)
    res = call(cnf, cmdl, output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = bgzip_and_tabix(cnf, res)

    # list() keeps the concatenation valid for dict views as well as lists;
    # a missing or empty 'custom_vcfs' section simply contributes nothing.
    custom_vcfs = cnf.annotation.get('custom_vcfs') or {}
    db_sections = list(db_section_by_name.items()) + list(custom_vcfs.items())

    for dbname, dbconf in db_sections:
        step_greetings('Annotating using ' + dbname)

        columns = ['INFO/' + a for a in (dbconf.get('annotations') or [])]
        if dbname in ('cosmic', 'dbsnp'):
            # These two databases also provide the ID column.
            columns.append('=ID')
        annotations = ','.join(columns)

        db_fpath = get_db_path(cnf, dbconf, dbname)
        # With no columns to transfer the '-c' option would have no value,
        # so the bcftools call is skipped in that case.
        if db_fpath and annotations:
            cmdl = '{bcftools} annotate -a {db_fpath} -c {annotations} {vcf_fpath}'.format(
                bcftools=bcftools, db_fpath=db_fpath,
                annotations=annotations, vcf_fpath=vcf_fpath)
            res = call(cnf, cmdl, output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = bgzip_and_tabix(cnf, res)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)

            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            # Remove stale copies first so the moves below cannot collide
            # with files left by a previous run.
            if isfile(final_summary_fpath):
                os.remove(final_summary_fpath)
            if isfile(final_genes_fpath):
                os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        # A track is either a path to an existing file or a name that is
        # looked up in the genome's 'tracks' section.
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            elif ('tracks' in cnf['genome'] and cnf['genome']['tracks']
                    and track_name in cnf['genome']['tracks']):
                track_fpath = cnf['genome']['tracks'][track_name]
                if verify_file(track_fpath):
                    track_fpaths.append(track_fpath)

        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf, input_fpath=vcf_fpath, db_fpath=db_fpath, key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath