def make_key_genes_cov_report(experiment_by_key):
    """Build a per-gene coverage report covering every experiment.

    For each experiment three columns are declared (average depth, percent
    covered at the experiment's depth cutoff, CNV call); then one row is added
    per key gene, sorted by gene name.

    :param experiment_by_key: ordered mapping of experiment key -> experiment.
    :return: the populated PerRegionSampleReport.
    """
    info('Making key genes coverage report...')

    metrics = [
        Metric('Gene'),
        Metric('Chr', with_heatmap=False, max_width=20, align='right'),
    ]
    for idx, (key, experiment) in enumerate(experiment_by_key.items()):
        cutoff_label = '{}x'.format(experiment.depth_cutoff)
        metrics.append(Metric(
            key + ' Ave depth',
            short_name=key + '\nave depth',
            med=experiment.ave_depth,
            class_='shifted_column' if idx == 0 else ''))
        metrics.append(Metric(
            key + ' % cov at ' + cutoff_label,
            short_name='% at ' + cutoff_label,
            unit='%', med=1, low_inner_fence=0.5, low_outer_fence=0.1))
        # short name is hack for IE9 who doesn't have "text-align: left"
        # and tries to stick "CNV" to the previous col header
        metrics.append(Metric(key + ' CNV', short_name=' CNV'))

    metric_storage = MetricStorage(sections=[ReportSection(metrics=metrics)])
    report = PerRegionSampleReport(
        sample=experiment_by_key.values()[0].sample,
        metric_storage=metric_storage)

    # Writing records: gene name -> experiment -> gene record of that experiment.
    genes_by_name = OrderedDefaultDict(OrderedDict)
    for experiment in experiment_by_key.values():
        for gene in experiment.key_gene_by_name.values():
            genes_by_name[gene.name][experiment] = gene

    for _, hit_by_experiment in sorted(genes_by_name.items(), key=lambda item: item[0]):
        # Any available record serves as the source of the gene name/chromosome.
        gene = next((h for h in hit_by_experiment.values() if h is not None), None)
        row = report.add_row()
        row.add_record('Gene', gene.name)
        row.add_record('Chr', gene.chrom.replace('chr', ''))

        for experiment, hit in hit_by_experiment.items():
            prefix = experiment.key
            row.add_record(prefix + ' Ave depth', hit.ave_depth)

            cov_metric = metric_storage.find_metric(
                prefix + ' % cov at {}x'.format(experiment.depth_cutoff))
            cov_metric_name = cov_metric.name
            cov_at_cutoff = None
            for cutoff, cov in hit.cov_by_threshs.items():
                if cutoff == experiment.depth_cutoff:
                    cov_at_cutoff = cov
                    break
            row.add_record(cov_metric_name, cov_at_cutoff)

            if hit.seq2c_event and (hit.seq2c_event.is_amp() or hit.seq2c_event.is_del()):
                row.add_record(
                    prefix + ' CNV',
                    hit.seq2c_event.amp_del + ', ' + hit.seq2c_event.fragment)

    return report
def main():
    """Command-line entry point: draw the Seq2C plot for one sample.

    Reads options/configs, optionally loads a list of key gene names (one per
    line, blank lines ignored) and delegates to draw_seq2c_plot.
    """
    extra_opts = [
        (['--seq2c-results'], dict(dest='seq2c_tsv_fpath')),
        (['--key-genes'], dict(dest='key_genes_fpath')),
    ]
    cnf = read_opts_and_cnfs(
        description='Plotting Seq2C results.',
        extra_opts=extra_opts,
        required_keys=['seq2c_tsv_fpath', 'output_dir'],
        # NOTE(review): 'key_genes' does not match the 'key_genes_fpath' dest
        # declared above -- confirm which key read_opts_and_cnfs expects.
        file_keys=['seq2c_tsv_fpath', 'key_genes'],
        key_for_sample_name=None,
    )

    check_system_resources(cnf)
    check_genome_resources(cnf)

    key_gene_names = None
    if cnf.key_genes_fpath:
        with open(cnf.key_genes_fpath) as genes_file:
            stripped = (line.strip() for line in genes_file.readlines())
            key_gene_names = set(name for name in stripped if name != '')

    plot_fpath = draw_seq2c_plot(
        cnf, cnf.seq2c_tsv_fpath, cnf.sample, cnf.output_dir, key_gene_names)
    if plot_fpath:
        info('Saved plot to ' + plot_fpath)
def add_data_query_properties(cnf, study_name, properties_fpath, data_fpath, info_fpath):
    """Register a study in the Data Query properties file, rewriting it in place.

    The study name is appended to the ``studies=`` list when missing. Inside
    each of the ``study2desc`` / ``study2data`` / ``study2info`` sections any
    existing ``<study_name> => ...`` entry is dropped and a fresh entry is
    inserted right before the closing ``}`` of the section.

    :param cnf: config object; ``cnf.work_dir`` is handed to file_transaction.
    :param study_name: study identifier used as the key of the entries.
    :param properties_fpath: path of the properties file (read, then replaced).
    :param data_fpath: value written for the ``study2data`` entry.
    :param info_fpath: value written for the ``study2info`` entry.
    """
    properties_lines = []
    text_to_add = None  # entry pending insertion before the next '}'

    # Iterate the file object instead of read().split('\n'): the handle is
    # closed deterministically and no phantom empty line is produced after the
    # final newline (which used to add one blank line per rewrite).
    with open(properties_fpath) as inp:
        for raw in inp:
            l = raw.strip()

            if 'studies=' in l:
                studies = l.split('=')[1].split(';')
                if study_name not in studies:
                    l += ';' + study_name

            # Section headers: remember what has to be added to this section.
            if 'study2desc' in l:
                text_to_add = '{study_name} => "{study_name}", \\'.format(study_name=study_name)
            if 'study2data' in l:
                text_to_add = '{study_name} => "{data_fpath}", \\'.format(
                    study_name=study_name, data_fpath=data_fpath)
            if 'study2info' in l:
                text_to_add = '{study_name} => "{info_fpath}", \\'.format(
                    study_name=study_name, info_fpath=info_fpath)

            # Stale entry for this study: drop it, a fresh one is added below.
            if study_name + ' => ' in l:
                info(l.strip() + ' already present in properties, removing it.')
                continue

            if l == '}' and text_to_add and text_to_add not in properties_lines:
                properties_lines.append(text_to_add)
                text_to_add = None

            properties_lines.append(l)

    with file_transaction(cnf.work_dir, properties_fpath) as tx:
        with open(tx, 'w') as out:
            for l in properties_lines:
                out.write(l + '\n')
def function_timer(*args, **kwargs):
    """Call the wrapped ``function`` and log how long the call took.

    All arguments are forwarded unchanged and the wrapped function's result is
    returned. Nothing is logged when the wrapped call raises.
    """
    started = time.time()
    result = function(*args, **kwargs)
    elapsed = time.time() - started
    # ``func_name`` exists only on Python 2 function objects; ``__name__`` is
    # the portable spelling, and the repr() fallback covers callables that have
    # no name at all (e.g. functools.partial objects).
    name = getattr(function, '__name__', repr(function))
    info('Total time running %s: %s seconds' % (name, str(elapsed)))
    return result
def proc_args():
    """Log the command line and parse the parameters of the Seq2C summary script.

    :return: tuple ``(cnf, bcbio_structure)`` produced by
        bcbio_summary_script_proc_params.
    """
    info(' '.join(sys.argv))
    info()

    # The capture BED can be passed under any of three option names.
    bed_option = (['--bed', '--capture', '--amplicons'], dict(dest='bed'))
    cnf, bcbio_structure = bcbio_summary_script_proc_params(
        BCBioStructure.seq2c_name,
        extra_opts=[bed_option],
    )
    return cnf, bcbio_structure
def parse_seq2c(seq2c_tsv_fpath, altered_genes, key_gene_by_name_chrom):
    """Collect Seq2C amplification/deletion events for key genes, per sample.

    :param seq2c_tsv_fpath: path to the Seq2C TSV; when empty or not verified
        the function returns immediately with no events.
    :param altered_genes: set of gene names, updated in place with every gene
        that has an event.
    :param key_gene_by_name_chrom: container of ``(gene, chrom)`` pairs; rows
        for other genes are ignored.
    :return: ``(events_by_sample, altered_genes)`` where ``events_by_sample``
        is a defaultdict(list) of OncoprintSeq2CEvent keyed by sample name.
    :raises ValueError: when a non-blank data row for a key gene has fewer
        than 17 columns.
    """
    seq2c_events_by_sample = defaultdict(list)
    if not seq2c_tsv_fpath or not verify_file(seq2c_tsv_fpath):
        return seq2c_events_by_sample, altered_genes

    info('Parsing Seq2C from ' + seq2c_tsv_fpath)
    # The path was verified by the guard above, no need to verify it again.
    with open(seq2c_tsv_fpath) as f_inp:
        for i, l in enumerate(f_inp):
            if i == 0:  # header
                continue
            if not l.strip():
                # Blank (e.g. trailing) lines carry no record; without this
                # check they fail with IndexError on the column access below.
                continue
            fs = l.replace('\n', '').split('\t')
            sname, gname, chrom = fs[0], fs[1], fs[2]
            if (gname, chrom) not in key_gene_by_name_chrom:
                continue

            # Full column layout of a Seq2C row.
            sname, gname, chrom, start, end, length, log2r, sig, fragment, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]
            if not amp_del:
                continue

            if fragment == 'BP':
                # Breakpoint: only part of the gene's segments are affected.
                exons = str(ab_seg) + ' of ' + total_seg
                copy_number = round(2 ** float(ab_log2r) * 2, 2)
                if copy_number > 2:
                    copy_number = round(copy_number, 1)
            else:
                exons = 'Whole'
                if amp_del == 'Amp':
                    ab_log2r = copy_number = 'AMP'
                else:
                    ab_log2r = copy_number = 'HOMDEL'

            cnv_type = 'amplification' if amp_del == 'Amp' else 'loss'
            event = OncoprintSeq2CEvent(
                gene=gname, copy_number=str(copy_number), exons=exons,
                ratio=ab_log2r, cnv_type=cnv_type)
            seq2c_events_by_sample[sname].append(event)
            altered_genes.add(gname)

    return seq2c_events_by_sample, altered_genes
def get_rejected_mutations(cnf, bs, key_gene_by_name_chrom, genes_collection_type):
    """Load the rejected mutations of a project, indexed per sample.

    Every existing "rejected" companion of the project's PASS mutations file is
    parsed with parse_mutations; the results are then re-keyed.

    :return: defaultdict mapping sample -> ``{(gene name, position): mutation}``.
    """
    by_sample = defaultdict(list)

    pass_mutations_fpath, _ = get_mutations_fpath_from_bs(bs)
    for fpath in get_rejected_mutations_fpaths(pass_mutations_fpath):
        if not verify_file(fpath, silent=True):
            continue
        info('Parsing rejected mutations from ' + str(fpath))
        # parse_mutations fills ``by_sample`` in place.
        parse_mutations(cnf, None, key_gene_by_name_chrom, fpath,
                        genes_collection_type, mutations_dict=by_sample)

    rejected = defaultdict(dict)
    for sample, sample_mutations in by_sample.items():
        for mutation in sample_mutations:
            rejected[sample][(mutation.gene.name, mutation.pos)] = mutation
    return rejected
def main():
    """Command-line entry point: run the FastQC preprocessing script remotely.

    Parses the options, maps the local paths with ``_proc_path``, builds the
    ``fastqc.py`` command line and executes it over SSH, echoing the remote
    output through ``err``.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    # NOTE(review): the help text advertises a default of 1 million while the
    # option default here is None (the flag is simply not forwarded) -- confirm
    # what the remote script defaults to.
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
        help='Downsample reads to avoid excessive processing times with large files. '
             'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()
        # Stop here: continuing would build a remote command from missing paths.
        sys.exit(1)

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir else critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath = \
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration.
    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net', username='******', password='******')
    try:
        fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
        # The script runs on the remote host: swap local install paths for remote ones.
        fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
        fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

        cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
        if opts.sample_name:
            cmdl += ' --sample {opts.sample_name}'
        if opts.downsample_to:
            cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
        # NOTE(review): values are interpolated unquoted into a shell command.
        cmdl = cmdl.format(
            fastqc_py=fastqc_py,
            left_reads_fpath=left_reads_fpath,
            right_reads_fpath=right_reads_fpath,
            output_dirpath=output_dirpath,
            opts=opts)
        cmdl += ' 2>&1'
        info(cmdl)
        stdin, stdout, stderr = ssh.exec_command(cmdl)
        for l in stdout:
            err(l, ending='')
        info()
    finally:
        # Release the connection even when building or running the command fails.
        ssh.close()
def make_gene_expression_heatmaps(cnf, bcbio_structure, counts_fpath, genes_dict,
                                  report_fpath, report_name, keep_gene_names=False):
    """Render the expression-heatmap HTML report for one counts file.

    :param counts_fpath: counts table, parsed with parse_gene_counts.
    :param genes_dict: not used by this function.
    :param report_fpath: path of the HTML report to write.
    :param report_name: caption of the report; its lower-cased-initial form is
        passed to parse_gene_counts for log messages.
    :param keep_gene_names: forwarded to parse_gene_counts.
    :return: None.
    """
    stats_name = report_name[0].lower() + report_name[1:]
    gene_counts, samples = parse_gene_counts(counts_fpath, None, stats_name, keep_gene_names)
    report = make_expression_heatmap(bcbio_structure, gene_counts)

    counts_fname = basename(counts_fpath)
    # NOTE(review): the link is computed relative to the report *file* path,
    # not its directory -- confirm that this is what the template expects.
    counts_url = relpath(counts_fpath, report_fpath)
    link_template = (
        'Showing genes with count >=' + str(HEATMAPS_MIN_COUNT) + ' at least in one sample. ' +
        'The full results can be downloaded from here: <a href="{counts_url}" target="_blank">{counts_fname}</a>')
    counts_link = link_template.format(counts_url=counts_url, counts_fname=counts_fname)

    module_dirpath = dirname(abspath(__file__))
    static_dirpath = join(module_dirpath, 'static')
    BaseReport.save_html(
        report, cnf, report_fpath,
        caption=report_name,
        extra_js_fpaths=[join(static_dirpath, 'rnaseq_heatmaps.js')],
        extra_css_fpaths=[join(static_dirpath, 'rnaseq.css')],
        tmpl_fpath=join(module_dirpath, 'template.html'),
        data_dict={'file_link': counts_link})

    info(report_name + ' heatmap saved in ' + report_fpath)
def parse_gene_counts(counts_fpath, key_gene_names, report_name, keep_gene_names):
    """Read a tab-separated counts table into per-gene Counts records.

    The header must contain a ``HUGO`` column; the columns between the first
    one and ``HUGO`` are taken as samples. Rows where every sample is below
    HEATMAPS_MIN_COUNT are skipped.

    :param counts_fpath: path to the counts table.
    :param key_gene_names: optional container of gene names to keep; when
        falsy, all genes are kept.
    :param report_name: label used in log messages only.
    :param keep_gene_names: when true, each record is named by the value of
        the first column and marked as a hidden row.
    :return: ``(gene_counts, samples)``: a defaultdict(list) of Counts keyed by
        gene name and the list of sample names. When the file cannot be
        verified both are empty, so callers can always unpack two values.
    """
    gene_counts = defaultdict(list)
    samples = []

    info('Preparing ' + report_name + ' stats for expression heatmaps')
    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        # Same shape as the success path (a bare [] broke tuple unpacking).
        return gene_counts, samples

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    gene_col = None

    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                # Sample columns start right after the first (id) column.
                samples_cols = dict((sample, col) for col, sample in enumerate(samples, 1))
                continue

            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue

            gene_expression_dict = dict()
            for sample, col in samples_cols.items():
                value = float(fs[col])  # parsed once per cell
                # Whole numbers are stored as int, everything else as float.
                gene_expression_dict[sample] = int(value) if value.is_integer() else value
            if all(v < HEATMAPS_MIN_COUNT for v in gene_expression_dict.values()):
                continue

            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  # exon number
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id

            gene = Counts(name, gene_name=gene_name, counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)

    return gene_counts, samples
def main():
    """Command-line entry point: downsample a pair of FASTQ files.

    Parses the options, builds the Config, checks the inputs and the output
    directory, then calls downsample inside the working directory context.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(description='This script runs preprocessing.')
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('--suffix', dest='suffix', default='subset', help='Output files suffix')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    # NOTE(review): the help text says the default is 1 million, the actual
    # default below is 5e5 -- confirm which one is intended.
    downsample_help = ('Downsample reads to avoid excessive processing times with large files. '
                       'Default is 1 million. Set to 0 to turn off downsampling.')
    parser.add_option('--downsample-to', dest='downsample_to', default=5e5, type='int',
                      help=downsample_help)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    opts, _ = parser.parse_args()
    logger.is_debug = opts.debug
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    if opts.right_reads_fpath:
        right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)
    else:
        right_reads_fpath = None

    if opts.output_dir:
        output_dirpath = adjust_path(opts.output_dir)
    else:
        output_dirpath = critical('Please, specify output directory with -o')
    safe_mkdir(output_dirpath)
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    with workdir(cnf):
        info('Downsampling to ' + str(cnf.downsample_to))
        downsample(cnf, cnf.sample_name, left_reads_fpath, right_reads_fpath,
                   cnf.downsample_to, output_dir=cnf.output_dir, suffix=cnf.suffix)
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    """Compute depth at mutation positions for every experiment via grid jobs.

    Experiments are processed in rounds: each round submits jobs until
    ``cnf.threads`` of them are pending, waits for them, and repeats until
    every experiment has been handled.

    :param cnf: config; ``work_dir``, ``reuse_intermediate`` and ``threads``
        are read.
    :param infos_by_key: mapping ``(group, uniq_key) -> experiment``.
    :param mut_bed_fpath: BED file passed to sambamba_depth.
    :return: dict ``experiment -> output file path``. Experiments without a
        BAM are reported with ``err`` and skipped; their path is still listed
        but no file is produced for them.
    """
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = list(infos_by_key.values())

    while not_submitted_experiments:
        jobs_to_wait = []
        handled_experiments = []  # submitted, reused or skipped in this round

        for (group, uniq_key), e in infos_by_key.items():
            if e not in not_submitted_experiments:
                continue

            sambamba_output_fpath = join(cnf.work_dir, uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath, silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                handled_experiments.append(e)
                continue

            if not e.sample.bam:
                err('Sample ' + e.sample.name + ' in ' + str(group) + ', ' + str(uniq_key) + ' has no BAM')
                # Mark as handled: otherwise the experiment stays pending
                # forever and the outer loop never terminates.
                handled_experiments.append(e)
                continue

            j = sambamba_depth(cnf, mut_bed_fpath, e.sample.bam, output_fpath=sambamba_output_fpath,
                               only_depth=True, silent=True, use_grid=True)
            handled_experiments.append(e)
            if not j.is_done:
                jobs_to_wait.append(j)
            if len(jobs_to_wait) >= cnf.threads:
                break  # round is full; the rest goes to the next round

        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')

        not_submitted_experiments = [
            e for e in not_submitted_experiments if e not in handled_experiments]

    return sambamba_output_by_experiment
def run_fastq(cnf, sample_name, l_r_fpath, r_r_fpath, output_dirpath, downsample_to=1e7):
    """Run FastQC on the (optionally downsampled) reads of one sample.

    Both read files are concatenated into ``<work_dir>/<sample_name>.fq`` and
    FastQC is run on the combined file.

    :param l_r_fpath: left reads FASTQ (possibly gzipped).
    :param r_r_fpath: right reads FASTQ (possibly gzipped).
    :param output_dirpath: directory passed to FastQC as its output location.
    :param downsample_to: number of reads to downsample to; falsy disables it.
    :return: path of the extracted ``<sample_name>.fq_fastqc`` directory.
    """
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    # Files that are actually fed to FastQC; replaced by the downsampled ones
    # below (previously the downsampling result was computed but never used).
    l_fpath, r_fpath = l_r_fpath, r_r_fpath
    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        l_fpath, r_fpath = downsample(cnf, sample_name, l_r_fpath, r_r_fpath, downsample_to,
                                      output_dir=cnf.work_dir)

    # Joining fastq files to run on a combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        for fpath in (l_fpath, r_fpath):
            inp = open_gzipsafe(fpath)
            try:
                # Stream line by line instead of loading the whole file.
                for line in inp:
                    out.write(line)
            finally:
                inp.close()

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = '{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} -f fastq -j {java} {fastqc_fpath}'.format(
        fastqc=fastqc, tmp_dirpath=tmp_dirpath, output_dirpath=output_dirpath,
        java=java, fastqc_fpath=fastqc_fpath)
    call(cnf, cmdline)

    # Cleaning and getting report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)

    return sample_fastqc_dirpath
def run_clinical_target2wgs(cnf, wgs_bs, trg_bs, shared_sample_names, output_dirpath):
    """Build a combined target-vs-WGS clinical report for every shared sample.

    For each name in ``shared_sample_names`` the matching sample is looked up
    in both bcbio structures, its clinical info is collected for each, and the
    pair is passed to run_sample_combine_clinreport.
    """
    info('Running clinical reporting comparison')
    dashes = '-' * 70
    stars = '*' * 70

    for sname in shared_sample_names:
        info('Preparing ' + sname + '...')
        trg_sample = next(s for s in trg_bs.samples if s.name == sname)
        wgs_sample = next(s for s in wgs_bs.samples if s.name == sname)

        info(dashes)
        clin_trg_info = clinical_sample_info_from_bcbio_structure(
            cnf, trg_bs, trg_sample, is_target2wgs_comparison=True)
        info('')

        info(dashes)
        clin_wgs_info = clinical_sample_info_from_bcbio_structure(
            cnf, wgs_bs, wgs_sample, is_target2wgs_comparison=True)
        info('')

        info(stars)
        infos_by_key = {'Target': clin_trg_info, 'WGS': clin_wgs_info}
        run_sample_combine_clinreport(cnf, infos_by_key, output_dirpath, is_target2wgs=True)
        info(stars)

    info('Successfully finished.')
def parse_sv_files(cnf, samples, altered_genes, key_gene_by_name_chrom):
    """Parse the prioritized SV files of the samples into key-gene annotations.

    :param cnf: config; ``cnf.transcripts_fpath`` is read, and ``cnf`` is
        passed to get_chrom_order.
    :param samples: objects providing ``find_sv_fpath()``; samples for which it
        returns a falsy value are ignored.
    :param altered_genes: set of gene names, updated in place.
    :param key_gene_by_name_chrom: container of ``(gene, chrom)`` pairs used to
        pick the key genes of an annotation.
    :return: ``(annotations_by_sample, altered_genes)``. When no SV file is
        found, the first element is an empty defaultdict(set) instead of a
        dict of annotation collections.
    """
    sv_events_by_samples = defaultdict(set)
    sv_fpaths = [sample.find_sv_fpath() for sample in samples]
    sv_fpaths = [f for f in sv_fpaths if f]
    if not sv_fpaths:
        return sv_events_by_samples, altered_genes

    chr_order = get_chrom_order(cnf)
    # (sample, event id) -> event; used below to resolve mate events.
    all_events = dict()
    with open(cnf.transcripts_fpath) as f:
        transcripts = [tr.strip() for tr in f]

    for sv_fpath in sv_fpaths:
        info('Parsing prioritized SV from ' + sv_fpath)
        sample_col = None
        known_col = None  # not used: the 'known' column lookup is disabled below
        with open(sv_fpath) as f:
            header_rows = []
            for i, l in enumerate(f):
                fs = l.strip().split('\t')
                if i == 0:
                    # Older layout (disabled):
                    # header_rows = fs  # caller sample chrom start end svtype known end_gene lof annotation split_read_support paired_end_support
                    header_rows = fs  # caller sample chrom start end svtype lof annotation split_read_support paired_support_PE paired_support_PR
                    sample_col = header_rows.index('sample')
                    # known_col = header_rows.index('known')
                else:
                    # Row fields are passed to the parser as keyword arguments
                    # named after the header columns.
                    event = SVEvent.parse_sv_event(chr_order, key_gene_by_name_chrom, transcripts, **dict(zip(header_rows, fs)))
                    sample = fs[sample_col]
                    if event:
                        all_events[(sample, event.id)] = event
                        for annotation in event.annotations:
                            if event.is_fusion() or event.is_known_fusion(annotation) or annotation.effect == 'EXON_DEL':
                                if event.end:
                                    # The event has an end coordinate: its second
                                    # chromosome is set to its own chromosome.
                                    event.chrom2 = event.chrom
                                if event.is_known_fusion(annotation):
                                    annotation.known = True
                                key_altered_genes = [g for g in annotation.genes if (g, event.chrom) in key_gene_by_name_chrom]
                                # Keep only fusions, exon deletions and known
                                # events that touch at least one key gene.
                                if (annotation.effect == 'FUSION' or annotation.effect == 'EXON_DEL' or annotation.known) \
                                        and key_altered_genes:
                                    annotation.event = event
                                    event.key_annotations.add(annotation)
                                    # event.supplementary = '-with-' in fs[known_col]
                                    sv_events_by_samples[sample].add(event)
                                    for g in key_altered_genes:
                                        altered_genes.add(g)

    sv_anns_by_samples = dict()
    for sample, events in sv_events_by_samples.iteritems():
        # Disabled: combining the two annotations of a fusion into one via
        # "supplementary" events.
        # suppl_events = {e.mate_id: e for e in events if e.supplementary}
        # main_events = [e for e in events if not e.supplementary]
        # for event in main_events:
        #     if event.id not in suppl_events:
        #         continue
        #     suppl_event = suppl_events[event.id]
        #     event.end = suppl_event.chrom + ':' + str(suppl_event.start)
        sv_anns_by_key = OrderedDefaultDict(SVEvent.Annotation)
        for event in events:
            if not event.end and event.mate_id:
                # No end coordinate: take it from the mate event of this sample
                # (raises KeyError if the mate was not parsed).
                event_mate = all_events[(sample, event.mate_id)]
                event.end = event_mate.start
                event.chrom2 = event_mate.chrom
            for an in event.key_annotations:
                # Annotations sharing a key are merged into one record.
                sv_anns_by_key[an.get_key()].update_annotation(an)

        sv_anns_by_samples[sample] = sv_anns_by_key.values()

    return sv_anns_by_samples, altered_genes
def parse_mutations(mutations_fpath, altered_genes, key_gene_by_name_chrom):
    """Parse a tab-separated mutations table into per-sample OncoprintMutation lists.

    Only rows whose ``(gene, chrom)`` pair is in ``key_gene_by_name_chrom`` are
    kept; rows with a non-empty ``Incidentalome`` value are dropped.

    :param mutations_fpath: path to the mutations file; when empty or not
        verified the function returns immediately with no mutations.
    :param altered_genes: set of gene names, updated in place.
    :param key_gene_by_name_chrom: container of ``(gene, chrom)`` pairs.
    :return: ``(mut_by_samples, altered_genes)`` where ``mut_by_samples`` is a
        defaultdict(list) keyed by sample name.
    """
    mut_by_samples = defaultdict(list)
    if not mutations_fpath or not verify_file(mutations_fpath):
        return mut_by_samples, altered_genes

    info('Parsing mutations from ' + mutations_fpath)

    sample_col = chr_col = pos_col = type_col = None
    allele_freq_col = gene_col = depth_col = None
    aa_chg_col = cdna_chg_col = None
    # Optional columns stay None when the header does not have them.
    status_col = signif_col = incidentalome_col = None

    # Raw strings: '\d' and '\*' are regex escapes, not string escapes.
    stop_gain_pattern = re.compile(r'^[A-Z]+\d+\*')
    fs_pattern = re.compile(r'^[A-Z]+(\d+)fs')

    with open(mutations_fpath) as txt:
        for i, l in enumerate(txt):
            l = l.replace('\n', '')
            if not l:
                continue

            if i == 0:
                header = l.split('\t')
                sample_col = header.index('Sample')
                chr_col = header.index('Chr')
                pos_col = header.index('Start')
                type_col = header.index('Type')
                allele_freq_col = header.index('AlleleFreq')
                gene_col = header.index('Gene')
                aa_chg_col = header.index('Amino_Acid_Change')
                cdna_chg_col = header.index('cDNA_Change')
                depth_col = header.index('Depth')
                if 'Status' in header:
                    status_col = header.index('Status')
                if 'Significance' in header:
                    signif_col = header.index('Significance')
                elif 'Status' in header:
                    # No dedicated column: use the last 'Status' column.
                    # Guarded so that a header with neither column no longer
                    # fails with ValueError.
                    signif_col = len(header) - header[::-1].index('Status') - 1
                if 'Incidentalome' in header:
                    incidentalome_col = header.index('Incidentalome')
                continue

            fs = l.split('\t')
            sample, gene, chrom, pos, type_ = \
                fs[sample_col], fs[gene_col], fs[chr_col], fs[pos_col], fs[type_col]
            if (gene, chrom) not in key_gene_by_name_chrom:
                continue

            mut = OncoprintMutation(chrom, pos, gene)
            mut.aa_change, mut.cdna_change, mut.depth, mut.freq = \
                fs[aa_chg_col], fs[cdna_chg_col], fs[depth_col], float(fs[allele_freq_col])
            mut.status = fs[status_col] if status_col is not None else None
            mut.signif = fs[signif_col] if signif_col is not None else None

            incidentalome_reason = fs[incidentalome_col] if incidentalome_col is not None else None
            if incidentalome_reason:
                continue

            # Base class by significance; refined or replaced below.
            mut.type = 'Known' if mut.signif != 'unknown' else 'Unknown'
            if 'splice' in type_:
                mut.type = 'Splice'
            elif stop_gain_pattern.match(mut.aa_change):
                mut.type = 'Trunc/FS'
            elif fs_pattern.match(mut.aa_change):
                mut.type = 'Trunc/FS'
            elif mut.aa_change.startswith('-'):
                mut.type = 'Trunc/FS'
            elif 'missense' in type_:
                mut.type += '-Missense'
            elif 'ins' in mut.aa_change or 'del' in mut.aa_change:
                mut.type += '-Indel'
            else:
                mut.type += '-Other'

            mut_by_samples[sample].append(mut)
            altered_genes.add(gene)

    return mut_by_samples, altered_genes
def downsample(cnf, sample_name, fastq_L_fpath, fastq_R_fpath, N, output_dir, suffix=None, quick=False):
    """ get N random headers from a fastq file without reading the
    whole thing into memory
    modified from: http://www.biostars.org/p/6544/
    quick=True will just grab the first N reads rather than do a true
    downsampling

    :param cnf: config; ``reuse_intermediate`` and ``work_dir`` are read.
    :param sample_name: label for log messages; derived from the file paths
        when falsy.
    :param fastq_L_fpath: left (or only) reads file, possibly gzipped.
    :param fastq_R_fpath: right reads file, or None for single-end data.
    :param N: number of reads to keep (converted with ``int``).
    :param output_dir: directory for the downsampled files.
    :param suffix: suffix added to the output file names ('subset' by default).
    :return: ``(left_path, right_path)`` of the files to use: the downsampled
        files, or the input paths when the input has fewer than N reads.
        ``right_path`` is None for single-end data.
    """
    if not sample_name:
        if fastq_R_fpath:
            # Characters the two paths have in common, position by position.
            sample_name = splitext(''.join(
                lc if lc == rc else '' for lc, rc in izip(fastq_L_fpath, fastq_R_fpath)))[0]
        else:
            sample_name = splitext(basename(fastq_L_fpath))[0]

    suffix = suffix or 'subset'
    l_out_fpath = join(output_dir, add_suffix(basename(fastq_L_fpath), suffix))
    r_out_fpath = None
    if fastq_R_fpath:
        r_out_fpath = join(output_dir, add_suffix(basename(fastq_R_fpath), suffix))

    if cnf.reuse_intermediate and verify_file(l_out_fpath, silent=True) and \
            (r_out_fpath is None or verify_file(r_out_fpath, silent=True)):
        if r_out_fpath:
            info(l_out_fpath + ' and ' + r_out_fpath + ' exist, reusing.')
        else:
            info(l_out_fpath + ' exists, reusing.')
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    N = int(N)
    records_num = N
    if quick:
        rand_records = range(N)
    else:
        info(sample_name + ': getting number of reads in fastq...')
        counting_fh = open_gzipsafe(fastq_L_fpath)
        try:
            records_num = sum(1 for _ in counting_fh) // 4  # 4 lines per read
        finally:
            counting_fh.close()
        if records_num > LIMIT:
            info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
                 ', sampling from only first ' + str(LIMIT))
            records_num = LIMIT
        info(sample_name + ': ' + str(records_num) + ' reads')
        if records_num < N:
            info(sample_name + ': and it is less than ' + str(N) + ', so no downsampling.')
            return fastq_L_fpath, fastq_R_fpath
        info(sample_name + ': downsampling to ' + str(N))
        rand_records = sorted(random.sample(xrange(records_num), N))

    info('Opening ' + fastq_L_fpath)
    fh1 = open_gzipsafe(fastq_L_fpath)
    fh2 = None
    if fastq_R_fpath:
        info('Opening ' + fastq_R_fpath)
        fh2 = open_gzipsafe(fastq_R_fpath)

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else l_out_fpath

    written_records = 0
    try:
        with file_transaction(cnf.work_dir, out_files) as tx_out_files:
            if isinstance(tx_out_files, basestring):
                tx_out_f1, tx_out_f2 = tx_out_files, None
            else:
                tx_out_f1, tx_out_f2 = tx_out_files

            info('Opening ' + str(tx_out_f1) + ' to write')
            sub1 = open_gzipsafe(tx_out_f1, "w")
            sub2 = None
            if tx_out_f2:
                info('Opening ' + str(tx_out_f2) + ' to write')
                sub2 = open_gzipsafe(tx_out_f2, "w")

            try:
                rec_no = -1  # index of the last read consumed from the inputs
                for rr in rand_records:
                    # Skip reads up to, but not including, read number rr
                    # (the previous bound skipped rr itself and wrote rr + 1).
                    while rec_no < rr - 1:
                        rec_no += 1
                        for i in range(4):
                            fh1.readline()
                        if fh2:
                            for i in range(4):
                                fh2.readline()

                    left_read = [fh1.readline() for i in range(4)]
                    right_read = [fh2.readline() for i in range(4)] if fh2 else None
                    if not left_read[0]:
                        # Input exhausted (possible with quick=True on a short file).
                        info(sample_name + ': input ended after ' + str(rec_no + 1) + ' reads, stopping.')
                        break
                    for i in range(4):
                        sub1.write(left_read[i])
                        if sub2:
                            sub2.write(right_read[i])
                    written_records += 1
                    rec_no += 1

                    if written_records % 10000 == 0:
                        info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no))
                    if rec_no > records_num:
                        info(sample_name + ' reached the limit of ' + str(records_num) + ' read lines, stopping.')
                        break
                info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
            finally:
                # Outputs must be closed before the transaction finalizes them.
                sub1.close()
                if sub2:
                    sub2.close()
    finally:
        fh1.close()
        if fh2:
            fh2.close()

    if r_out_fpath:
        info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + r_out_fpath +
             ', total ' + str(written_records) + ' paired reads written')
    else:
        info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
             ', total ' + str(written_records) + ' reads written')
    return l_out_fpath, r_out_fpath
def run_combine_clinical_reports(cnf, bcbio_structures, parameters_info, samples_data):
    """Collect clinical info for the selected samples of all projects and build
    the combined clinical report.

    A sample is selected when ``cnf.sample_names`` is empty or contains its
    name. Each selected sample is stored under the key ``(group, uniq_key)``
    and afterwards gets the rejected mutations of its project attached.
    """
    info('Running clinical reporting comparison')

    infos_by_key = OrderedDict()
    sample_names = [s.name for bs in bcbio_structures for s in bs.samples]

    def is_selected(sample):
        wanted = cnf.sample_names
        return not wanted or sample.name in wanted

    def key_of(bs, sample):
        group = samples_data[bs.bcbio_project_dirpath][sample.name].group
        uniq_key = get_uniq_sample_key(bs.project_name, sample, sample_names)
        return group, uniq_key

    for bs in bcbio_structures:
        info()
        info('Preparing ' + bs.project_name + '...')
        info('-' * 70)

        for sample in bs.samples:
            if not is_selected(sample):
                continue
            info('Preparing ' + sample.name + '...')
            info('-' * 70)
            clin_info = clinical_sample_info_from_bcbio_structure(cnf, bs, sample)
            infos_by_key[key_of(bs, sample)] = clin_info
            info('')

        # Uses the clinical info of the last prepared sample for the gene set.
        rejected_mutations = get_rejected_mutations(
            cnf, bs, clin_info.key_gene_by_name_chrom, clin_info.genes_collection_type)
        for sample in bs.samples:
            if is_selected(sample):
                infos_by_key[key_of(bs, sample)].rejected_mutations = rejected_mutations[sample.name]

    sample_infos = OrderedDict({
        k: get_sample_info(e.sample.name, e.sample.dirpath, samples_data)
        for k, e in infos_by_key.items()
    })

    def sample_order(item):
        key, sample_info = item
        return [sample_info[j] for j in range(len(sample_info))], key[1]

    # NOTE(review): the sorted mapping is built but not passed anywhere below;
    # the unsorted ``infos_by_key`` is what gets reported -- confirm intent.
    sorted_experiments = OrderedDict()
    for k, _ in sorted(sample_infos.items(), key=sample_order):
        sorted_experiments[k] = infos_by_key[k]

    save_all_mutations_depth(cnf, infos_by_key)

    info('*' * 70)
    run_sample_combine_clinreport(cnf, infos_by_key, cnf.output_dir, parameters_info, samples_data)
    info('*' * 70)
    info('Successfully finished.')
def write_report(self, output_fpath, is_target2wgs=False, sample_names=None):
    """Assemble the template data of the combined clinical report and write the HTML.

    :param output_fpath: path of the HTML file to write.
    :param is_target2wgs: selects 'template_target2wgs.html' instead of
        'template_combine.html'.
    :param sample_names: optional mapping experiment -> extra label appended
        to the sample name in the experiment section.
    :return: ``output_fpath``.
    """
    info('')

    data = dict()
    data['key_or_target'] = self.experiment_by_key.values()[0].genes_collection_type
    data['genes_description'] = self.experiment_by_key.values()[0].genes_description

    experiment_sections = []
    for experiment in self.experiment_by_key.values():
        title = experiment.sample.name
        if sample_names and experiment in sample_names:
            title = title + (', ' + sample_names[experiment])
        experiment_sections.append(self.sample_section(experiment, sample_name=title))
    data['sample'] = {'experiments': experiment_sections}

    data['variants'] = self.__mutations_section(self.mutations_report, self.experiment_by_key)
    data['coverage'] = self.__coverage_section(self, self.key_genes_report, self.cov_plot_data)

    min_af = self.cnf.min_af or 0
    data['min_af'] = str(float(min_af) * 100)

    if self.seq2c_report:
        data['seq2c'] = {'amp_del': self.seq2c_section()}
        if self.seq2c_plot_data:
            data['seq2c']['plot'] = {'plot_data': self.seq2c_plot_data}
    if data['variants']:
        data['variants']['venn_diagram'] = {'diagram_data': self.venn_plot_data}
        data['variants']['mut_parameters'] = self.mutations_parameters

    module_dirpath = dirname(abspath(__file__))
    static_dirpath = join(module_dirpath, 'static')
    tmpl_fname = 'template_target2wgs.html' if is_target2wgs else 'template_combine.html'
    js_fnames = [
        'clinical_report.js',
        'combined_clinical_report.js',
        'draw_genes_coverage_plot.js',
        'draw_mutations_plot.js',
        'd3.min.js',
        'venn.js',
        'draw_venn_diagram.js',
        'draw_substitutions_plot.js',
        'draw_seq2c_plot.js',
    ]
    css_fnames = ['clinical_report.css', 'header_picture.css']

    write_static_html_report(
        self.cnf, data, output_fpath,
        tmpl_fpath=join(module_dirpath, tmpl_fname),
        extra_js_fpaths=[join(static_dirpath, fname) for fname in js_fnames],
        extra_css_fpaths=[join(static_dirpath, fname) for fname in css_fnames])

    info('Saved clinical report to ' + output_fpath)
    info('-' * 70)
    info()
    return output_fpath
def draw_seq2c_plot(cnf, seq2c_tsv_fpath, sample_name, output_dir, key_gene_names=None, chr_lens=None):
    """Draw the genome-wide Seq2C log2-ratio scatter plot of one sample.

    Rows of the Seq2C TSV belonging to ``sample_name`` (optionally restricted
    to ``key_gene_names``) are placed on a single x axis of concatenated
    chromosomes; amplifications and deletions are drawn as separate series.

    :param chr_lens: optional iterable of ``(chromosome, length)`` pairs;
        get_chr_lengths(cnf) is used when it is falsy.
    :return: path of the saved plot, or None when the TSV is not verified.
    """
    info('Seq2C plot builder')
    plt = matplotlib.pyplot

    plot_fpath = join(output_dir, sample_name + cnv_plot_ending)
    if cnf.reuse_intermediate and verify_file(plot_fpath, silent=True):
        info('Seq2C plot ' + plot_fpath + ' exists, reusing...')
        return plot_fpath

    if not verify_file(seq2c_tsv_fpath, 'Seq2C.tsv'):
        return None

    # not drawing extra chromosomes chr1_blablabla
    chr_names_lengths = OrderedDict(
        (name, length) for name, length in (chr_lens or get_chr_lengths(cnf)) if '_' not in name)
    chr_names = chr_names_lengths.keys()
    chr_short_names = [name[3:] for name in chr_names_lengths.keys()]
    chr_lengths = [length for length in chr_names_lengths.values()]

    fig = plt.figure(figsize=(25, 5))
    plt.xlim([0, len(chr_lengths) + 1])

    # Offset of every chromosome on the shared x axis (plus the total at the end).
    chr_cum_lens = [0]
    for length in chr_lengths:
        chr_cum_lens.append(chr_cum_lens[-1] + length)
    plt.xticks(chr_cum_lens, [])

    ax = plt.gca()
    # Chromosome labels go to the middle of each chromosome.
    chr_names_coords = []
    for idx, length in enumerate(chr_lengths):
        chr_names_coords.append(chr_cum_lens[idx + 1] - length / 2)
    ax.xaxis.set_minor_locator(ticker.FixedLocator(chr_names_coords))
    ax.xaxis.set_minor_formatter(ticker.FixedFormatter(chr_short_names))

    chr_cum_len_by_chrom = dict(zip(chr_names, chr_cum_lens))

    nrm_xs, nrm_ys = [], []
    amp_xs, amp_ys, amp_gs = [], [], []
    del_xs, del_ys, del_gs = [], [], []

    with open(seq2c_tsv_fpath) as f:
        for line_no, line in enumerate(f):
            if line_no == 0:  # header
                continue
            fs = line.replace('\n', '').split('\t')
            sname, gname = fs[0], fs[1]
            if key_gene_names and gname not in key_gene_names:
                continue
            if sname != sample_name:
                continue

            sname, gname, chrom, start, end, length, log2r, sig, type_, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]

            x = chr_cum_len_by_chrom[chrom] + (int(start) + int(end)) / 2

            if not ab_log2r or type_ == 'BP':  # breakpoint, meaning part of exon is not amplified
                nrm_xs.append(x)
                nrm_ys.append(float(log2r))

            if ab_log2r:
                y = float(ab_log2r)
                if amp_del == 'Amp':
                    amp_xs.append(x)
                    amp_ys.append(y)
                    amp_gs.append(gname)
                elif amp_del == 'Del':
                    del_xs.append(x)
                    del_ys.append(y)
                    del_gs.append(gname)
                else:
                    warn('Event is not Amp or Del, it\'s ' + amp_del)

    plt.scatter(nrm_xs, nrm_ys, marker='.', color='k', s=1)
    plt.scatter(amp_xs, amp_ys, marker='o', color='b', s=2)
    plt.scatter(del_xs, del_ys, marker='o', color='r', s=2)

    # Gene labels are drawn only while the plot stays readable.
    few_events = len(amp_xs) + len(del_xs) < 40
    if len(amp_xs) <= 10 or few_events:
        for x, y, g in zip(amp_xs, amp_ys, amp_gs):
            ax.text(x, y, g, fontsize=9, color='g',
                    verticalalignment='center', horizontalalignment='center')
    if len(del_xs) <= 10 or few_events:
        for x, y, g in zip(del_xs, del_ys, del_gs):
            ax.text(x, y, g, fontsize=9, color='r',
                    verticalalignment='center', horizontalalignment='center')

    # The y range always covers at least [-2, 2], with a 5% margin.
    y_top = max(chain(nrm_ys, amp_ys, del_ys, [2])) * 1.05
    y_bottom = min(chain(nrm_ys, amp_ys, del_ys, [-2])) * 1.05
    plt.ylim(ymax=y_top, ymin=y_bottom)
    plt.tick_params(axis='x', which='minor', bottom='off', top='off', labelbottom='on')

    info('Saving plot to ' + plot_fpath)
    plt.tight_layout()
    fig.savefig(plot_fpath, bbox_inches='tight')
    plt.close(fig)

    info('Done')
    info('-' * 70)
    return plot_fpath
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    """Register the project in the Data Query Tool and return its Oncoprints URL.

    Writes the study ``.data.txt`` / ``.info.txt`` files into the Data Query
    directory, symlinks them into the exposed ``DataQueryTool`` directory,
    registers the study in ``DataQuery.properties`` and builds the query URL.

    Side effects on ``cnf``: sets ``mutations_fpath``, ``seq2c_tsv_fpath``
    and ``project_name``.

    :param cnf: configuration object.
    :param bcbio_structure: project structure providing variant callers,
        samples, ``var_dirpath``, ``seq2c_fpath`` and ``project_name``.
    :param project_name: optional project name overriding the one from
        ``bcbio_structure``.
    :return: the Data Query Tool URL, or ``None`` when the link cannot be
        generated (not on the US location, no vardict caller, Data Query
        directory missing, or no altered genes).
    """
    # Only the US location is handled; the UK branch is disabled.
    if not is_us():
        loc = exposing.local
        return None
    loc = exposing.us

    callers = bcbio_structure.variant_callers
    if not callers:
        info('No varianting calling performed, not generating Oncoprints')
        return None

    clinical_report_caller = callers.get('vardict') or callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, this not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exists.')
        return None

    # Inputs: PASSed vardict mutations and the Seq2C table.
    mut_fname = variant_filtering.mut_fname_template.format(
        caller_name=clinical_report_caller.name)
    mut_fpath = join(bcbio_structure.var_dirpath, mut_fname)
    cnf.mutations_fpath = add_suffix(mut_fpath, variant_filtering.mut_pass_suffix)
    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    # Study name with '.', '-', ':' and '&' replaced by underscores.
    study = re.sub('[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    exposed_dirpath = join(loc.dirpath, 'DataQueryTool')
    data_fpath = join(zhongwu_data_query_dirpath, study + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study + '.info.txt')

    altered_genes = print_data_txt(
        cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath +
            ', not generating Oncoptints.')
        return None
    print_info_txt(cnf, samples, info_fpath)

    # The same files addressed through the /users/ prefix instead of /home/.
    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional: expose both files via symlinks in the DataQueryTool directory
    link_targets = (
        (data_ext_fpath, join(exposed_dirpath, study + '.data.txt')),
        (info_ext_fpath, join(exposed_dirpath, study + '.info.txt')),
    )
    for target_fpath, link_fpath in link_targets:
        make_link = symlink_to_ngs if is_us() else local_symlink
        make_link(target_fpath, link_fpath)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study, properties_fpath, data_ext_fpath, info_ext_fpath)

    # Gene names are separated by URL-encoded CRLF.
    gene_list = '%0D%0A'.join(altered_genes)
    query = (
        'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
    ).format(study_name=study, genes=gene_list)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', query)

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
def __init__(self, cnf, experiment_by_key, parameters_info=None, samples_data=None, *args):
    """Prepare combined and per-group clinical report data for several experiments.

    Annotates every experiment with its key and report path, orders the
    experiments by their sample info, then builds the mutations report
    (overall and one per group number) and one Seq2C report per group number.

    :param cnf: configuration object; ``output_dir`` and ``work_dir`` are
        read here and it is passed on to ``BaseClinicalReporting``.
    :param experiment_by_key: mapping of experiment key -> experiment. Keys
        must be indexable (``key[1]`` is used for ordering) and accepted by
        ``get_group_num``.
    :param parameters_info: forwarded to ``make_mutations_report``.
    :param samples_data: forwarded to ``get_sample_info``,
        ``make_mutations_report`` and ``make_seq2c_report``.
    :param args: extra positional arguments for
        ``BaseClinicalReporting.__init__``.
    """
    import re  # once per call rather than once per experiment

    BaseClinicalReporting.__init__(self, cnf, *args)

    self.experiment_by_key = experiment_by_key

    self.mutations_report = None
    self.mutations_plot_data = None
    self.venn_plot_data = None
    self.substitutions_plot_data = None
    self.sv_report = None
    self.actionable_genes_report = None
    self.seq2c_plot_data = None
    self.seq2c_report = None
    self.key_genes_report = None
    self.cov_plot_data = None

    self.mutations_by_experiment = OrderedDict()
    self.mutations_reports = dict()   # group number -> (mutations report, venn plot data)
    self.mutations_parameters = None
    self.seq2c_reports = defaultdict()  # group number -> Seq2C report
    self.sample_names = []

    for k, e in experiment_by_key.items():
        e.key = k
        e.sample.clinical_html = abspath(
            join(cnf.output_dir, 'report_' + str(get_group_num(k)) + '.html'))
        e.cnf.work_dir = cnf.work_dir
        # is_us is a function: it has to be called, otherwise the condition
        # is always true and the /ngs/ -> /gpfs/ngs/ rewrite runs everywhere.
        if is_us():
            e.sample.clinical_html = re.sub('^/ngs/', '/gpfs/ngs/', e.sample.clinical_html)
            e.project_report_path = re.sub('^/ngs/', '/gpfs/ngs/', e.project_report_path)
        self.sample_names.append(e.sample.name)

    # Built from pairs (not from an intermediate dict) so that the stable
    # sort below breaks ties by the input order of experiment_by_key.
    sample_infos = OrderedDict(
        (k, get_sample_info(e.sample.name, e.sample.dirpath, samples_data))
        for k, e in experiment_by_key.items())
    # Order by the second element of the key, then by the sample info values.
    sorted_sample_infos = sorted(
        sample_infos.items(),
        key=lambda x: (x[0][1], [x[1][j] for j in range(len(x[1]))]))
    sorted_experiments = OrderedDict(
        (k, experiment_by_key[k]) for k, _ in sorted_sample_infos)
    self.experiment_by_key = sorted_experiments

    jbrowser_link = get_jbrowser_link(self.cnf.genome.name, self.sample_names)

    info('Preparing data...')
    for e in sorted_experiments.values():
        if e.mutations:
            self.mutations_by_experiment[e] = e.mutations

    group_nums = set(get_group_num(key) for key in self.experiment_by_key.keys())

    if self.mutations_by_experiment:
        # Report over all experiments together...
        self.mutations_report, self.venn_plot_data = self.make_mutations_report(
            self.mutations_by_experiment, jbrowser_link,
            samples_data=samples_data, parameters_info=parameters_info,
            create_venn_diagrams=True)

        # ...and one report per group.
        info('Preparing data for each sample...')
        for num in group_nums:
            sample_mut_report, venn_plot_data = self.make_mutations_report(
                self.mutations_by_experiment, jbrowser_link,
                samples_data=samples_data, parameters_info=parameters_info,
                create_venn_diagrams=True, cur_group_num=num)
            self.mutations_reports[num] = (sample_mut_report, venn_plot_data)

    seq2c_events_by_experiment = {
        e: e.seq2c_events_by_gene
        for e in experiment_by_key.values()
        if e.seq2c_events_by_gene}
    if seq2c_events_by_experiment:
        for num in group_nums:
            seq2c_report = self.make_seq2c_report(
                seq2c_events_by_experiment, samples_data=samples_data, cur_group_num=num)
            self.seq2c_reports[num] = seq2c_report