def write_coverage(cnf, output_dir, chrom, depths_by_pos, cov_thresholds, sample_index=None):
    coverage_data_fpath = join(output_dir, chrom + '.txt')
    if not cnf.reuse_intermediate or (not verify_file(coverage_data_fpath, silent=True)
                                      and not verify_file(coverage_data_fpath + '.gz', silent=True)):
        chrom_num = chrom.replace('chr', '')
        with file_transaction(cnf.work_dir, coverage_data_fpath) as tx:
            with open(tx, 'w') as f:
                fs = ['#chrom', 'pos', 'mean', 'median'] + [str(t) for t in cov_thresholds]
                f.write('\t'.join(fs) + '\n')
                sorted_positions = sorted(depths_by_pos.keys())
                for pos in sorted_positions:
                    depths = depths_by_pos[pos] if sample_index is None \
                        else [depths_by_pos[pos][sample_index]]
                    mean_coverage = mean(depths)
                    median_coverage = median(depths)
                    pcnt_samples_ge_threshold = [mean([1 if d >= t else 0 for d in depths])
                                                 for t in cov_thresholds]
                    res_line = chrom_num + '\t' + str(pos) + '\t' + str(mean_coverage) + '\t' + str(median_coverage)
                    for pcnt_samples in pcnt_samples_ge_threshold:
                        res_line += '\t' + str(pcnt_samples)
                    f.write(res_line + '\n')
    bgzip_and_tabix(cnf, coverage_data_fpath, tabix_parameters='-p bed')

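
# A minimal sketch (not part of the pipeline) of the per-position stats written
# above: mean depth and the fraction of samples reaching each coverage
# threshold. The depth values are made up for illustration.
def _example_position_stats():
    depths = [0, 12, 25, 40]                    # depth at one position across 4 samples
    cov_thresholds = [1, 10, 30]
    mean_cov = float(sum(depths)) / len(depths) # 19.25
    fractions = [mean([1 if d >= t else 0 for d in depths]) for t in cov_thresholds]
    # fractions == [0.75, 0.75, 0.25]: share of samples with depth >= 1, 10, 30
    return mean_cov, fractions
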
def get_regions_coverage(cnf, samples):
    cov_thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]
    depths_by_pos = defaultdict(lambda: [0] * len(samples))
    info()
    info('Coverage to bedgraph for ' + cnf.chrom)
    coverage_fpaths = []
    for index, sample in enumerate(samples):
        coverage_fpath = join(cnf.work_dir, sample.name + '_' + cnf.chrom + '.bedgraph')
        coverage_fpath = get_bedgraph_coverage(cnf, sample.bam, chr_len_fpath=cnf.chr_len_fpath,
                                               bed_fpath=cnf.bed, output_fpath=coverage_fpath,
                                               exit_on_error=False)
        if coverage_fpath and verify_file(coverage_fpath):
            coverage_fpaths.append(coverage_fpath)
            for line in open(coverage_fpath):
                if line.startswith('#'):
                    continue
                chrom, start, end, depth = line.split('\t')
                start, end, depth = map(int, (start, end, depth))
                for pos in xrange(start, end):
                    depths_by_pos[pos][index] = depth
    info()
    if not coverage_fpaths:
        warn(cnf.chrom + ' is not covered in any sample')
        return None

    info()
    info('Writing coverage for ' + cnf.chrom)
    write_coverage(cnf, cnf.output_dir, cnf.chrom, depths_by_pos, cov_thresholds)
    for index, sample in enumerate(samples):
        info('Writing coverage for ' + sample.name + ', ' + cnf.chrom)
        sample_output_dirpath = join(cnf.output_dir, sample.name)
        output_fpath = join(sample_output_dirpath, cnf.chrom + '.txt.gz')
        if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            continue
        write_coverage(cnf, sample_output_dirpath, cnf.chrom, depths_by_pos, cov_thresholds,
                       sample_index=index)
        if not verify_file(output_fpath, silent=True):
            warn(sample.name + ' has no coverage at chromosome ' + cnf.chrom)
    return depths_by_pos

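
# Sketch of how one bedgraph record expands into per-position depths above.
# bedgraph intervals are 0-based and half-open, so positions start..end-1 get
# the interval's depth. The record below is hypothetical.
def _example_bedgraph_expansion():
    line = 'chr1\t100\t103\t42'
    chrom, start, end, depth = line.split('\t')
    start, end, depth = map(int, (start, end, depth))
    covered = dict((pos, depth) for pos in xrange(start, end))
    # covered == {100: 42, 101: 42, 102: 42}; position 103 is excluded (half-open)
    return covered
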
def main():
    info(' '.join(sys.argv))
    info()
    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
                      help='Downsample reads to avoid excessive processing times with large files. '
                           'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), determine_run_cnf(opts))

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir \
        else critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    with workdir(cnf):
        sample_name = cnf.sample_name
        if not sample_name:
            sample_name = _get_sample_name(left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(cnf, sample_name, left_reads_fpath, right_reads_fpath,
                                    output_dirpath, downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    info()
    info('*' * 70)
    info('Fastqc results:')
    info(' ' + results_dirpath)

def main():
    info(' '.join(sys.argv))
    info()
    description = 'This script runs preprocessing.'

    parser = OptionParser(description=description)
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    parser.add_option('--downsample-to', dest='downsample_to', default=None, type='int',
                      help='Downsample reads to avoid excessive processing times with large files. '
                           'Default is 1 million. Set to 0 to turn off downsampling.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    (opts, args) = parser.parse_args()

    if not opts.left_reads_fpath or not opts.right_reads_fpath or not opts.output_dir:
        parser.print_usage()

    verify_file(opts.left_reads_fpath, is_critical=False)
    left_reads_fpath = adjust_path(opts.left_reads_fpath)
    verify_file(opts.right_reads_fpath, is_critical=False)
    right_reads_fpath = adjust_path(opts.right_reads_fpath)
    output_dirpath = adjust_path(opts.output_dir) if opts.output_dir \
        else critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    left_reads_fpath, right_reads_fpath, output_dirpath = \
        map(_proc_path, [left_reads_fpath, right_reads_fpath, output_dirpath])

    ssh = connect_to_server(server_url='blue.usbod.astrazeneca.net',
                            username='******', password='******')
    fastqc_py = get_script_cmdline(None, 'python', 'scripts/pre/fastqc.py')
    fastqc_py = fastqc_py.replace(REPORTING_SUITE_PATH_CLARITY, REPORTING_SUITE_PATH_WALTHAM)
    fastqc_py = fastqc_py.replace(PYTHON_PATH_CLARITY, PYTHON_PATH_WALTHAM)

    cmdl = '{fastqc_py} -1 {left_reads_fpath} -2 {right_reads_fpath} -o {output_dirpath}'
    if opts.sample_name:
        cmdl += ' --sample {opts.sample_name}'
    if opts.downsample_to:
        cmdl += ' --downsample-to ' + str(int(opts.downsample_to))
    cmdl = cmdl.format(**locals())
    cmdl += ' 2>&1'
    info(cmdl)
    stdin, stdout, stderr = ssh.exec_command(cmdl)
    for l in stdout:
        err(l, ending='')
    info()
    ssh.close()

def save_all_mutations_depth(cnf, infos_by_key):
    mut_bed_fpath = join(cnf.work_dir, 'mutations.bed')
    if not cnf.reuse_intermediate or not verify_file(mut_bed_fpath):
        all_mutations_pos = defaultdict(set)
        for e in infos_by_key.values():
            for mut in e.mutations:
                all_mutations_pos[mut.chrom].add(mut.pos)
        with file_transaction(cnf.work_dir, mut_bed_fpath) as tx:
            with open(tx, 'w') as out_f:
                for chrom, positions in all_mutations_pos.iteritems():
                    for pos in positions:
                        out_f.write('\t'.join([chrom, str(pos - 1), str(pos)]) + '\n')

    sambamba_output_by_experiment = run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath)
    for e, sambamba_output_fpath in sambamba_output_by_experiment.iteritems():
        regions = parse_sambamba_depth_output(e.sample.name, sambamba_output_fpath)
        depth_dict = dict()
        for region in regions:
            depth_dict[region.end] = region.avg_depth
        e.mutations_depth = depth_dict

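
# Sketch of the 1-based -> BED conversion used above. BED is 0-based and
# half-open, so a single 1-based position pos becomes the interval
# [pos - 1, pos). The chromosome and position are hypothetical.
def _example_mutation_bed_line():
    pos = 7577120
    # -> 'chr17\t7577119\t7577120', covering exactly the one mutated base
    return '\t'.join(['chr17', str(pos - 1), str(pos)])
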
def extract_graphs(samples):  # Sample(name, fastq_fpath)
    parsed_data = OrderedDict((h, list()) for h in _header)
    for s in samples:
        if verify_file(s.fastqc_html_fpath, 's.fastqc_html_fpath for ' + s.name):
            with open(s.fastqc_html_fpath) as source_file_obj:
                html = source_file_obj.read()
            parts = [p.split('</div>')[0] for p in html.split('<div class="module">')[1:]]
            # Each part is either <h2><img/></h2><table></table></div>
            # or <h2><img/></h2><p><img/></p></div>
            for i, part in enumerate(parts):
                table, graph = '', ''
                ok_img = '<img ' + part.split('"><img')[1].split('>')[0] + '>'
                if '<table>' in part:
                    table = '<table>' + part.split('<table>')[1]
                if '<p><img ' in part:
                    graph = '<img ' + part.split('<p><img')[1].split('>')[0] + '>'
                parsed_data[_header[i]].append([s.name, ok_img, graph, table])
        else:
            err('Could not find fastqc html fpath for sample ' + s.name + ': ' + str(s.fastqc_html_fpath))
    return parsed_data

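
# Toy illustration (hypothetical, simplified HTML) of the string-splitting
# above: each FastQC module lives in a '<div class="module">...</div>' block,
# and splitting on the opening div isolates one module per list element.
def _example_module_split():
    html = ('<header>...</header>'
            '<div class="module"><h2>Per base quality</h2><p><img src="g1.png"></p></div>'
            '<div class="module"><h2>Adapter content</h2><table>...</table></div>')
    parts = [p.split('</div>')[0] for p in html.split('<div class="module">')[1:]]
    # parts == ['<h2>Per base quality</h2><p><img src="g1.png"></p>',
    #           '<h2>Adapter content</h2><table>...</table>']
    return parts
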
def get_key_genes(key_genes_fpath):
    key_genes_fpath = verify_file(key_genes_fpath, is_critical=True, description='820 AZ key genes')
    with open(key_genes_fpath) as f:
        key_gene_names = set(l.strip() for l in f.readlines() if l.strip() != '')
    return key_gene_names

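
# Hedged usage sketch (the path is hypothetical): the key-genes file is assumed
# to hold one gene symbol per line, blank lines ignored.
#
#   key_gene_names = get_key_genes('/path/to/az_key_genes.820.txt')
#   if 'TP53' in key_gene_names:
#       ...  # gene is in the key panel
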
def run_fastq(cnf, sample_name, l_r_fpath, r_r_fpath, output_dirpath, downsample_to=1e7):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)

    if downsample_to:
        info('Downsampling to ' + str(downsample_to))
        # Use the downsampled files downstream (the original code discarded them)
        l_r_fpath, r_r_fpath = downsample(cnf, sample_name, l_r_fpath, r_r_fpath,
                                          downsample_to, output_dir=cnf.work_dir)

    # Joining fastq files to run on the combination
    fastqc_fpath = join(cnf.work_dir, sample_name + '.fq')
    info('Combining fastqs, writing to ' + fastqc_fpath)
    with open(fastqc_fpath, 'w') as out:
        out.write(open_gzipsafe(l_r_fpath).read())
        out.write(open_gzipsafe(r_r_fpath).read())

    # Running FastQC
    info('Running FastQC')
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + sample_name + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline = ('{fastqc} --dir {tmp_dirpath} --extract -o {output_dirpath} '
               '-f fastq -j {java} {fastqc_fpath}').format(**locals())
    call(cnf, cmdline)

    # Cleaning and getting the report
    sample_fastqc_dirpath = join(output_dirpath, sample_name + '.fq_fastqc')
    if isfile(sample_fastqc_dirpath + '.zip'):
        os.remove(sample_fastqc_dirpath + '.zip')
    fastqc_html_fpath = join(sample_fastqc_dirpath, 'fastqc_report.html')
    verify_file(fastqc_html_fpath, is_critical=True)
    return sample_fastqc_dirpath

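
# Hedged sketch of the report layout assumed above: FastQC names its output
# after the input file, so a combined '<sample>.fq' yields a
# '<sample>.fq_fastqc' directory with 'fastqc_report.html' inside.
def _example_fastqc_paths(sample_name='sampleA'):      # sample name is hypothetical
    report_dirname = sample_name + '.fq_fastqc'
    return join(report_dirname, 'fastqc_report.html')  # 'sampleA.fq_fastqc/fastqc_report.html'
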
def parse_gene_counts(counts_fpath, key_gene_names, report_name, keep_gene_names):
    gene_counts = defaultdict(list)
    info('Preparing ' + report_name + ' stats for expression heatmaps')

    info('Checking ' + counts_fpath)
    if not verify_file(counts_fpath):
        err('Cannot find ' + report_name + ' fpath')
        return gene_counts, []  # keep the (gene_counts, samples) return shape on failure

    info('Reading ' + report_name + ' from ' + counts_fpath)
    samples_cols = dict()
    samples = []
    gene_col = None
    with open(counts_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                header = l.strip().split('\t')
                gene_col = header.index('HUGO')
                samples = header[1:gene_col]
                samples_cols = {sample: col + 1 for col, sample in enumerate(samples)}
                continue
            fs = l.replace('\n', '').split('\t')
            gene_name = fs[gene_col]
            if key_gene_names and gene_name not in key_gene_names:
                continue
            gene_expression_dict = {
                sample: int(float(fs[col])) if float(fs[col]).is_integer() else float(fs[col])
                for sample, col in samples_cols.iteritems()}
            if all(v < HEATMAPS_MIN_COUNT for v in gene_expression_dict.values()):
                continue
            is_hidden_row = False
            name = gene_name
            if ':' in fs[0]:  # exon-level row
                is_hidden_row = True
                exon_number = fs[0].split(':')[1]
                name += ':' + exon_number
            if keep_gene_names:
                is_hidden_row = True
                name = fs[0]  # use id
            gene = Counts(name, gene_name=gene_name, counts=gene_expression_dict,
                          is_hidden_row=is_hidden_row)
            gene_counts[gene_name].append(gene)
    return gene_counts, samples

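
# Assumed counts layout (values hypothetical): the first column is a gene or
# gene:exon id, sample columns follow, and a HUGO symbol column closes the row:
#
#   id           sampleA  sampleB  HUGO
#   ENSG0001     10       0        TP53
#   ENSG0001:2   7        0        TP53   <- exon-level row, hidden by default
def _example_header_parsing():
    header = ['id', 'sampleA', 'sampleB', 'HUGO']
    gene_col = header.index('HUGO')      # 3
    samples = header[1:gene_col]         # ['sampleA', 'sampleB']
    samples_cols = {sample: col + 1 for col, sample in enumerate(samples)}
    # samples_cols == {'sampleA': 1, 'sampleB': 2} -- column index of each sample
    return gene_col, samples, samples_cols
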
def annotate_gene_counts(cnf, counts_fpath, ann_counts_fpath, genes_dict):
    unannotated_fpath = counts_fpath
    if not verify_file(unannotated_fpath):
        critical('Not found counts ' + unannotated_fpath)
    with file_transaction(cnf.work_dir, ann_counts_fpath) as tx:
        with open(tx, 'w') as annotated_f:
            with open(unannotated_fpath) as f:
                for i, l in enumerate(f):
                    if i == 0:
                        header = l.replace('\n', '').split('\t')
                        l = '\t'.join(header + ['HUGO'])
                        annotated_f.write(l + '\n')
                        continue
                    fs = l.replace('\n', '').split('\t')
                    gene_and_exon = fs[0].split(':')
                    gene_id = gene_and_exon[0]
                    if gene_id not in genes_dict:
                        continue
                    gene_symbol = genes_dict[gene_id]
                    l = '\t'.join(fs + [gene_symbol])
                    annotated_f.write(l + '\n')
    if not verify_file(ann_counts_fpath):
        critical('Could not annotate counts ' + unannotated_fpath)

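
# Minimal sketch of the annotation step above (ids and symbols are made up):
def _example_annotate_row():
    genes_dict = {'ENSG0001': 'TP53'}
    fs = 'ENSG0001:2\t7\t0'.split('\t')
    gene_id = fs[0].split(':')[0]  # 'ENSG0001' -- exon suffix dropped for the lookup
    # -> 'ENSG0001:2\t7\t0\tTP53'; rows whose id is missing from genes_dict are skipped
    return '\t'.join(fs + [genes_dict[gene_id]])
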
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []
        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir, uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath
            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath, silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            if not e.sample.bam:
                err('Sample ' + e.sample.name + ' in ' + str(group) + ', ' + str(uniq_key) + ' has no BAM')
                continue
            j = sambamba_depth(cnf, mut_bed_fpath, e.sample.bam,
                               output_fpath=sambamba_output_fpath, only_depth=True,
                               silent=True, use_grid=True)
            submitted_experiments.append(e)
            if not j.is_done:
                jobs_to_wait.append(j)
            if len(jobs_to_wait) >= cnf.threads:
                break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_experiments = [e for e in not_submitted_experiments
                                     if e not in submitted_experiments and e not in reused_experiments]
    return sambamba_output_by_experiment

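
# Toy illustration of the submit-and-wait batching above: at most cnf.threads
# jobs are submitted per round, then the loop waits and picks up the remainder.
def _example_batching(experiments, threads):
    done = []
    while experiments:
        batch, experiments = experiments[:threads], experiments[threads:]
        # a real run would submit `batch` to the grid and wait_for_jobs(...) here
        done.extend(batch)
    return done  # _example_batching(['e1', 'e2', 'e3'], 2) -> two rounds
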
def process_all(cnf, bcbio_structure):
    samples = bcbio_structure.samples
    key_gene_by_name, use_custom_panel = get_key_or_target_bed_genes(
        cnf.bed, verify_file(adjust_system_path(cnf.key_genes), 'key genes'))
    key_or_target_genes = 'target' if use_custom_panel else 'key'
    mutations = {}
    for sample in samples:
        mutations[sample.name] = parse_mutations(cnf, sample, key_gene_by_name, cnf.mutations_fpath,
                                                 key_or_target_genes, for_flagged_report=True)
    _generate_summary_flagged_regions_report(cnf, bcbio_structure, samples, mutations,
                                             key_or_target_genes)

def main():
    cnf, vcf2txt_res_fpath = get_args()

    info('-' * 70)
    info('Writing to ' + cnf.output_file)
    if cnf.all_transcripts_output_file:
        info('Writing info for all transcripts to ' + cnf.all_transcripts_output_file)
    if cnf.fm_output_file:
        info('Writing in FM format to ' + cnf.fm_output_file)
    if cnf.rejected_output_file:
        info('Writing rejected mutations to ' + cnf.rejected_output_file)

    f = Filtration(cnf)

    input_f = open(verify_file(vcf2txt_res_fpath))
    output_f = open(adjust_path(cnf.output_file), 'w')
    rejected_output_f = open(adjust_path(cnf.rejected_output_file), 'w') \
        if cnf.rejected_output_file else None
    fm_output_f = open(adjust_path(cnf.fm_output_file), 'w') \
        if cnf.fm_output_file else None
    all_transcripts_output_f = open(adjust_path(cnf.all_transcripts_output_file), 'w') \
        if cnf.all_transcripts_output_file else None

    info()
    info('-' * 70)
    info('Running filtering...')
    f.do_filtering(input_f, output_f, fm_output_f, all_transcripts_output_f, rejected_output_f)

    input_f.close()
    output_f.close()
    if fm_output_f:
        fm_output_f.close()
    if all_transcripts_output_f:
        all_transcripts_output_f.close()
    if rejected_output_f:  # was left unclosed in the original
        rejected_output_f.close()

    info()
    if cnf.rejected_output_file:
        info('Rejected mutations saved to ' + cnf.rejected_output_file)
    info('Saved to ' + cnf.output_file)

def _links_show_hide(out, samples):
    out.write('<form name="tcol" onsubmit="return false"> Show columns <br/>\n')
    out.write('<table>\n')
    i = 0
    list_of_chunks = list(_chunks([s.name for s in samples if verify_file(s.fastqc_html_fpath)], 6))
    for chunk in list_of_chunks:  # renamed from `samples` to avoid shadowing the argument
        out.write('<tr>\n')
        for sample_name in chunk:
            out.write('<td><input type=checkbox name="col' + str(i) +
                      '" onclick="toggleVis(' + str(i) + ')" checked> ' + sample_name + '</td>\n')
            i += 1
        out.write('</tr>\n')
    out.write('</table>\n')
    out.write('</form>\n')

def get_rejected_mutations(cnf, bs, key_gene_by_name_chrom, genes_collection_type):
    rejected_mutations = defaultdict(dict)
    rejected_mutations_by_sample = defaultdict(list)
    pass_mutations_fpath, _ = get_mutations_fpath_from_bs(bs)
    for reject_mutations_fpath in get_rejected_mutations_fpaths(pass_mutations_fpath):
        if verify_file(reject_mutations_fpath, silent=True):
            info('Parsing rejected mutations from ' + str(reject_mutations_fpath))
            parse_mutations(cnf, None, key_gene_by_name_chrom, reject_mutations_fpath,
                            genes_collection_type, mutations_dict=rejected_mutations_by_sample)
            for sample, mutations in rejected_mutations_by_sample.iteritems():
                for mut in mutations:
                    rejected_mutations[sample][(mut.gene.name, mut.pos)] = mut
    return rejected_mutations

def draw_seq2c_plot(cnf, seq2c_tsv_fpath, sample_name, output_dir, key_gene_names=None, chr_lens=None):
    info('Seq2C plot builder')
    plot_fpath = join(output_dir, sample_name + cnv_plot_ending)
    if cnf.reuse_intermediate and verify_file(plot_fpath, silent=True):
        info('Seq2C plot ' + plot_fpath + ' exists, reusing...')
        return plot_fpath

    if not verify_file(seq2c_tsv_fpath, 'Seq2C.tsv'):
        return None

    chr_names_lengths = OrderedDict((chr_, l) for chr_, l in (chr_lens or get_chr_lengths(cnf))
                                    if '_' not in chr_)  # not drawing extra chromosomes chr1_blablabla
    chr_names = chr_names_lengths.keys()
    chr_short_names = [chrom[3:] for chrom in chr_names]
    chr_lengths = chr_names_lengths.values()

    fig = matplotlib.pyplot.figure(figsize=(25, 5))
    matplotlib.pyplot.xlim([0, len(chr_lengths) + 1])
    chr_cum_lens = [sum(chr_lengths[:i]) for i in range(len(chr_lengths) + 1)]
    matplotlib.pyplot.xticks(chr_cum_lens, [])

    ax = matplotlib.pyplot.gca()
    chr_names_coords = [chr_cum_lens[i + 1] - chr_lengths[i] / 2 for i in range(len(chr_lengths))]
    ax.xaxis.set_minor_locator(ticker.FixedLocator(chr_names_coords))
    ax.xaxis.set_minor_formatter(ticker.FixedFormatter(chr_short_names))

    chr_cum_len_by_chrom = dict(zip(chr_names, chr_cum_lens))
    nrm_xs, nrm_ys = [], []
    amp_xs, amp_ys, amp_gs = [], [], []
    del_xs, del_ys, del_gs = [], [], []
    with open(seq2c_tsv_fpath) as f:
        for i, l in enumerate(f):
            if i == 0:
                continue  # skip header
            fs = l.replace('\n', '').split('\t')
            sname, gname = fs[0], fs[1]
            if key_gene_names and gname not in key_gene_names:
                continue
            if sname != sample_name:
                continue

            sname, gname, chrom, start, end, length, log2r, sig, type_, amp_del, ab_seg, total_seg, \
                ab_log2r, log2r_diff, ab_seg_loc, ab_samples, ab_samples_pcnt = fs[:17]
            x = chr_cum_len_by_chrom[chrom] + (int(start) + int(end)) / 2

            if not ab_log2r or type_ == 'BP':  # breakpoint, meaning part of the exon is not amplified
                nrm_xs.append(x)
                nrm_ys.append(float(log2r))
            if ab_log2r:
                y = float(ab_log2r)
                if amp_del == 'Amp':
                    amp_xs.append(x)
                    amp_ys.append(y)
                    amp_gs.append(gname)
                elif amp_del == 'Del':
                    del_xs.append(x)
                    del_ys.append(y)
                    del_gs.append(gname)
                else:
                    warn('Event is not Amp or Del, it\'s ' + amp_del)

    matplotlib.pyplot.scatter(nrm_xs, nrm_ys, marker='.', color='k', s=1)
    matplotlib.pyplot.scatter(amp_xs, amp_ys, marker='o', color='b', s=2)
    matplotlib.pyplot.scatter(del_xs, del_ys, marker='o', color='r', s=2)

    if len(amp_xs) <= 10 or len(amp_xs) + len(del_xs) < 40:
        for x, y, g in zip(amp_xs, amp_ys, amp_gs):
            ax.text(x, y, g, fontsize=9, color='g',
                    verticalalignment='center', horizontalalignment='center')
    if len(del_xs) <= 10 or len(amp_xs) + len(del_xs) < 40:
        for x, y, g in zip(del_xs, del_ys, del_gs):
            ax.text(x, y, g, fontsize=9, color='r',
                    verticalalignment='center', horizontalalignment='center')

    matplotlib.pyplot.ylim(ymax=max(chain(nrm_ys, amp_ys, del_ys, [2])) * 1.05,
                           ymin=min(chain(nrm_ys, amp_ys, del_ys, [-2])) * 1.05)
    matplotlib.pyplot.tick_params(axis='x', which='minor', bottom='off', top='off', labelbottom='on')

    info('Saving plot to ' + plot_fpath)
    matplotlib.pyplot.tight_layout()
    fig.savefig(plot_fpath, bbox_inches='tight')
    matplotlib.pyplot.close(fig)

    info('Done')
    info('-' * 70)
    return plot_fpath

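
# Sketch of the cumulative-offset math used above to lay every chromosome on a
# single x axis (toy lengths): a point at position p on chromosome c is drawn
# at offset(c) + p, where offset(c) sums the lengths of preceding chromosomes.
def _example_chrom_offsets():
    chr_lengths = [100, 80, 60]  # hypothetical chr1..chr3 lengths
    chr_cum_lens = [sum(chr_lengths[:i]) for i in range(len(chr_lengths) + 1)]
    # chr_cum_lens == [0, 100, 180, 240]
    offsets = dict(zip(['chr1', 'chr2', 'chr3'], chr_cum_lens))
    return offsets['chr2'] + (30 + 40) / 2  # midpoint of a chr2 segment -> 135
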
def get_args():
    info(' '.join(sys.argv))
    info()

    description = ('The program will filter the VarDict output after vcf2txt.pl to '
                   'candidate interpretable mutations, somatic or germline.')
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('-o', dest='output_file')
    parser.add_option('--o-all-transcripts', dest='all_transcripts_output_file')
    parser.add_option('--o-fm', dest='fm_output_file')
    parser.add_option('--o-reject', dest='rejected_output_file')
    parser.add_option('--cohort-freqs', dest='cohort_freqs_fpath')
    parser.add_option('--transcripts', dest='transcripts_fpath')
    parser.add_option('-D', '--min-depth', dest='filt_depth', type='int',
                      help='The minimum total depth')
    parser.add_option('-V', '--min-vd', dest='min_vd', type='int',
                      help='The minimum reads supporting variant')
    parser.add_option('--gmaf', dest='min_gmaf', type='float',
                      help='When the GMAF is greater than specified, it\'s considered a common SNP '
                           'and filtered out.')
    parser.add_option('-f', '--min-freq', dest='min_freq', type='float',
                      help='The minimum allele frequency for regular variants.')
    parser.add_option('-F', '--min-freq-hs', '--act-min-freq', dest='act_min_freq', type='float',
                      help='The minimum allele frequency for hotspot somatic mutations, typically '
                           'lower than -f. Default: 0.01 or half of -f, whichever is less')
    parser.add_option('-N', '--keep-utr-intronic', dest='keep_utr_intronic', action='store_true',
                      help='Keep all intronic and UTR in the output, but they will be set as "unknown".')
    parser.add_option('-p', '--platform', dest='platform',
                      help='The platform, such as WXS, WGS, RNA-Seq, VALIDATION, etc. No default. '
                           'Used for output in FM\'s format')
    parser.set_usage('Usage: ' + __file__ + ' vcf2txt_res_fpath [opts] -o output_fpath')

    (opts, args) = parser.parse_args()
    if len(args) < 1:
        critical('Provide the first argument - output from vcf2txt.pl')
    logger.is_debug = opts.debug

    vcf2txt_res_fpath = verify_file(args[0], is_critical=True)

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')
    check_genome_resources(cnf)

    if not cnf.output_file:
        critical('Please, specify the output fpath with -o')

    info()
    return cnf, vcf2txt_res_fpath

def sync_with_ngs_server(cnf, jira_url, project_name, sample_names, summary_report_fpath,
                         dataset_dirpath=None, bcbio_final_dirpath=None, jira_case=None):
    if is_us():
        loc = us
    elif is_uk():
        loc = uk
    elif is_local():
        loc = local
    elif is_sweden():
        loc = sweden
    else:
        return None

    html_report_url = None
    if any(p in realpath(bcbio_final_dirpath or dataset_dirpath)
           for p in loc.proper_path_should_contain):
        info('Location is ' + loc.loc_id + ', exposing reports to ' + loc.reports_dirpath)

        # retrieve JIRA info only if it was not already provided by the caller
        if jira_case is None and is_az() and jira_url:
            info()
            info('Getting info from JIRA...')
            jira_case = retrieve_jira_info(jira_url)

        proj_dirpath_on_server = _symlink_dirs(
            cnf=cnf, loc=loc, project_name=project_name,
            final_dirpath=bcbio_final_dirpath, dataset_dirpath=dataset_dirpath)

        if bcbio_final_dirpath:
            html_report_url = join(
                loc.report_url_base,                                   # http://ngs.usbod.astrazeneca.net/reports/
                relpath(proj_dirpath_on_server, loc.reports_dirpath),  # project_name/dataset/project_name
                relpath(summary_report_fpath, dirname(bcbio_final_dirpath)))  # final/2015_01_01_project/project.html
        elif dataset_dirpath:
            html_report_url = join(
                loc.report_url_base,
                relpath(proj_dirpath_on_server, loc.reports_dirpath),
                relpath(summary_report_fpath, dataset_dirpath))

        if verify_file(loc.csv_fpath, 'Project list'):
            write_to_csv_file(work_dir=cnf.work_dir, jira_case=jira_case,
                              project_list_fpath=loc.csv_fpath, country_id=loc.loc_id,
                              project_name=project_name, samples_num=len(sample_names),
                              analysis_dirpath=dirname(bcbio_final_dirpath) if bcbio_final_dirpath else None,
                              html_report_url=html_report_url)
    return html_report_url

def main(args):
    if len(args) < 2:
        sys.exit('Usage ' + __file__ + ' input.tsv bcbio.csv [dir_with_bams] [bina_dir]')

    inp_fpath = args[0]
    verify_file(inp_fpath, is_critical=True)

    out_fpath = args[1]
    verify_dir(dirname(adjust_path(out_fpath)), is_critical=True)

    bam_dirpath = None
    if len(args) > 2:
        bam_dirpath = args[2]
        verify_dir(adjust_path(bam_dirpath), is_critical=True)

    bina_dirpath = None
    if len(args) > 3:
        bina_dirpath = args[3]
        verify_dir(dirname(adjust_path(bina_dirpath)), is_critical=True)

    columns_names = 'study barcode disease disease_name sample_type sample_type_name analyte_type library_type center center_name platform platform_name assembly filename files_size checksum analysis_id aliquot_id participant_id sample_id tss_id sample_accession published uploaded modified state reason'

    samples_by_patient = defaultdict(list)
    delim = '\t'
    barcode_col = 1
    bam_col = 13
    is_tcga_tsv = True

    with open(inp_fpath) as fh:
        for i, l in enumerate(fh):
            if not l.strip():
                continue
            if i == 0:
                if len(l.split('\t')) == 27:
                    err('Interpreting as TCGA tsv')
                    if l.split('\t')[0] != 'TCGA':
                        continue  # skipping header
                else:
                    delim = None
                    for j, f in enumerate(l.split()):
                        if f.startswith('TCGA'):
                            barcode_col = j
                            err('barcode col is ' + str(j))
                        if f.endswith('bam'):
                            bam_col = j
                            err('bam col is ' + str(j))
                    is_tcga_tsv = False

            fs = l.split(delim)
            barcode = fs[barcode_col].split('-')  # TCGA-05-4244-01A-01D-1105-08

            sample = Sample()
            sample.bam = fs[bam_col]
            sample.bam_base_name = basename(os.path.splitext(fs[bam_col])[0])
            sample.description = fs[barcode_col]
            sample.patient = '-'.join(barcode[:3])
            if is_tcga_tsv:
                sample.reason = fs[26]

            sample_type = int(barcode[3][:2])
            if sample_type >= 20 or sample_type <= 0:
                continue
            sample.is_normal = 10 <= sample_type < 20
            sample.is_blood = sample_type in [3, 4, 9, 10]  # https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm

            if any(s.description == sample.description for s in samples_by_patient[sample.patient]):
                prev_sample = next(s for s in samples_by_patient[sample.patient]
                                   if s.description == sample.description)
                # Resolve duplicated samples by BAM version
                prev_version = get_bam_version(prev_sample.bam_base_name)
                version = get_bam_version(sample.bam_base_name)
                err('Duplicated sample: ' + sample.description + '  Resolving by version (' +
                    ' over '.join(map(str, sorted([prev_version, version])[::-1])) + ')')
                if version > prev_version:
                    samples_by_patient[sample.patient].remove(prev_sample)
                    samples_by_patient[sample.patient].append(sample)
            else:
                samples_by_patient[sample.patient].append(sample)

    batches = []
    final_samples = set()

    if bina_dirpath:
        safe_mkdir(bina_dirpath)

    for patient, patient_samples in samples_by_patient.iteritems():
        tumours = [s for s in patient_samples if not s.is_normal]
        normals = [s for s in patient_samples if s.is_normal]

        main_normal = None
        if len(normals) >= 1:
            if any(n.is_blood for n in normals):
                main_normal = next(n for n in normals if n.is_blood)
            else:
                main_normal = normals[0]
            if tumours:
                for n in normals[1:]:  # extra normals become their own single-sample batches
                    b = Batch(n.description + '-batch')
                    b.tumour = n
                    batches.append(b)

        for t in tumours:
            b = Batch(t.description + '-batch')
            b.tumour = t
            t.batches.add(b)
            final_samples.add(t)
            if main_normal:
                b.normal = main_normal
                main_normal.batches.add(b)
                final_samples.add(main_normal)
            batches.append(b)

        ##################
        ###### Bina ######
        if bina_dirpath:
            bina_patient_dirpath = join(bina_dirpath, patient)
            safe_mkdir(bina_patient_dirpath)
            normals_csv_fpath = join(bina_patient_dirpath, 'normals.csv')
            tumours_csv_fpath = join(bina_patient_dirpath, 'tumors.csv')

            if main_normal:
                with open(normals_csv_fpath, 'w') as f:
                    f.write('name,bam\n')
                    bam_fpath = join(bam_dirpath, main_normal.bam) if bam_dirpath else main_normal.bam
                    f.write(main_normal.description + ',' + bam_fpath + '\n')

            with open(tumours_csv_fpath, 'w') as f:
                f.write('name,bam\n')
                for t in tumours:
                    bam_fpath = join(bam_dirpath, t.bam) if bam_dirpath else t.bam
                    f.write(t.description + ',' + bam_fpath + '\n')

    if bina_dirpath:
        err('Saved bina CSVs to ' + bina_dirpath)

    ###########################
    ######## Bcbio CSV ########
    print 'bcbio_nextgen.py -w template bcbio.yaml', out_fpath,

    with open(out_fpath, 'w') as out:
        out.write('sample,description,batch,phenotype\n')
        for s in sorted(final_samples, key=lambda s: s.bam_base_name):
            out.write(','.join([s.bam_base_name, s.description,
                                ';'.join(sorted(b.name for b in s.batches)),
                                ('normal' if s.is_normal else 'tumor')]) + '\n')
            bam_fpath = join(bam_dirpath, s.bam) if bam_dirpath else s.bam
            if verify_bam(bam_fpath, is_critical=False):
                try:
                    bam = pysam.Samfile(bam_fpath, "rb")
                except ValueError:
                    err(traceback.format_exc())
                    err('Cannot read ' + bam_fpath)
                    err()
            else:
                print bam_fpath,

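
# Sketch of the TCGA barcode fields used above (the example barcode comes from
# the comment in the code): the 4th dash-separated field encodes the sample type.
def _example_barcode_parsing():
    barcode = 'TCGA-05-4244-01A-01D-1105-08'.split('-')
    patient = '-'.join(barcode[:3])    # 'TCGA-05-4244'
    sample_type = int(barcode[3][:2])  # 1 -> tumour ('01'); 10..19 would be normals
    is_normal = 10 <= sample_type < 20 # False here
    return patient, sample_type, is_normal
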