def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    """Align a pair of read files with bwa and return the resulting BAM path.

    ``bwa mem`` output is piped through ``smb view`` and ``smb sort`` in a
    single shell pipeline. When ``dedup`` is true, ``smb markdup`` is run on
    the sorted BAM and its verified output is renamed over the sorted BAM.
    The BAM is indexed before returning. If ``can_reuse`` accepts an existing
    BAM, it is returned immediately and nothing is run.

    :param work_dir: directory holding the BAM and the sort temp directory
    :param sample_name: not used by this function
    :param l_fpath: first reads file given to ``bwa mem``
    :param r_fpath: second reads file given to ``bwa mem``
    :param bwa: bwa executable
    :param smb: executable used for ``view``/``sort``/``markdup``
        (presumably sambamba -- confirm with the caller)
    :param bwa_prefix: reference prefix given to ``bwa mem``
    :param dedup: whether to run ``markdup`` after sorting
    :param threads: value passed as ``-t`` to every tool
    :return: path of the BAM file
    """
    info('Running bwa to align reads...')

    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    # bwa -> BAM conversion -> coordinate sort, all connected through pipes.
    pipeline_template = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | '
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | '
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    )
    align_cmdl = pipeline_template.format(
        bwa=bwa,
        smb=smb,
        threads=threads,
        bwa_prefix=bwa_prefix,
        l_fpath=l_fpath,
        r_fpath=r_fpath,
        tmp_dirpath=tmp_dirpath,
        bam_fpath=bam_fpath,
    )
    run(align_cmdl, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        marked_bam_fpath = add_suffix(bam_fpath, 'dedup')
        markdup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(
            smb=smb,
            threads=threads,
            bam_fpath=bam_fpath,
            dedup_bam_fpath=marked_bam_fpath,
        )
        run(markdup_cmdl, output_fpath=marked_bam_fpath, stdout_to_outputfile=False)
        verify_bam(marked_bam_fpath)
        # The duplicate-marked file takes over the name of the sorted BAM.
        os.rename(marked_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)
    return bam_fpath
def cut(fpath, col_num, output_fpath=None):
    """Keep the first ``col_num`` tab-separated columns of ``fpath``.

    The work is done by the ``cut`` command-line tool, with its output
    redirected to ``output_fpath``. If ``can_reuse`` accepts an existing
    output file, the command is not run.

    :param fpath: input file
    :param col_num: number of leading columns to keep
    :param output_fpath: destination; defaults to ``fpath`` with a 'cut' suffix
    :return: path of the output file
    """
    if not output_fpath:
        output_fpath = add_suffix(fpath, 'cut')

    if can_reuse(output_fpath, fpath):
        return output_fpath

    # Explicit 1-based field list: "1,2,...,col_num".
    field_list = ','.join(str(idx) for idx in range(1, col_num + 1))
    cmdline = ' '.join(['cut', '-f' + field_list, fpath, '>', output_fpath])
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    """Align a pair of read files with bwa and return the resulting BAM path.

    ``bwa mem`` output is piped through ``smb view`` and ``smb sort`` in a
    single shell pipeline. When ``dedup`` is true, ``smb markdup`` is run on
    the sorted BAM and its verified output is renamed over the sorted BAM.
    The BAM is indexed before returning. If ``can_reuse`` accepts an existing
    BAM, it is returned immediately and nothing is run.

    :param work_dir: directory holding the BAM and the sort temp directory
    :param sample_name: not used by this function
    :param l_fpath: first reads file given to ``bwa mem``
    :param r_fpath: second reads file given to ``bwa mem``
    :param bwa: bwa executable
    :param smb: executable used for ``view``/``sort``/``markdup``
        (presumably sambamba -- confirm with the caller)
    :param bwa_prefix: reference prefix given to ``bwa mem``
    :param dedup: whether to run ``markdup`` after sorting
    :param threads: value passed as ``-t`` to every tool
    :return: path of the BAM file
    """
    info('Running bwa to align reads...')

    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    # bwa -> BAM conversion -> coordinate sort, all connected through pipes.
    pipeline_template = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | '
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | '
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    )
    align_cmdl = pipeline_template.format(
        bwa=bwa,
        smb=smb,
        threads=threads,
        bwa_prefix=bwa_prefix,
        l_fpath=l_fpath,
        r_fpath=r_fpath,
        tmp_dirpath=tmp_dirpath,
        bam_fpath=bam_fpath,
    )
    run(align_cmdl, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        marked_bam_fpath = add_suffix(bam_fpath, 'dedup')
        markdup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(
            smb=smb,
            threads=threads,
            bam_fpath=bam_fpath,
            dedup_bam_fpath=marked_bam_fpath,
        )
        run(markdup_cmdl, output_fpath=marked_bam_fpath, stdout_to_outputfile=False)
        verify_bam(marked_bam_fpath)
        # The duplicate-marked file takes over the name of the sorted BAM.
        os.rename(marked_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)
    return bam_fpath
def _test(self, name, genome, opts=None):
    """Run the script under test on ``<genome>.bed`` and check its output.

    Changes the working directory to ``self.results_dir``, invokes
    ``self.script`` on the BED file from ``self.data_dir`` with ``--debug``
    and ``-g <genome>``, then passes the produced file to
    ``self._check_file``.

    :param name: not used by this method
    :param genome: genome name; also the stem of the input BED file name
    :param opts: optional list of extra command-line arguments
    """
    os.chdir(self.results_dir)

    bed_fname = genome + '.bed'
    anno_fname = add_suffix(bed_fname, 'anno')
    input_fpath = join(self.data_dir, bed_fname)
    output_fpath = join(self.results_dir, anno_fname)

    extra_opts = opts or []
    cmdl = [self.script, input_fpath, '-o', output_fpath] + extra_opts + ['--debug', '-g', genome]

    swap_output(output_fpath)

    # Frame the child process output with separator lines in the log.
    separator = '-' * 100
    info(separator)
    check_call(cmdl)
    info(separator)
    info('')

    self._check_file(output_fpath)
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug, padding=None,
                     fai_fpath=None, genome=None, reannotate=False):
    """Clean, sort and (for supported genomes) annotate the target BED file.

    Each stage writes an intermediate file and is skipped when ``can_reuse``
    accepts the existing one. On completion the method sets ``self.bed_fpath``,
    ``self.bed``, ``self.capture_bed_fpath``, ``self.gene_keys_set``,
    ``self.gene_keys_list`` and ``self.regions_num``, then builds the qualimap
    BED and, if ``padding`` is truthy, the padded BED.

    :param bed_fpath: input target BED file
    :param work_dir: directory for intermediate files
    :param output_dir: directory that receives the capture BED
    :param is_debug: not used by this method
    :param padding: forwarded to ``_make_padded_bed`` when truthy
    :param fai_fpath: forwarded to ``sort_bed`` and ``_make_padded_bed``
    :param genome: annotation happens only if this is in ``ebl.SUPPORTED_GENOMES``
    :param reannotate: forces the annotating call of ``overlap_with_features``
    """
    # Stage 1: trim to at most 4 columns, drop header-like and invalid records.
    cleaned_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(cleaned_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        target = BedTool(bed_fpath)
        if target.field_count() > 4:
            target = target.cut(range(4))

        def _is_region(interval):
            return interval.chrom and not interval.chrom.startswith(('#', ' ', 'track', 'browser'))

        target = target.filter(_is_region).remove_invalid()
        with file_transaction(work_dir, cleaned_fpath) as tx:
            target.saveas(tx)
        debug('Saved to ' + cleaned_fpath)
        verify_file(cleaned_fpath, is_critical=True)

    # Stage 2: sort.
    sorted_fpath = intermediate_fname(work_dir, cleaned_fpath, 'sorted')
    if not can_reuse(sorted_fpath, cleaned_fpath):
        debug()
        debug('Sorting target BED file...')
        sorted_fpath = sort_bed(cleaned_fpath, output_bed_fpath=sorted_fpath, fai_fpath=fai_fpath)
        debug('Saved to ' + sorted_fpath)
        verify_file(sorted_fpath, is_critical=True)

    # Stage 3: overlap with genome features, only for supported genomes.
    if genome not in ebl.SUPPORTED_GENOMES:
        annotated_fpath = sorted_fpath
    else:
        annotated_fpath = intermediate_fname(work_dir, sorted_fpath, 'ann_plus_features')
        if not can_reuse(annotated_fpath, sorted_fpath):
            debug()
            overlap_kwargs = dict(work_dir=work_dir, genome=genome, extended=True, only_canonical=True)
            if BedTool(sorted_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                # Only this branch forwards the reannotate flag.
                overlap_kwargs['reannotate'] = reannotate
            else:
                debug('Overlapping with genomic features:')
            overlap_with_features(sorted_fpath, annotated_fpath, **overlap_kwargs)
            debug('Saved to ' + annotated_fpath)
            verify_file(annotated_fpath, is_critical=True)

    # Stage 4: drop records that are invalid after annotation.
    final_fpath = intermediate_fname(work_dir, annotated_fpath, 'clean')
    if not can_reuse(final_fpath, annotated_fpath):
        valid_regions = BedTool(annotated_fpath).remove_invalid()
        with file_transaction(work_dir, final_fpath) as tx:
            valid_regions.saveas(tx)
        verify_file(final_fpath, is_critical=True)

    self.bed_fpath = final_fpath
    self.bed = BedTool(self.bed_fpath)

    capture_base_fpath = join(output_dir, basename(bed_fpath))
    self.capture_bed_fpath = add_suffix(capture_base_fpath, 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    # Gene keys are read from the original input BED, not the processed one.
    self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def make_downsampled_fpath(work_dir, fastq_fpath):
    """Return the path in ``work_dir`` for the 'subset' variant of ``fastq_fpath``.

    Only the base name of ``fastq_fpath`` is used; the 'subset' suffix is
    applied by ``add_suffix``.
    """
    fastq_fname = basename(fastq_fpath)
    subset_fname = add_suffix(fastq_fname, 'subset')
    return join(work_dir, subset_fname)
def main():
    """Generate ``<genome>/ensembl.bed`` next to this script from a GTF database.

    Command line: ``GENOME [GTF] [--debug]``. When no GTF path is given it is
    obtained from ``ebl.ensembl_gtf_fpath``. Every non-gene GTF feature on a
    known chromosome is written as one tab-separated BED line, with biotype,
    TSL and HUGO symbol taken from biomart when the transcript is found
    there. Records whose biomart gene name differs from the GTF gene name are
    skipped. The result is sorted with ``sort_bed`` and passed to
    ``bgzip_and_tabix``.
    """
    # Built by concatenation so the script path really is substituted into
    # the help text (inside one quoted literal it would be printed verbatim).
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')

    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    # Fall back to the gzipped file name when the plain path does not exist.
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    # A set: it is only used for a membership test on every GTF record.
    chroms = set(c for c, _ in ref.get_chrom_lengths(genome_name))

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        """Return the single value of attribute ``_key``, or None if absent."""
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        # NOTE(review): the header is built from cols[:-4] while every row has
        # len(cols[:-3]) fields -- confirm that the one-column difference is
        # intended.
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            # NOTE(review): this test uses the chromosome name as stored in
            # the GTF db, before the 'chr' prefix is added further down --
            # confirm that `chroms` uses the same naming.
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                # Count each missing transcript once, not once per exon/CDS.
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                # Biomart values take precedence over the GTF attributes.
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            # GTF starts are 1-based, BED starts are 0-based.
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end),
                gname,
                rec.attributes.get('exon_number', ['.'])[0],
                rec.strand,
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    """Merge per-sample region reports into one report of best values.

    The per-region TSV files of all samples are read in lockstep, one line
    from each file per step. For every step where all lines are
    'Whole-Gene' or 'Gene-Exon' data lines, a row is added whose coordinate
    and annotation fields come from the first sample and whose metrics are
    chosen by ``select_best`` across samples. The report is saved as text
    and TSV.

    :param depth_threshs: depth thresholds; one percentage column is read
        per threshold
    :param samples: objects with ``name`` and ``targqc_region_tsv`` attributes
    :param output_dir: directory joined with the report base name
    :return: path of the saved text report, or None if no sample has a
        verified per-region TSV
    """
    metric_storage = get_detailed_metric_storage(depth_threshs)
    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record(
        'Sample',
        'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = []
    try:
        # Opened one at a time inside the try block so that files already
        # opened are closed even if a later open() or any parsing step fails.
        for fpath in fpaths:
            open_tsv_files.append(open(fpath))

        # Skip '##' lines up to the '#' header line. A header starting with
        # '#Sample' shifts every field index by one.
        first_col = 0
        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            if not all(lines_for_each_sample):
                break
            first_line = lines_for_each_sample[0]
            if first_line.startswith('##'):
                continue
            elif first_line.startswith('#'):
                if first_line.startswith('#Sample'):
                    first_col = 1
                break

        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            # Stop as soon as any of the files is exhausted.
            if not all(lines_for_each_sample):
                break

            if all([not line.startswith('#') and ('Whole-Gene' in line or 'Gene-Exon' in line)
                    for line in lines_for_each_sample]):
                shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col + 9]
                reg = report.add_row()
                reg.add_record('Chr', get_val(shared_fields[0]))
                reg.add_record('Start', get_int_val(shared_fields[1]))
                reg.add_record('End', get_int_val(shared_fields[2]))
                reg.add_record('Size', get_int_val(shared_fields[3]))
                reg.add_record('Gene', get_val(shared_fields[4]))
                reg.add_record('Strand', get_val(shared_fields[5]))
                reg.add_record('Feature', get_val(shared_fields[6]))
                reg.add_record('Biotype', get_val(shared_fields[7]))
                reg.add_record('Transcript', get_val(shared_fields[8]))

                min_depths, ave_depths, stddevs, withins = ([], [], [], [])
                percents_by_threshs = {t: [] for t in depth_threshs}

                for line in lines_for_each_sample:
                    fs = line.split('\t')
                    min_depths.append(get_int_val(fs[first_col + 9]))
                    ave_depths.append(get_float_val(fs[first_col + 10]))
                    stddevs.append(get_float_val(fs[first_col + 11]))
                    withins.append(get_float_val(fs[first_col + 12]))
                    for t, value in zip(depth_threshs, fs[first_col + 13:]):
                        percents_by_threshs[t].append(get_float_val(value))

                # counting bests
                reg.add_record('Min depth', select_best(min_depths))
                reg.add_record('Ave depth', select_best(ave_depths))
                reg.add_record('Std dev', select_best(stddevs, max))
                reg.add_record('W/n 20% of median depth', select_best(withins))
                for t in depth_threshs:
                    reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

                total_regions += 1
    finally:
        for tsv_file in open_tsv_files:
            tsv_file.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))

    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info(' ' + txt_rep_fpath)

    return txt_rep_fpath
def main():
    """Generate ``<genome>/ensembl.bed`` next to this script from a GTF database.

    Command line: ``GENOME [GTF] [--debug]``. When no GTF path is given it is
    obtained from ``ebl.ensembl_gtf_fpath``. Every non-gene GTF feature on a
    known chromosome is written as one tab-separated BED line, with biotype,
    TSL and HUGO symbol taken from biomart when the transcript is found
    there. Records whose biomart gene name differs from the GTF gene name are
    skipped. The result is sorted with ``sort_bed`` and passed to
    ``bgzip_and_tabix``.
    """
    # Built by concatenation so the script path really is substituted into
    # the help text (inside one quoted literal it would be printed verbatim).
    description = 'Usage: ' + __file__ + ' hg19 [db.gtf]'

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if not args:
        parser.exit(1, 'Please provide genome name as the first argument')

    logger.is_debug = opts.debug

    genome_name = args[0]
    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    # Fall back to the gzipped file name when the plain path does not exist.
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)

    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    # A set: it is only used for a membership test on every GTF record.
    chroms = set(c for c, _ in ref.get_chrom_lengths(genome_name))

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        """Return the single value of attribute ``_key``, or None if absent."""
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        # NOTE(review): the header is built from cols[:-4] while every row has
        # len(cols[:-3]) fields -- confirm that the one-column difference is
        # intended.
        out.write('\t'.join(ebl.BedCols.names[i] for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene':
                continue
            # NOTE(review): this test uses the chromosome name as stored in
            # the GTF db, before the 'chr' prefix is added further down --
            # confirm that `chroms` uses the same naming.
            if rec.chrom not in chroms:
                continue
            if rec.end - rec.start < 0:
                continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype:
                tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                # Count each missing transcript once, not once per exon/CDS.
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                # Biomart values take precedence over the GTF attributes.
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            # GTF starts are 1-based, BED starts are 0-based.
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end),
                gname,
                rec.attributes.get('exon_number', ['.'])[0],
                rec.strand,
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            out.write('\t'.join(fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(str(num_tx_diff_gene_in_biomart) + ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath, output_fpath, fai_fpath=ref.get_fai(genome_name), genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug, padding=None,
                     fai_fpath=None, genome=None, reannotate=False):
    """Clean, sort and (for supported genomes) annotate the target BED file.

    Each stage writes an intermediate file and is skipped when ``can_reuse``
    accepts the existing one. On completion the method sets ``self.bed_fpath``,
    ``self.bed``, ``self.capture_bed_fpath``, ``self.gene_keys_set``,
    ``self.gene_keys_list`` and ``self.regions_num``, then builds the qualimap
    BED and, if ``padding`` is truthy, the padded BED.

    :param bed_fpath: input target BED file
    :param work_dir: directory for intermediate files
    :param output_dir: directory that receives the capture BED
    :param is_debug: not used by this method
    :param padding: forwarded to ``_make_padded_bed`` when truthy
    :param fai_fpath: forwarded to ``sort_bed`` and ``_make_padded_bed``
    :param genome: annotation happens only if this is in ``ebl.SUPPORTED_GENOMES``
    :param reannotate: forces the annotating call of ``overlap_with_features``
    """
    # Stage 1: trim to at most 4 columns, drop header-like and invalid records.
    cleaned_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(cleaned_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        target = BedTool(bed_fpath)
        if target.field_count() > 4:
            target = target.cut(range(4))

        def _is_region(interval):
            return interval.chrom and not interval.chrom.startswith(('#', ' ', 'track', 'browser'))

        target = target.filter(_is_region).remove_invalid()
        with file_transaction(work_dir, cleaned_fpath) as tx:
            target.saveas(tx)
        debug('Saved to ' + cleaned_fpath)
        verify_file(cleaned_fpath, is_critical=True)

    # Stage 2: sort.
    sorted_fpath = intermediate_fname(work_dir, cleaned_fpath, 'sorted')
    if not can_reuse(sorted_fpath, cleaned_fpath):
        debug()
        debug('Sorting target BED file...')
        sorted_fpath = sort_bed(cleaned_fpath, output_bed_fpath=sorted_fpath, fai_fpath=fai_fpath)
        debug('Saved to ' + sorted_fpath)
        verify_file(sorted_fpath, is_critical=True)

    # Stage 3: overlap with genome features, only for supported genomes.
    if genome not in ebl.SUPPORTED_GENOMES:
        annotated_fpath = sorted_fpath
    else:
        annotated_fpath = intermediate_fname(work_dir, sorted_fpath, 'ann_plus_features')
        if not can_reuse(annotated_fpath, sorted_fpath):
            debug()
            overlap_kwargs = dict(work_dir=work_dir, genome=genome, extended=True, only_canonical=True)
            if BedTool(sorted_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                # Only this branch forwards the reannotate flag.
                overlap_kwargs['reannotate'] = reannotate
            else:
                debug('Overlapping with genomic features:')
            overlap_with_features(sorted_fpath, annotated_fpath, **overlap_kwargs)
            debug('Saved to ' + annotated_fpath)
            verify_file(annotated_fpath, is_critical=True)

    # Stage 4: drop records that are invalid after annotation.
    final_fpath = intermediate_fname(work_dir, annotated_fpath, 'clean')
    if not can_reuse(final_fpath, annotated_fpath):
        valid_regions = BedTool(annotated_fpath).remove_invalid()
        with file_transaction(work_dir, final_fpath) as tx:
            valid_regions.saveas(tx)
        verify_file(final_fpath, is_critical=True)

    self.bed_fpath = final_fpath
    self.bed = BedTool(self.bed_fpath)

    capture_base_fpath = join(output_dir, basename(bed_fpath))
    self.capture_bed_fpath = add_suffix(capture_base_fpath, 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    # Gene keys are read from the original input BED, not the processed one.
    self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    """Merge per-sample region reports into one report of best values.

    The per-region TSV files of all samples are read in lockstep, one line
    from each file per step. For every step where all lines are
    'Whole-Gene' or 'Gene-Exon' data lines, a row is added whose coordinate
    and annotation fields come from the first sample and whose metrics are
    chosen by ``select_best`` across samples. The report is saved as text
    and TSV.

    :param depth_threshs: depth thresholds; one percentage column is read
        per threshold
    :param samples: objects with ``name`` and ``targqc_region_tsv`` attributes
    :param output_dir: directory joined with the report base name
    :return: path of the saved text report, or None if no sample has a
        verified per-region TSV
    """
    metric_storage = get_detailed_metric_storage(depth_threshs)
    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record(
        'Sample',
        'contains best values from all samples: ' + ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = []
    try:
        # Opened one at a time inside the try block so that files already
        # opened are closed even if a later open() or any parsing step fails.
        for fpath in fpaths:
            open_tsv_files.append(open(fpath))

        # Skip '##' lines up to the '#' header line. A header starting with
        # '#Sample' shifts every field index by one.
        first_col = 0
        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            if not all(lines_for_each_sample):
                break
            first_line = lines_for_each_sample[0]
            if first_line.startswith('##'):
                continue
            elif first_line.startswith('#'):
                if first_line.startswith('#Sample'):
                    first_col = 1
                break

        while True:
            lines_for_each_sample = [next(f, None) for f in open_tsv_files]
            # Stop as soon as any of the files is exhausted.
            if not all(lines_for_each_sample):
                break

            if all([not line.startswith('#') and ('Whole-Gene' in line or 'Gene-Exon' in line)
                    for line in lines_for_each_sample]):
                shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col + 9]
                reg = report.add_row()
                reg.add_record('Chr', get_val(shared_fields[0]))
                reg.add_record('Start', get_int_val(shared_fields[1]))
                reg.add_record('End', get_int_val(shared_fields[2]))
                reg.add_record('Size', get_int_val(shared_fields[3]))
                reg.add_record('Gene', get_val(shared_fields[4]))
                reg.add_record('Strand', get_val(shared_fields[5]))
                reg.add_record('Feature', get_val(shared_fields[6]))
                reg.add_record('Biotype', get_val(shared_fields[7]))
                reg.add_record('Transcript', get_val(shared_fields[8]))

                min_depths, ave_depths, stddevs, withins = ([], [], [], [])
                percents_by_threshs = {t: [] for t in depth_threshs}

                for line in lines_for_each_sample:
                    fs = line.split('\t')
                    min_depths.append(get_int_val(fs[first_col + 9]))
                    ave_depths.append(get_float_val(fs[first_col + 10]))
                    stddevs.append(get_float_val(fs[first_col + 11]))
                    withins.append(get_float_val(fs[first_col + 12]))
                    for t, value in zip(depth_threshs, fs[first_col + 13:]):
                        percents_by_threshs[t].append(get_float_val(value))

                # counting bests
                reg.add_record('Min depth', select_best(min_depths))
                reg.add_record('Ave depth', select_best(ave_depths))
                reg.add_record('Std dev', select_best(stddevs, max))
                reg.add_record('W/n 20% of median depth', select_best(withins))
                for t in depth_threshs:
                    reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

                total_regions += 1
    finally:
        for tsv_file in open_tsv_files:
            tsv_file.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))

    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info(' ' + txt_rep_fpath)

    return txt_rep_fpath