Example #1
def align(work_dir,
          sample_name,
          l_fpath,
          r_fpath,
          bwa,
          smb,
          bwa_prefix,
          dedup=True,
          threads=1):
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    bwa_cmdline = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | '
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | '
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    ).format(**locals())
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(
            **locals())
        run(dedup_cmdl,
            output_fpath=dedup_bam_fpath,
            stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    # samtools view -b -S -u - |
    # sambamba sort -N -t 8 -m 682M --tmpdir /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000-sorttmp-full
    # -o /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000.bam
    # /dev/stdin

    # if dedup:
    #     info()
    #     info('Calling SamBlaster to mark duplicates')
    #     markdup_sam_fpath = markdup_sam(sam_fpath, samblaster)
    #     if markdup_sam_fpath:
    #         sam_fpath = markdup_sam_fpath
    # info()

    # info('Converting to BAM')
    # cmdline = sambamba.get_executable() + ' view -t {threads} -S -f bam {sam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=bam_fpath, reuse=cfg.reuse_intermediate)
    #
    # info()
    # info('Sorting BAM')
    # prefix = splitext(sorted_bam_fpath)[0]
    # cmdline = sambamba.get_executable() + ' sort -t {threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False, reuse=cfg.reuse_intermediate)

    return bam_fpath
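A minimal usage sketch; every path, executable name, and the bwa index prefix below is hypothetical, and align is assumed to be importable together with its helpers:

# Hypothetical invocation; adjust executables and paths to your setup.
bam = align(work_dir='/tmp/work',
            sample_name='sample1',
            l_fpath='/data/sample1_R1.fastq.gz',
            r_fpath='/data/sample1_R2.fastq.gz',
            bwa='bwa',              # bwa executable
            smb='sambamba',         # sambamba executable
            bwa_prefix='/refs/hg19/bwa/hg19.fa',
            dedup=True,
            threads=8)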
Example #2
def cut(fpath, col_num, output_fpath=None):
    output_fpath = output_fpath or add_suffix(fpath, 'cut')
    if can_reuse(output_fpath, fpath):
        return output_fpath
    cmdline = 'cut -f{} {} > {}'.format(
        ','.join(map(str, range(1, col_num + 1))), fpath, output_fpath)
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath
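For illustration, a hypothetical call on a BED-like file; the exact output name depends on where add_suffix places the 'cut' suffix:

# Keep the first three columns of a (hypothetical) regions file.
trimmed = cut('/data/regions.bed', col_num=3)
# Generated shell command: cut -f1,2,3 /data/regions.bed > <output_fpath>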
Example #3
    def _test(self, name, genome, opts=None):
        os.chdir(self.results_dir)
        input_fname = genome + '.bed'
        output_fname = add_suffix(input_fname, 'anno')
        input_fpath = join(self.data_dir, input_fname)
        output_fpath = join(self.results_dir, output_fname)

        cmdl = ([self.script, input_fpath, '-o', output_fpath]
                + (opts or [])
                + ['--debug', '-g', genome])

        swap_output(output_fpath)

        info('-' * 100)
        check_call(cmdl)
        info('-' * 100)
        info('')

        self._check_file(output_fpath)
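A sketch of a concrete test method built on this helper; the test name, genome build, and the extra option are illustrative:

    def test_hg19(self):
        # Runs the script on <data_dir>/hg19.bed and verifies
        # <results_dir>/hg19.anno.bed; '--short' is a placeholder option.
        self._test('plain', 'hg19', opts=['--short'])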
Example #4
    def _make_target_bed(self,
                         bed_fpath,
                         work_dir,
                         output_dir,
                         is_debug,
                         padding=None,
                         fai_fpath=None,
                         genome=None,
                         reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath,
                                                    'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = (bed
                   .filter(lambda x: x.chrom and not any(
                       x.chrom.startswith(e)
                       for e in ['#', ' ', 'track', 'browser']))
                   .remove_invalid())
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir,
                                                   clean_target_bed_fpath,
                                                   'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(
                clean_target_bed_fpath,
                output_bed_fpath=sort_target_bed_fpath,
                fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir,
                                                      sort_target_bed_fpath,
                                                      'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting '
                          'overlapping genome features')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          reannotate=reannotate,
                                          only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(
            work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir,
                                  final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)

        self.capture_bed_fpath = add_suffix(
            join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
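A hypothetical call, assuming the method lives on a target-capture object and the directories already exist; all paths are illustrative:

target._make_target_bed('/data/panel.bed',
                        work_dir='/tmp/work',
                        output_dir='/tmp/out',
                        is_debug=False,
                        padding=200,
                        fai_fpath='/refs/hg19/hg19.fa.fai',  # samtools faidx index
                        genome='hg19',
                        reannotate=False)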
Example #5
def make_downsampled_fpath(work_dir, fastq_fpath):
    return join(work_dir, add_suffix(basename(fastq_fpath), 'subset'))
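For example (hypothetical input; the exact suffix placement depends on add_suffix):

# make_downsampled_fpath('/tmp/work', '/data/s1_R1.fastq.gz')
# -> something like '/tmp/work/s1_R1.subset.fastq.gz'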
Example #6
def main():
    description = '''
Usage:
    {} hg19 [db.gtf]
'''.format(__file__)

    options = [
        (['--debug'], dict(dest='debug', action='store_true', default=False)),
    ]
    parser = OptionParser(description=description)
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()
    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')
    logger.is_debug = opts.debug

    genome_name = args[0]

    if len(args) > 1:
        gtf_fpath = args[1]
    else:
        gtf_fpath = ebl.ensembl_gtf_fpath(genome_name)
    if not isfile(gtf_fpath):
        if not gtf_fpath.endswith('.gz'):
            gtf_fpath += '.gz'
    gtf_fpath = verify_file(gtf_fpath)
    debug('Reading the GTF database')
    db = gtf.get_gtf_db(gtf_fpath)

    debug('Reading biomart data')
    features_by_ens_id = read_biomart(genome_name)

    chroms = [c for c, l in ref.get_chrom_lengths(genome_name)]

    output_fpath = join(dirname(__file__), genome_name, 'ensembl.bed')
    unsorted_output_fpath = add_suffix(output_fpath, 'unsorted')
    debug('Processing features, writing to ' + unsorted_output_fpath)

    def _get(_rec, _key):
        val = _rec.attributes.get(_key)
        if val is None:
            return None
        assert len(val) == 1, (_key, str(val))
        return val[0]

    num_tx_not_in_biomart = 0
    num_tx_diff_gene_in_biomart = 0
    with open(unsorted_output_fpath, 'w') as out:
        out.write('\t'.join(ebl.BedCols.names[i]
                            for i in ebl.BedCols.cols[:-4]) + '\n')

        for rec in db.all_features(order_by=('seqid', 'start', 'end')):
            if rec.featuretype == 'gene': continue
            if rec.chrom not in chroms: continue
            if rec.end - rec.start < 0: continue

            tx_id = _get(rec, 'transcript_id')
            gname = _get(rec, 'gene_name')
            tx_biotype = _get(rec, 'transcript_biotype')
            if not tx_biotype: tx_biotype = _get(rec, 'gene_biotype')
            tsl = _get(rec, 'transcript_support_level')
            hugo_gene = None

            biomart_rec = features_by_ens_id.get(tx_id)
            if not biomart_rec:
                if rec.featuretype == 'transcript':
                    num_tx_not_in_biomart += 1
            else:
                bm_gname = biomart_rec['Associated Gene Name']
                bm_tx_biotype = biomart_rec['Transcript type']
                bm_tsl = biomart_rec.get('Transcript Support Level (TSL)')
                hugo_gene = biomart_rec['HGNC symbol']
                if bm_gname != gname:
                    if rec.featuretype == 'transcript':
                        num_tx_diff_gene_in_biomart += 1
                    continue
                tx_biotype = bm_tx_biotype
                tsl = bm_tsl.split()[0].replace('tsl', '') if bm_tsl else None

            fs = [None] * len(ebl.BedCols.cols[:-3])
            if not rec.chrom.startswith('chr'):
                rec.chrom = 'chr' + rec.chrom.replace('MT', 'M')
            fs[:6] = [
                rec.chrom,
                str(rec.start - 1),
                str(rec.end), gname,
                rec.attributes.get('exon_number', ['.'])[0], rec.strand
            ]
            fs[ebl.BedCols.FEATURE] = rec.featuretype or '.'
            fs[ebl.BedCols.BIOTYPE] = tx_biotype or '.'
            fs[ebl.BedCols.ENSEMBL_ID] = tx_id or '.'
            # fs[ebl.BedCols.REFSEQ_ID] = refseq_id or '.'
            # fs[ebl.BedCols.IS_CANONICAL] = 'canonical' if refseq_id in canonical_transcripts_ids else ''
            fs[ebl.BedCols.TSL] = tsl or '.'
            fs[ebl.BedCols.HUGO] = hugo_gene or '.'
            # fs[ebl.BedCols.names[ensembl.BedCols.GC]] = gc
            # Guard against columns that are never assigned (e.g. the
            # commented-out REFSEQ_ID/IS_CANONICAL slots stay None):
            out.write('\t'.join(f if f is not None else '.' for f in fs) + '\n')

    if num_tx_not_in_biomart:
        warn(str(num_tx_not_in_biomart) + ' transcripts not found in biomart')
    if num_tx_diff_gene_in_biomart:
        warn(
            str(num_tx_diff_gene_in_biomart) +
            ' transcripts have a different gene name in biomart')

    debug('Sorting results')
    sort_bed(unsorted_output_fpath,
             output_fpath,
             fai_fpath=ref.get_fai(genome_name),
             genome=genome_name)
    os.remove(unsorted_output_fpath)
    bgzip_and_tabix(output_fpath)
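Assuming the script is saved as make_ensembl_bed.py (name illustrative), it would be run as:

# python make_ensembl_bed.py hg19 /refs/Homo_sapiens.GRCh37.87.gtf.gz --debug
# Writes <script_dir>/hg19/ensembl.bed.gz plus a tabix .tbi index.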
Example #7
def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best',
                                   metric_storage=metric_storage)
    report.add_record(
        'Sample', 'contains best values from all samples: ' +
        ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [
        s.targqc_region_tsv for s in samples
        if verify_file(s.targqc_region_tsv)
    ]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    first_col = 0
    # Read header lines in lockstep from all files: skip '##' comments and
    # stop once the '#'-prefixed column-header row has been seen.
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([
                not l.startswith('#')
                and ('Whole-Gene' in l or 'Gene-Exon' in l)
                for l in lines_for_each_sample
        ]):
            shared_fields = lines_for_each_sample[0].split(
                '\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')

                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t),
                               select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    # basename() keeps the reports inside output_dir; join() would silently
    # discard output_dir if handed an absolute path.
    gene_report_basename = add_suffix(basename(samples[0].targqc_region_tsv), 'best')
    txt_rep_fpath = report.save_txt(
        join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(
        join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) +
         ') saved into:')
    info('  ' + txt_rep_fpath)

    return txt_rep_fpath
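A usage sketch with hypothetical inputs; each sample object is assumed to expose .name and a .targqc_region_tsv path to its per-region report:

best_txt = _save_best_details_for_each_gene(
    depth_threshs=[10, 50, 100],       # illustrative coverage thresholds
    samples=my_samples,                # hypothetical list of sample objects
    output_dir='/tmp/targqc_summary')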