Ejemplo n.º 1
0
def _correct_qualimap_insert_size_histogram(work_dir, samples):
    """Overwrite each sample's Qualimap insert size histogram with Picard's.

    For every sample:

    * The Qualimap v.1 data directory (the histogram's directory with
      ``raw_data_qualimapReport`` replaced by ``raw_data``) is moved to the
      Qualimap v.2 location when the latter is missing, and deleted when
      both exist. Samples with neither directory are skipped.
    * Unless the Qualimap histogram already verifies and
      ``tc.reuse_intermediate`` is set, the tail of the Picard histogram
      file is written over the Qualimap histogram through a file
      transaction rooted at ``work_dir``.
    """
    for sample in samples:
        v2_dir = dirname(sample.qualimap_ins_size_hist_fpath)
        v1_dir = v2_dir.replace('raw_data_qualimapReport', 'raw_data')

        if exists(v1_dir):
            if exists(v2_dir):
                shutil.rmtree(v1_dir)
            else:
                shutil.move(v1_dir, v2_dir)
        elif not exists(v2_dir):
            # No data from either Qualimap v.1 or Qualimap v.2.
            continue

        # An existing Qualimap histogram is kept when intermediates are reused.
        if verify_file(sample.qualimap_ins_size_hist_fpath,
                       silent=True) and tc.reuse_intermediate:
            continue
        if not verify_file(sample.picard_ins_size_hist_txt_fpath):
            continue

        with open(sample.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
            # Consume everything up to and including the '## HISTOGRAM' line
            # plus the single line after it (presumably the column header).
            for line in picard_f:
                if line.startswith('## HISTOGRAM'):
                    next(picard_f, None)
                    break

            # Whatever is left in the Picard file becomes the new histogram.
            with file_transaction(
                    work_dir, sample.qualimap_ins_size_hist_fpath) as tx:
                with open(tx, 'w') as qualimap_f:
                    qualimap_f.writelines(picard_f)
Ejemplo n.º 2
0
def combined_regional_reports(work_dir, output_dir, samples):
    """Concatenate the per-sample regional TSV reports into a single TSV.

    The combined file is written to ``output_dir`` under the base name of
    the first sample's report, through a file transaction rooted at
    ``work_dir``. The first line of the first usable report is emitted once,
    prefixed with a ``sample`` column; every other line of every usable
    report is prefixed with the sample name. The first line of each later
    report is dropped.

    Returns the path of the combined report, or ``(None, None)`` when no
    sample has a verifiable report.
    """
    has_any_report = any(
        verify_file(s.targqc_region_tsv, silent=True) for s in samples)
    if not has_any_report:
        # NOTE(review): this early exit returns a 2-tuple while the normal
        # path returns a single string - confirm what callers expect.
        return None, None

    tsv_region_rep_fpath = join(output_dir,
                                basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)
    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            is_first_report = True
            for sample in samples:
                if not (sample.targqc_region_tsv
                        and verify_file(sample.targqc_region_tsv)):
                    continue
                with open(sample.targqc_region_tsv) as tsv_in:
                    first_line = next(tsv_in, None)
                    # Only the first usable report contributes its first line.
                    if first_line is not None and is_first_report:
                        tsv_out.write('sample\t' + first_line)
                    for row in tsv_in:
                        tsv_out.write(sample.name + '\t' + row)
                is_first_report = False

    return tsv_region_rep_fpath
Ejemplo n.º 3
0
    def _make_wgs_regions_file(self, work_dir, genome=None):
        """Write a BED file with the features of one transcript per gene.

        The file is ``targqc_features_to_report.bed`` inside ``work_dir``;
        its path is stored in ``self.wgs_bed_fpath`` and returned. When
        ``can_reuse`` accepts the existing file against the Ensembl GTF
        path, nothing is regenerated.

        All non-gene features are grouped by gene and transcript ID. For
        each gene the transcript that sorts first under
        ``tx_priority_sort_key`` is chosen, and that transcript's records
        are written in the order given by ``get_sort_key(chr_order)``.
        Genes without any ``transcript`` record are skipped.
        """
        self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
        if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
            return self.wgs_bed_fpath

        genome_build = genome or cfg.genome
        chr_order = reference_data.get_chrom_order(genome_build)

        records_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
        all_features = ebl.get_all_features(genome_build, high_confidence=True)

        debug('Select best transcript to report')
        for feature in all_features:
            if feature[ebl.BedCols.FEATURE] == 'gene':
                continue
            gene_name = feature[ebl.BedCols.GENE]
            transcript_id = feature[ebl.BedCols.ENSEMBL_ID]
            records_by_tx_by_gene[gene_name][transcript_id].append(feature.fields)

        with file_transaction(work_dir, self.wgs_bed_fpath) as tx_fpath:
            with open(tx_fpath, 'w') as out:
                for _gene, records_by_tx in records_by_tx_by_gene.items():
                    transcript_records = [
                        rec
                        for recs in records_by_tx.values()
                        for rec in recs
                        if rec[ebl.BedCols.FEATURE] == 'transcript'
                    ]
                    if not transcript_records:
                        continue
                    # sorted() is stable, so ties keep their original order.
                    ranked = sorted(transcript_records, key=tx_priority_sort_key)
                    best_tx_id = ranked[0][ebl.BedCols.ENSEMBL_ID]
                    ordered = sorted(records_by_tx[best_tx_id],
                                     key=get_sort_key(chr_order))
                    for rec in ordered:
                        out.write('\t'.join(str(f) for f in rec) + '\n')
        return self.wgs_bed_fpath
Ejemplo n.º 4
0
def combined_regional_reports(work_dir, output_dir, samples):
    """Merge every sample's regional TSV report into one combined TSV.

    The output lives in ``output_dir`` and is named after the first
    sample's report; it is written via a file transaction using
    ``work_dir``. The first line of the first usable report is written once
    with an extra leading ``sample`` column, and all remaining lines of all
    usable reports get the sample name prepended. First lines of later
    reports are discarded.

    Returns the combined report path, or ``(None, None)`` if none of the
    samples has a verifiable report.
    """
    if not any(verify_file(s.targqc_region_tsv, silent=True)
               for s in samples):
        # NOTE(review): a 2-tuple is returned here but a single string on
        # the normal path - confirm what callers expect.
        return None, None

    combined_name = basename(samples[0].targqc_region_tsv)
    tsv_region_rep_fpath = join(output_dir, combined_name)
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)
    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            header_still_needed = True
            for sample in samples:
                report_fpath = sample.targqc_region_tsv
                if not (report_fpath and verify_file(report_fpath)):
                    continue
                with open(sample.targqc_region_tsv) as tsv_in:
                    first_line = next(tsv_in, None)
                    # Keep the first line of the first usable report only.
                    if header_still_needed and first_line is not None:
                        tsv_out.write('sample\t' + first_line)
                    for data_line in tsv_in:
                        tsv_out.write(sample.name + '\t' + data_line)
                header_still_needed = False

    return tsv_region_rep_fpath
Ejemplo n.º 5
0
def _correct_qualimap_insert_size_histogram(samples):
    """Replace the Qualimap insert size histogram of each sample with Picard's.

    Per sample, the Qualimap v.1 directory (histogram directory with
    ``raw_data_qualimapReport`` replaced by ``raw_data``) is moved into the
    Qualimap v.2 location if that is absent, or removed if both exist;
    samples with neither directory are skipped. Then, unless the Qualimap
    histogram already verifies and ``cfg.reuse_intermediate`` is set, the
    remainder of the Picard histogram file after its ``## HISTOGRAM``
    section marker is written over the Qualimap histogram.
    """
    for sample in samples:
        v2_dir = dirname(sample.qualimap_ins_size_hist_fpath)
        v1_dir = v2_dir.replace('raw_data_qualimapReport', 'raw_data')

        if exists(v1_dir):
            if exists(v2_dir):
                shutil.rmtree(v1_dir)
            else:
                shutil.move(v1_dir, v2_dir)
        elif not exists(v2_dir):
            # Neither Qualimap v.1 nor Qualimap v.2 produced data.
            continue

        # Keep an existing Qualimap histogram when intermediates are reused.
        if verify_file(sample.qualimap_ins_size_hist_fpath, silent=True) and cfg.reuse_intermediate:
            continue
        if not verify_file(sample.picard_ins_size_hist_txt_fpath):
            continue

        with open(sample.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
            # Skip through the '## HISTOGRAM' line and the one line after it
            # (presumably the column header).
            for line in picard_f:
                if line.startswith('## HISTOGRAM'):
                    next(picard_f, None)
                    break

            # The rest of the Picard file is copied verbatim.
            with file_transaction(None, sample.qualimap_ins_size_hist_fpath) as tx:
                with open(tx, 'w') as qualimap_f:
                    qualimap_f.writelines(picard_f)
Ejemplo n.º 6
0
def main():
    """Command-line entry point: save the canonical CDS regions of a genome.

    Reads the genome build from ``-g/--genome``, loads all features for it,
    keeps only ``CDS`` features that pass the canonical-transcript filter,
    and saves the first six columns to ``<genome>/bed/CDS-canonical.bed``
    relative to the parent of this file's directory.
    """
    parser = OptionParser()
    parser.add_option(
        '-g', '--genome',
        dest='genome',
        help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
    )
    opts, _positional = parser.parse_args()

    genome = opts.genome
    if not genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')

    def _is_cds(feature):
        return feature[ebl.BedCols.FEATURE] == 'CDS'

    cds_bed = features_bed.filter(_is_cds)
    canonical_cds_bed = cds_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(
        join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        canonical_cds_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
Ejemplo n.º 7
0
def partition_gtf(gtf, coding=False, out_file=False):
    """
    Write a GTF holding only the coding or only the non-coding features.

    The biotype of each feature comes from the lookup built by
    ``_biotype_lookup_fn(gtf)``; per the original notes the GTF must carry
    gene_biotype = "protein_coding", or have the source column set to the
    biotype, for all coding transcripts. Features without a biotype are
    never written.

    coding: truthy keeps only "protein_coding" features, falsy keeps only
    the others.
    out_file: destination path; returned untouched if it already exists.
    When falsy, a new temporary ``.gtf`` file is used instead.

    Returns the path of the output file.
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        tmp_handle = tempfile.NamedTemporaryFile(delete=False, suffix=".gtf")
        out_file = tmp_handle.name

    def _wanted(biotype):
        # Features with no known biotype belong to neither partition.
        if not biotype:
            return False
        is_coding = biotype == "protein_coding"
        return is_coding if coding else not is_coding

    biotype_lookup = _biotype_lookup_fn(gtf)

    db = get_gtf_db(gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.all_features():
                if _wanted(biotype_lookup(feature)):
                    out_handle.write(str(feature) + "\n")
    return out_file
Ejemplo n.º 8
0
    def _make_wgs_regions_file(self, work_dir, genome=None):
        """Build the BED of features to report, one transcript per gene.

        The target is ``targqc_features_to_report.bed`` in ``work_dir``; the
        path is saved on ``self.wgs_bed_fpath`` and returned. An existing
        file accepted by ``can_reuse`` (compared against the Ensembl GTF
        path) is returned as is.

        Non-gene features are bucketed by gene and transcript ID. Within
        each gene, the transcript ranked first by ``tx_priority_sort_key``
        wins, and its records are written sorted by
        ``get_sort_key(chr_order)``. A gene with no ``transcript`` record
        contributes nothing.
        """
        self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
        if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
            return self.wgs_bed_fpath

        build = genome or cfg.genome
        chr_order = reference_data.get_chrom_order(build)

        fields_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
        all_features = ebl.get_all_features(build, high_confidence=True)

        debug('Select best transcript to report')
        for feat in all_features:
            if feat[ebl.BedCols.FEATURE] == 'gene':
                continue
            gene_id = feat[ebl.BedCols.GENE]
            tx_id = feat[ebl.BedCols.ENSEMBL_ID]
            fields_by_tx_by_gene[gene_id][tx_id].append(feat.fields)

        with file_transaction(work_dir, self.wgs_bed_fpath) as tx_fpath:
            with open(tx_fpath, 'w') as out:
                for _gene, fields_by_tx in fields_by_tx_by_gene.items():
                    tx_rows = [
                        row
                        for rows in fields_by_tx.values()
                        for row in rows
                        if row[ebl.BedCols.FEATURE] == 'transcript'
                    ]
                    if not tx_rows:
                        continue
                    # Stable sort: equally ranked transcripts keep input order.
                    top_row = sorted(tx_rows, key=tx_priority_sort_key)[0]
                    chosen_tx_id = top_row[ebl.BedCols.ENSEMBL_ID]
                    for row in sorted(fields_by_tx[chosen_tx_id],
                                      key=get_sort_key(chr_order)):
                        out.write('\t'.join(str(f) for f in row) + '\n')
        return self.wgs_bed_fpath
Ejemplo n.º 9
0
def sort_bed_gsort(input_bed_fpath,
                   output_bed_fpath=None,
                   work_dir=None,
                   fai_fpath=None,
                   genome=None):
    """Sort a BED file with the external ``gsort`` tool.

    The output goes to ``output_bed_fpath`` when given, otherwise to an
    intermediate ``sorted`` file name derived from the input inside
    ``work_dir``. An output accepted by ``can_reuse`` is returned without
    re-running. The chromosome order comes from ``fai_fpath`` or, failing
    that, from the fai of ``genome``; ``critical`` is called when neither
    is supplied.

    Returns the path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath,
                                              'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    gsort_cmd = 'gsort {bed} {fai}'.format(bed=input_bed_fpath,
                                           fai=fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(gsort_cmd, output_fpath=tx)

    return output_bed_fpath
Ejemplo n.º 10
0
def filter_bed_with_gene_set(bed_fpath, gene_keys_set, output_fpath):
    """Copy to ``output_fpath`` the BED lines whose gene is in ``gene_keys_set``.

    Every non-empty line of ``bed_fpath`` must have exactly four
    tab-separated columns (chrom, start, end, gene); any other column count
    raises ``ValueError`` from the unpacking. A line is kept when its
    ``(gene, chrom)`` pair is a member of ``gene_keys_set``. The output is
    written through a file transaction.
    """
    with file_transaction(None, output_fpath) as tx:
        with open(bed_fpath) as inp, open(tx, 'w') as out:
            for raw_line in inp:
                record = raw_line.strip('\n')
                if not record:
                    continue
                chrom, _start, _end, gene = record.split('\t')
                if (gene, chrom) in gene_keys_set:
                    # Write the untouched line, newline included.
                    out.write(raw_line)
Ejemplo n.º 11
0
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample's sex from coverage of chrY key regions.

    The mean coverage of the male key regions (restricted to ``target_bed``
    when one is given) is compared with the sample's average depth.

    Args:
        work_dir: directory for intermediate files (pybedtools temp dir,
            ``male.bed``).
        bam_fpath: BAM file to measure coverage on; ``critical`` is called
            when it is falsy.
        avg_depth: average depth of the sample; converted with ``float()``.
        genome: genome build name; matched by substring against the keys of
            ``chry_key_regions_by_genome``.
        target_bed: optional capture target; when given, only its
            intersection with the male key regions is used.

    Returns:
        ``'F'`` or ``'M'``, or ``None`` when sex cannot be determined (no
        key regions for the genome, no overlap between target and key
        regions, or average depth below
        ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``).
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Restrict the key regions to what the capture panel actually covers.
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    # NOTE(review): the meaning of the trailing `1` argument is defined by
    # _calc_mean_coverage, which is not visible here - confirm.
    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    # The messages below previously used the invalid escape "\s", which
    # logged a literal backslash ("it\s female") instead of an apostrophe.
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) + " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Ejemplo n.º 12
0
def sort_bed(input_bed_fpath,
             output_bed_fpath=None,
             work_dir=None,
             fai_fpath=None,
             chr_order=None,
             genome=None):
    """Sort the regions of a BED file in pure Python.

    The output goes to ``output_bed_fpath`` when given, otherwise to an
    intermediate ``sorted`` file name derived from the input inside
    ``work_dir``; an output accepted by ``can_reuse`` is returned as is.
    When ``chr_order`` is not supplied it is derived from ``fai_fpath`` or
    from the fai of ``genome``; ``critical`` is called if none of the three
    is available.

    Blank lines are dropped. Lines starting with ``#`` are copied to the
    output as they are met, i.e. ahead of all sorted regions. Every other
    line is parsed into a ``Region`` (chromosomes missing from
    ``chr_order`` get order ``-1``) and written back sorted by
    ``Region.get_key()``.

    Returns the path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath,
                                              'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical(
                'Either of chr_order, fai_fpath, or genome build name must be specified'
            )
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as bed_in, \
            file_transaction(work_dir, output_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for raw_line in bed_in:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                if stripped.startswith('#'):
                    # Comment lines are passed through immediately.
                    out.write(raw_line)
                    continue

                fields = stripped.split('\t')
                chrom = fields[0]
                regions.append(
                    Region(chrom, int(fields[1]), int(fields[2]),
                           fields[3:], chr_order.get(chrom, -1)))

            regions.sort(key=lambda region: region.get_key())
            for region in regions:
                columns = [region.chrom, str(region.start), str(region.end)]
                columns += list(region.other_fields)
                out.write('\t'.join(columns) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' +
          output_bed_fpath)
    return output_bed_fpath
Ejemplo n.º 13
0
def _swap_output_path(cmd, output_fpath, tx_out_file):
    """Return ``cmd`` with mentions of ``output_fpath`` replaced by ``tx_out_file``.

    Only occurrences preceded by a space and followed by a space or the end
    of the command are replaced, whether bare, double-quoted or
    single-quoted. The quoting style of each occurrence is preserved.
    """
    # Sentinel newline so a path at the very end of the command is matched.
    cmd += '\n'
    for terminator in (' ', '\n'):
        for quote in ('', '"', "'"):
            cmd = cmd.replace(
                ' ' + quote + output_fpath + quote + terminator,
                ' ' + quote + tx_out_file + quote + terminator)
    return cmd.replace('\n', '')


def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: the command to run. It is logged via ``info`` and handed to
            ``_do_run``. When ``output_fpath`` is set and ``stdout_tx`` is
            true it must be a string, because it is edited textually.
        output_fpath: expected output file. An existing file at this path is
            removed before running.
        input_fpath: passed through to ``_do_run``.
        checks: passed to ``_do_run``; defaults to ``[file_nonempty_check]``.
        stdout_to_outputfile: when true, stdout is redirected (``>``) into
            the transactional output file; otherwise mentions of
            ``output_fpath`` inside ``cmd`` are replaced with the
            transactional path.
        stdout_tx: when true, run inside ``file_transaction`` for
            ``output_fpath``; when false, run against ``output_fpath``
            directly.
        reuse: when true and ``output_fpath`` (or ``output_fpath + '.gz'``)
            already verifies, skip the run.
        env_vars: mapping of environment overrides applied to a copy of
            ``os.environ``; a value of ``None`` removes the variable.

    Returns:
        ``output_fpath`` when an existing output is reused, ``None``
        otherwise (including after a successful run).
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    for k, v in (env_vars or {}).items():
        if v is None:
            env.pop(k, None)
        else:
            env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _log_and_run(_cmd, _output_fpath):
        info(_cmd if isinstance(_cmd, six.string_types) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, input_fpath)

    if not output_fpath:
        _log_and_run(cmd, None)
        return None

    if isfile(output_fpath):
        os.remove(output_fpath)

    if not stdout_tx:
        _log_and_run(cmd, output_fpath)
        return None

    with file_transaction(None, output_fpath) as tx_out_file:
        if stdout_to_outputfile:
            cmd += ' > ' + tx_out_file
        else:
            # Quoted paths keep both quotes; the previous replacements
            # dropped the opening quote and produced an unbalanced command.
            cmd = _swap_output_path(cmd, output_fpath, tx_out_file)
        _log_and_run(cmd, tx_out_file)
    return None
Ejemplo n.º 14
0
    def _make_padded_bed(self, work_dir, fai_fpath, padding):
        """Create the padded, sorted and merged version of the capture BED.

        Does nothing and returns ``None`` for WGS. Otherwise the padded file
        path (an intermediate ``padded`` name derived from
        ``self.capture_bed_fpath``) is stored in ``self.padded_bed_fpath``;
        the file is regenerated from ``self.bed`` unless ``can_reuse``
        accepts the existing one. Returns a ``BedTool`` over the padded
        file.
        """
        if self.is_wgs:
            return None

        padded_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'padded')
        self.padded_bed_fpath = padded_fpath

        if not can_reuse(padded_fpath, self.capture_bed_fpath):
            slopped = self.bed.slop(b=padding, g=fai_fpath)
            merged = slopped.sort().merge()
            with file_transaction(work_dir, padded_fpath) as tx:
                merged.saveas(tx)
            verify_file(padded_fpath, is_critical=True)

        return BedTool(padded_fpath)
Ejemplo n.º 15
0
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge the intervals of a BED file so that none of them overlap.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    The result is written to an intermediate ``merged`` file name derived
    from ``bed_fpath`` inside ``work_dir``. An existing output that verifies
    against ``bed_fpath`` is reused. ``distance``, when truthy, is passed to
    the merge as ``d``.

    Returns the path of the merged BED file.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    already_merged = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if not already_merged:
        with file_transaction(work_dir, output_fpath) as tx:
            merge_kwargs = {}
            if distance:
                merge_kwargs['d'] = distance
            BedTool(bed_fpath).merge(**merge_kwargs).saveas(tx)
    return output_fpath
Ejemplo n.º 16
0
def clean_bed(bed_fpath, work_dir):
    """Produce a cleaned copy of a BED file and return its path.

    The cleaned file gets an intermediate ``clean`` name derived from
    ``bed_fpath`` inside ``work_dir`` and is reused when ``can_reuse``
    accepts it. Cleaning drops records whose chromosome field is empty or
    starts with ``#``, a space, ``track`` or ``browser``, then applies
    ``remove_invalid()``.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
    junk_prefixes = ('#', ' ', 'track', 'browser')

    def _is_data_row(x):
        return x.chrom and not x.chrom.startswith(junk_prefixes)

    cleaned = BedTool(bed_fpath).filter(_is_data_row).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        cleaned.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
Ejemplo n.º 17
0
def tx2genefile(gtf, out_file=None):
    """
    Return the path of a CSV file mapping transcripts to genes.

    A ``tx2gene.csv`` sitting next to the GTF takes precedence; otherwise an
    already existing ``out_file`` is reused; otherwise ``out_file`` is
    written with one ``transcript,gene`` line per entry of
    ``transcript_to_gene(gtf)``.
    """
    gtf_dir = os.path.dirname(gtf)
    installed_tx2gene = os.path.join(gtf_dir, "tx2gene.csv")
    for candidate in (installed_tx2gene, out_file):
        if file_exists(candidate):
            return candidate

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for transcript, gene in transcript_to_gene(gtf).items():
                out_handle.write(",".join((transcript, gene)) + "\n")
    return out_file
Ejemplo n.º 18
0
    def _make_padded_bed(self, work_dir, fai_fpath, padding):
        """Build (or reuse) the padded capture BED and return it as a BedTool.

        Returns ``None`` right away for WGS. Otherwise
        ``self.padded_bed_fpath`` is set to an intermediate ``padded`` file
        name derived from ``self.capture_bed_fpath``. Unless ``can_reuse``
        accepts that file, ``self.bed`` is extended by ``padding`` on both
        sides using ``fai_fpath``, then sorted, merged, saved and verified.
        """
        if self.is_wgs:
            return None

        target_fpath = intermediate_fname(work_dir, self.capture_bed_fpath,
                                          'padded')
        self.padded_bed_fpath = target_fpath

        if not can_reuse(target_fpath, self.capture_bed_fpath):
            extended = self.bed.slop(b=padding, g=fai_fpath)
            collapsed = extended.sort().merge()
            with file_transaction(work_dir, target_fpath) as tx:
                collapsed.saveas(tx)
            verify_file(target_fpath, is_critical=True)

        return BedTool(target_fpath)
Ejemplo n.º 19
0
    def _make_qualimap_bed(self, work_dir):
        """Write the capture regions as a six-column BED for Qualimap.

        Returns ``None`` for WGS. Otherwise ``self.qualimap_bed_fpath`` is
        set to an intermediate ``qualimap_ready`` file name derived from
        ``self.capture_bed_fpath`` and returned; the file is reused when
        ``can_reuse`` accepts it. Regions are taken from ``self.bed`` after
        sorting and merging; empty fields are dropped and the row is padded
        with the region index, ``1.0`` and ``+``.
        """
        if self.is_wgs:
            return None

        qualimap_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready')
        self.qualimap_bed_fpath = qualimap_fpath
        if can_reuse(qualimap_fpath, self.capture_bed_fpath):
            return qualimap_fpath

        debug('Merging and saving BED into required bed6 format for Qualimap')
        merged_bed = self.bed.sort().merge()
        with file_transaction(work_dir, qualimap_fpath) as tx:
            with open(tx, 'w') as out:
                for index, interval in enumerate(merged_bed):
                    present = [value for value in list(interval) if value]
                    # Name, score and strand placeholders for missing columns.
                    padding = [str(index), "1.0", "+"][:6 - len(present)]
                    out.write("\t".join(present + padding) + "\n")
        verify_file(qualimap_fpath, is_critical=True)
        return qualimap_fpath
Ejemplo n.º 20
0
    def _make_qualimap_bed(self, work_dir):
        """Save the merged capture BED in the bed6 layout Qualimap requires.

        For WGS nothing is done and ``None`` is returned. Otherwise the
        path (an intermediate ``qualimap_ready`` name derived from
        ``self.capture_bed_fpath``) is stored in
        ``self.qualimap_bed_fpath`` and returned. Unless ``can_reuse``
        accepts the file, each sorted and merged region of ``self.bed`` is
        written with its non-empty fields followed by filler columns: the
        region index, ``1.0`` and ``+``.
        """
        if self.is_wgs:
            return None

        out_fpath = intermediate_fname(work_dir,
                                       self.capture_bed_fpath,
                                       'qualimap_ready')
        self.qualimap_bed_fpath = out_fpath
        if can_reuse(out_fpath, self.capture_bed_fpath):
            return out_fpath

        debug('Merging and saving BED into required bed6 format for Qualimap')
        collapsed = self.bed.sort().merge()
        with file_transaction(work_dir, out_fpath) as tx:
            with open(tx, 'w') as out:
                for idx, interval in enumerate(collapsed):
                    kept = [field for field in list(interval) if field]
                    fillers = [str(idx), "1.0", "+"]
                    # Top the row up with as many fillers as the slice allows.
                    row = kept + fillers[:6 - len(kept)]
                    out.write("\t".join(row) + "\n")
        verify_file(out_fpath, is_critical=True)
        return out_fpath
Ejemplo n.º 21
0
    def _make_target_bed(self,
                         bed_fpath,
                         work_dir,
                         output_dir,
                         is_debug,
                         padding=None,
                         fai_fpath=None,
                         genome=None,
                         reannotate=False):
        """Prepare the target BED: clean, sort, annotate, clean again.

        Each stage writes an intermediate file into ``work_dir`` and is
        skipped when ``can_reuse`` accepts the existing output against its
        input. On completion the method sets ``self.bed_fpath``,
        ``self.bed``, ``self.capture_bed_fpath``, ``self.gene_keys_set``,
        ``self.gene_keys_list`` and ``self.regions_num``, and builds the
        Qualimap BED (and the padded BED when ``padding`` is truthy).

        Args:
            bed_fpath: path of the original target BED file.
            work_dir: directory for intermediate files.
            output_dir: directory receiving the final capture BED.
            is_debug: not used in this method.
            padding: when truthy, passed to ``_make_padded_bed``.
            fai_fpath: fai path handed to ``sort_bed`` and
                ``_make_padded_bed``.
            genome: genome build; annotation runs only when it is in
                ``ebl.SUPPORTED_GENOMES``.
            reannotate: forces the annotating branch of the overlap step.
        """
        # Stage 1: keep at most the first 4 columns, drop header-like rows
        # and invalid records.
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath,
                                                    'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        # Stage 2: sort the cleaned file.
        sort_target_bed_fpath = intermediate_fname(work_dir,
                                                   clean_target_bed_fpath,
                                                   'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            # NOTE(review): neither work_dir nor genome is forwarded here, so
            # sort_bed needs a usable fai_fpath - confirm callers always
            # provide one.
            sort_target_bed_fpath = sort_bed(
                clean_target_bed_fpath,
                output_bed_fpath=sort_target_bed_fpath,
                fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        # Stage 3: overlap with genome features, for supported genomes only.
        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir,
                                                      sort_target_bed_fpath,
                                                      'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                # The two calls differ only in the explicit `reannotate`
                # argument: the second one relies on the default of
                # overlap_with_features.
                if BedTool(sort_target_bed_fpath).field_count(
                ) == 3 or reannotate:
                    debug(
                        'Annotating target BED file and collecting overlapping genome features'
                    )
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          reannotate=reannotate,
                                          only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            # Unsupported genome: carry the sorted file forward unannotated.
            ann_target_bed_fpath = sort_target_bed_fpath

        # Stage 4: drop invalid records once more after annotation.
        final_clean_target_bed_fpath = intermediate_fname(
            work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir,
                                  final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
                pass
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)

        # Publish the capture BED into the output directory.
        self.capture_bed_fpath = add_suffix(
            join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        # NOTE(review): genes are read from the original bed_fpath, not from
        # the cleaned/annotated file - confirm this is intended.
        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
Ejemplo n.º 22
0
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """Annotate BED regions with overlapping Ensembl features and save the result.

    The input BED is sorted, overlapped (via ``_annotate``) with the features
    returned by ``ebl.get_all_features(genome)`` after the requested filters
    are applied, and written to ``output_fpath`` inside a file transaction.

    :param input_bed_fpath: path to the BED file to annotate
    :param output_fpath: path of the annotated file to write
    :param work_dir: directory used for sorting, pybedtools temp files and
        (with ``is_debug``) the saved intermediate BED files
    :param genome: genome build name; when falsy, no ``.fai`` is used and the
        chromosome order is taken from the input BED itself
    :param reannotate: forwarded to ``_annotate``; always switched on when the
        sorted input has exactly 3 columns
    :param high_confidence: filter features with ``ebl.high_confidence_filter``
    :param only_canonical: filter features with
        ``ebl.get_only_canonical_filter(genome)``
    :param coding_only: filter features with ``ebl.protein_coding_filter``
    :param short: write only the first 4 columns of every annotated row
    :param extended: write every column but the last, preceded by ``##``
        comment lines and a header line; takes precedence over ``short``
    :param is_debug: save the regions and the filtered features into
        ``work_dir`` before overlapping, and forward the flag to ``_annotate``
    :param kwargs: forwarded unchanged to ``_annotate``
    :return: ``output_fpath``
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        # str() keeps this message from raising TypeError when genome is None
        # (the default value of the parameter).
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_col_num = BedTool(input_bed_fpath).field_count()
    # A 3-column BED carries no annotation of its own, so it is always annotated.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    # The high-confidence filter (if requested) was already applied above,
    # so _annotate is always called with high_confidence=False.
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    # Column selection shared by the header and every row:
    # `extended` wins over `short`, which wins over the default 6 columns.
    if extended:
        columns = slice(None, -1)
    elif short:
        columns = slice(None, 4)
    else:
        columns = slice(None, 6)

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            if extended:
                # Only the extended format carries comment and header lines.
                header = full_header[columns]
                if add_ori_extra_fields:
                    header.append(full_header[-1])
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = full_fields[columns]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
Ejemplo n.º 23
0
def _proc_sambamba_depth(sambamba_depth_output_fpath, output_fpath, sample_name, depth_thresholds):
    """Convert a sambamba depth report into a per-region coverage TSV.

    The ``#`` header line of the input is used to locate the ``readCount`` and
    ``meanCoverage`` columns and triggers writing of the output header. Every
    other line becomes one output row: coordinates, region size, annotation
    columns (read at the ``ebl.BedCols`` indices), average depth and the
    columns that follow ``meanCoverage`` except the very last one.

    :param sambamba_depth_output_fpath: path to the sambamba depth output
    :param output_fpath: path of the TSV to write (through a file transaction)
    :param sample_name: not used by this function
    :param depth_thresholds: thresholds used to name the ``at<N>x`` header columns
    :return: ``output_fpath`` (also when an up-to-date file is reused)
    """
    if can_reuse(output_fpath, sambamba_depth_output_fpath):
        return output_fpath

    debug('Reading coverage statistics and writing regions to ' + output_fpath)

    missing_values = ('', None, '.')

    def write_line(f, fields):
        f.write('\t'.join(fields) + '\n')

    def as_percent(value):
        # Overlap values get a '%' suffix; missing ones are reported as '.'.
        return (value + '%') if value not in missing_values else '.'

    read_count_col = None
    mean_cov_col = None
    total_regions_count = 0

    with file_transaction(None, output_fpath) as tx:
        with open(sambamba_depth_output_fpath) as sambamba_depth_file, open(tx, 'w') as out:
            for line in sambamba_depth_file:
                if line.startswith('#'):
                    header_fields = line.split('\t')
                    # NOTE(review): the +1 presumably compensates for the header
                    # having one column fewer than the data rows - confirm
                    # against the actual sambamba output.
                    read_count_col = header_fields.index('readCount') + 1
                    mean_cov_col = header_fields.index('meanCoverage') + 1

                    write_line(out, [
                        'chrom',
                        'start',
                        'end',
                        'size',
                        'gene',
                        'exon',
                        'strand',
                        'feature',
                        'biotype',
                        'transcript',
                        'trx_overlap',
                        'exome_overlap',
                        'cds_overlap',
                        'avg_depth',
                    ] + ['at{}x'.format(ths) for ths in depth_thresholds])
                    continue

                fs = line.strip('\n').split('\t')
                chrom = fs[0]
                start, end = int(fs[1]), int(fs[2])
                region_size = end - start
                # When the read-count column sits where the gene column would
                # be, the row carries no gene annotation.
                gene_name = fs[ebl.BedCols.GENE] if read_count_col != ebl.BedCols.GENE else '.'
                exon = fs[ebl.BedCols.EXON]
                strand = fs[ebl.BedCols.STRAND]
                feature = fs[ebl.BedCols.FEATURE]
                biotype = fs[ebl.BedCols.BIOTYPE]
                transcript = fs[ebl.BedCols.ENSEMBL_ID]
                transcript_overlap = fs[ebl.BedCols.TX_OVERLAP_PERCENTAGE]
                exome_overlap = fs[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE]
                cds_overlap = fs[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE]
                avg_depth = float(fs[mean_cov_col])
                # Everything after the mean coverage, minus the last column,
                # is copied through as the per-threshold values.
                rates_within_threshs = fs[mean_cov_col + 1:-1]

                row = [
                    chrom,
                    start,
                    end,
                    region_size,
                    gene_name,
                    exon,
                    strand,
                    feature,
                    biotype,
                    transcript,
                    as_percent(transcript_overlap),
                    as_percent(exome_overlap),
                    as_percent(cds_overlap),
                    avg_depth,
                ] + rates_within_threshs
                write_line(out, [str(v) if v not in missing_values else '.' for v in row])

                total_regions_count += 1
                if total_regions_count % 10000 == 0:
                    debug('  Processed {0:,} regions'.format(total_regions_count))
        # Report the number of rows actually processed (the original logged the
        # size of a dict that was never filled, i.e. always 0).
        debug('Total regions: ' + str(total_regions_count))
    return output_fpath
Ejemplo n.º 24
0
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex from the coverage of chrY key regions.

    The key regions are looked up in ``chry_key_regions_by_genome`` by the
    first key that is a substring of ``genome``. When ``target_bed`` is given
    they are intersected with it first. The mean coverage of the resulting
    regions in ``bam_fpath`` is then compared with ``avg_depth``: the sample is
    called female when the Y coverage is 0 or when ``avg_depth`` exceeds it by
    more than ``FEMALE_Y_COVERAGE_FACTOR`` times, male otherwise.

    :param work_dir: directory for pybedtools temp files and ``male.bed``
    :param bam_fpath: path to the BAM file; ``critical`` is called if it is falsy
    :param avg_depth: sample average depth (anything ``float()`` accepts)
    :param genome: genome build name
    :param target_bed: optional capture target BED; ``None`` is treated as WGS
    :return: ``'M'``, ``'F'``, or ``None`` when sex cannot be determined (no key
        regions for the genome, no overlap with the target, or the average
        depth is below ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``)
    """
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome +
             ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Restrict the key regions to the part covered by the capture target.
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug(
                'The male non-PAR region does not overlap with the capture target - cannot determine sex.'
            )
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info(
        'Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.'
    )
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' +
              str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used the invalid escape "\s", which logged a literal
    # backslash ("it\s female"); they now read "it's ...".
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' +
                  str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
Ejemplo n.º 25
0
def _copy_sampled_records(sample_name, readers, writers, record_indices,
                          num_pairs):
    """Copy the 4-line FASTQ records at ``record_indices`` to ``writers``.

    ``readers`` and ``writers`` are parallel sequences of open handles (one
    per mate) and ``record_indices`` must be sorted in ascending order.
    Returns the number of records written.
    """
    written_records = 0
    next_rec = 0  # index of the next unread record in every reader
    for rr in record_indices:
        # Skip forward until the readers are positioned at record rr.
        while next_rec < rr:
            for reader in readers:
                for _ in range(4):
                    reader.readline()
            next_rec += 1
        for reader, writer in zip(readers, writers):
            for _ in range(4):
                writer.write(reader.readline())
        # The copied record has been consumed as well; without this the
        # reader position drifts away from the sampled indices.
        next_rec += 1
        written_records += 1
        if written_records % 10000 == 0:
            info(sample_name + ': written ' + str(written_records) +
                 ', rec_no ' + str(next_rec))
        if next_rec > num_pairs:
            # Defensive only: indices drawn from range(num_pairs) never get here.
            info(sample_name + ' reached the limit of ' + str(num_pairs) +
                 ' read lines, stopping.')
            break
    info(sample_name + ': done, written ' + str(written_records) +
         ', rec_no ' + str(next_rec))
    return written_records


def downsample(work_dir,
               sample_name,
               fastq_left_fpath,
               fastq_right_fpath,
               downsample_to,
               num_pairs=None):
    """Write a random subset of the reads of a FASTQ pair to new files.

    Records are streamed, so the inputs are never loaded into memory.
    Modified from: http://www.biostars.org/p/6544/

    :param work_dir: directory for the downsampled files and the transaction
    :param sample_name: name used in log messages; when falsy it is derived
        from the characters the two input paths have in common
    :param fastq_left_fpath: path to the first-mate FASTQ (may be gzipped)
    :param fastq_right_fpath: path to the second-mate FASTQ; a falsy value is
        treated as single-end in the copying step
        (NOTE(review): the path helpers called earlier must accept it too -
        confirm)
    :param downsample_to: a ``float`` is treated as a fraction of the reads,
        anything else as the absolute number of reads to keep
    :param num_pairs: number of records in the input; counted from the left
        file when ``None`` and capped at ``LIMIT``
    :return: ``(left_path, right_path)`` of the downsampled files, or the
        input paths unchanged when there are not more reads than requested
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ' the number of reads is higher than ' +
             str(LIMIT) + ', sampling from only first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' reads')
    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(
        downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': and it is less than ' +
             str(num_downsample_pairs) + ', so no downsampling.')
        return fastq_left_fpath, fastq_right_fpath

    info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
    rand_records = sorted(
        random.sample(range(num_pairs), num_downsample_pairs))

    paired = bool(fastq_right_fpath and r_out_fpath)
    in_fpaths = [fastq_left_fpath]
    out_files = (l_out_fpath, )
    if paired:
        in_fpaths.append(fastq_right_fpath)
        out_files = (l_out_fpath, r_out_fpath)

    written_records = 0
    readers, writers = [], []
    try:
        for in_fpath in in_fpaths:
            info('Opening ' + in_fpath)
            readers.append(open_gzipsafe(in_fpath))

        with file_transaction(work_dir, out_files) as tx_out_files:
            # A single transaction path arrives as a plain string.
            if isinstance(tx_out_files, six.string_types):
                tx_paths = [tx_out_files]
            else:
                tx_paths = list(tx_out_files)
            try:
                for tx_path in tx_paths:
                    info('Opening ' + str(tx_path) + ' to write')
                    writers.append(open_gzipsafe(tx_path, "w"))
                written_records = _copy_sampled_records(
                    sample_name, readers, writers, rand_records, num_pairs)
            finally:
                # Outputs must be closed before the transaction finalizes them.
                for writer in writers:
                    writer.close()
    finally:
        for reader in readers:
            reader.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath +
         ' and ' + str(r_out_fpath) + ', total ' + str(written_records) +
         ' paired reads written')
    return l_out_fpath, r_out_fpath
Ejemplo n.º 26
0
    def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                         padding=None, fai_fpath=None, genome=None, reannotate=False):
        """Build the working target BED from ``bed_fpath`` and store it on ``self``.

        Steps, each skipped when ``can_reuse`` accepts the existing output:
        clean the input (at most 4 columns, no comment/track/browser lines,
        no invalid records), sort it, overlap it with genomic features when
        ``genome`` is in ``ebl.SUPPORTED_GENOMES``, and drop invalid records
        once more. Sets ``self.bed_fpath``, ``self.bed``,
        ``self.capture_bed_fpath``, ``self.gene_keys_set``,
        ``self.gene_keys_list`` and ``self.regions_num``, then builds the
        Qualimap BED and, when ``padding`` is truthy, the padded BED.
        ``is_debug`` is accepted but not used here.
        """
        skipped_prefixes = ('#', ' ', 'track', 'browser')

        def _is_region(rec):
            # Keep records that have a chromosome not starting with a skipped prefix.
            return rec.chrom and not rec.chrom.startswith(skipped_prefixes)

        # Step 1: clean the raw input.
        cleaned_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
        if not can_reuse(cleaned_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            raw_bed = BedTool(bed_fpath)
            if raw_bed.field_count() > 4:
                raw_bed = raw_bed.cut(range(4))
            filtered_bed = raw_bed.filter(_is_region).remove_invalid()
            with file_transaction(work_dir, cleaned_fpath) as tx:
                filtered_bed.saveas(tx)
            debug('Saved to ' + cleaned_fpath)
            verify_file(cleaned_fpath, is_critical=True)

        # Step 2: sort.
        sorted_fpath = intermediate_fname(work_dir, cleaned_fpath, 'sorted')
        if not can_reuse(sorted_fpath, cleaned_fpath):
            debug()
            debug('Sorting target BED file...')
            sorted_fpath = sort_bed(cleaned_fpath, output_bed_fpath=sorted_fpath, fai_fpath=fai_fpath)
            debug('Saved to ' + sorted_fpath)
            verify_file(sorted_fpath, is_critical=True)

        # Step 3: annotate, only for supported genomes.
        if genome not in ebl.SUPPORTED_GENOMES:
            annotated_fpath = sorted_fpath
        else:
            annotated_fpath = intermediate_fname(work_dir, sorted_fpath, 'ann_plus_features')
            if not can_reuse(annotated_fpath, sorted_fpath):
                debug()
                overlap_kwargs = dict(work_dir=work_dir, genome=genome,
                                      extended=True, only_canonical=True)
                if BedTool(sorted_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting overlapping genome features')
                    # Only this branch passes `reannotate` explicitly.
                    overlap_kwargs['reannotate'] = reannotate
                else:
                    debug('Overlapping with genomic features:')
                overlap_with_features(sorted_fpath, annotated_fpath, **overlap_kwargs)
                debug('Saved to ' + annotated_fpath)
                verify_file(annotated_fpath, is_critical=True)

        # Step 4: final removal of invalid records.
        final_fpath = intermediate_fname(work_dir, annotated_fpath, 'clean')
        if not can_reuse(final_fpath, annotated_fpath):
            valid_bed = BedTool(annotated_fpath).remove_invalid()
            with file_transaction(work_dir, final_fpath) as tx:
                valid_bed.saveas(tx)
            verify_file(final_fpath, is_critical=True)

        self.bed_fpath = final_fpath
        self.bed = BedTool(self.bed_fpath)

        capture_fname = join(output_dir, basename(bed_fpath))
        self.capture_bed_fpath = add_suffix(capture_fname, 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        # Gene keys are collected from the original, unprocessed BED.
        self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
Ejemplo n.º 27
0
def annotate(input_bed_fpath,
             output_fpath,
             work_dir,
             genome=None,
             reannotate=True,
             high_confidence=False,
             only_canonical=False,
             coding_only=False,
             short=False,
             extended=False,
             is_debug=False,
             **kwargs):
    """Annotate BED regions with overlapping Ensembl features and save the result.

    The input BED is sorted, overlapped (via ``_annotate``) with the features
    returned by ``ebl.get_all_features(genome)`` after the requested filters
    are applied, and written to ``output_fpath`` inside a file transaction.

    :param input_bed_fpath: path to the BED file to annotate
    :param output_fpath: path of the annotated file to write
    :param work_dir: directory used for sorting, pybedtools temp files and
        (with ``is_debug``) the saved intermediate BED files
    :param genome: genome build name; when falsy, no ``.fai`` is used and the
        chromosome order is taken from the input BED itself
    :param reannotate: forwarded to ``_annotate``; always switched on when the
        sorted input has exactly 3 columns
    :param high_confidence: filter features with ``ebl.high_confidence_filter``
    :param only_canonical: filter features with
        ``ebl.get_only_canonical_filter(genome)``
    :param coding_only: filter features with ``ebl.protein_coding_filter``
    :param short: write only the first 4 columns of every annotated row
    :param extended: write every column but the last, preceded by ``##``
        comment lines and a header line; takes precedence over ``short``
    :param is_debug: save the regions and the filtered features into
        ``work_dir`` before overlapping, and forward the flag to ``_annotate``
    :param kwargs: forwarded unchanged to ``_annotate``
    :return: ``output_fpath``
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        # str() keeps this message from raising TypeError when genome is None
        # (the default value of the parameter).
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath,
                               work_dir=work_dir,
                               chr_order=chr_order,
                               genome=genome)

    ori_col_num = BedTool(input_bed_fpath).field_count()
    # A 3-column BED carries no annotation of its own, so it is always annotated.
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[
        ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    # The high-confidence filter (if requested) was already applied above,
    # so _annotate is always called with high_confidence=False.
    annotated = _annotate(ori_bed,
                          features_bed,
                          chr_order,
                          fai_fpath,
                          work_dir,
                          ori_col_num,
                          high_confidence=False,
                          reannotate=reannotate,
                          is_debug=is_debug,
                          **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    # Column selection shared by the header and every row:
    # `extended` wins over `short`, which wins over the default 6 columns.
    if extended:
        columns = slice(None, -1)
    elif short:
        columns = slice(None, 4)
    else:
        columns = slice(None, 6)

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            if extended:
                # Only the extended format carries comment and header lines.
                header = full_header[columns]
                if add_ori_extra_fields:
                    header.append(full_header[-1])
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                    ': part of region overlapping with transcripts\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with exons\n')
                out.write(
                    '## ' +
                    ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                    ': part of region overlapping with protein coding regions\n'
                )
                out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = full_fields[columns]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
Ejemplo n.º 28
0
def _copy_sampled_records(sample_name, readers, writers, record_indices, num_pairs):
    """Copy the 4-line FASTQ records at ``record_indices`` to ``writers``.

    ``readers`` and ``writers`` are parallel sequences of open handles (one
    per mate) and ``record_indices`` must be sorted in ascending order.
    Returns the number of records written.
    """
    written_records = 0
    next_rec = 0  # index of the next unread record in every reader
    for rr in record_indices:
        # Skip forward until the readers are positioned at record rr.
        while next_rec < rr:
            for reader in readers:
                for _ in range(4):
                    reader.readline()
            next_rec += 1
        for reader, writer in zip(readers, writers):
            for _ in range(4):
                writer.write(reader.readline())
        # The copied record has been consumed as well; without this the
        # reader position drifts away from the sampled indices.
        next_rec += 1
        written_records += 1
        if written_records % 10000 == 0:
            info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(next_rec))
        if next_rec > num_pairs:
            # Defensive only: indices drawn from range(num_pairs) never get here.
            info(sample_name + ' reached the limit of ' + str(num_pairs) + ' read lines, stopping.')
            break
    info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(next_rec))
    return written_records


def downsample(work_dir, sample_name, fastq_left_fpath, fastq_right_fpath, downsample_to, num_pairs=None):
    """Write a random subset of the reads of a FASTQ pair to new files.

    Records are streamed, so the inputs are never loaded into memory.
    Modified from: http://www.biostars.org/p/6544/

    :param work_dir: directory for the downsampled files and the transaction
    :param sample_name: name used in log messages; when falsy it is derived
        from the characters the two input paths have in common
    :param fastq_left_fpath: path to the first-mate FASTQ (may be gzipped)
    :param fastq_right_fpath: path to the second-mate FASTQ; a falsy value is
        treated as single-end in the copying step
        (NOTE(review): the path helpers called earlier must accept it too -
        confirm)
    :param downsample_to: a ``float`` is treated as a fraction of the reads,
        anything else as the absolute number of reads to keep
    :param num_pairs: number of records in the input; counted from the left
        file when ``None`` and capped at ``LIMIT``
    :return: ``(left_path, right_path)`` of the downsampled files, or the
        input paths unchanged when there are not more reads than requested
    """
    sample_name = sample_name or splitext(''.join(lc if lc == rc else '' for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
             ', sampling from only first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' reads')
    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': and it is less than ' + str(num_downsample_pairs) + ', so no downsampling.')
        return fastq_left_fpath, fastq_right_fpath

    info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
    rand_records = sorted(random.sample(range(num_pairs), num_downsample_pairs))

    paired = bool(fastq_right_fpath and r_out_fpath)
    in_fpaths = [fastq_left_fpath]
    out_files = (l_out_fpath,)
    if paired:
        in_fpaths.append(fastq_right_fpath)
        out_files = (l_out_fpath, r_out_fpath)

    written_records = 0
    readers, writers = [], []
    try:
        for in_fpath in in_fpaths:
            info('Opening ' + in_fpath)
            readers.append(open_gzipsafe(in_fpath))

        with file_transaction(work_dir, out_files) as tx_out_files:
            # A single transaction path arrives as a plain string.
            if isinstance(tx_out_files, six.string_types):
                tx_paths = [tx_out_files]
            else:
                tx_paths = list(tx_out_files)
            try:
                for tx_path in tx_paths:
                    info('Opening ' + str(tx_path) + ' to write')
                    writers.append(open_gzipsafe(tx_path, "w"))
                written_records = _copy_sampled_records(sample_name, readers, writers, rand_records, num_pairs)
            finally:
                # Outputs must be closed before the transaction finalizes them.
                for writer in writers:
                    writer.close()
    finally:
        for reader in readers:
            reader.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + str(r_out_fpath) + ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath