Example 1
def split(args):
    input_bam = tk_bam.create_bam_infile(args.possorted_bam)

    # chromosomes should be ordered as chr1, chr11, chr12, ..., chr19, chr2
    chroms0 = input_bam.references
    chrom_lengths0 = input_bam.lengths

    chroms, chrom_lengths = (list(t) for t in zip(
        *sorted(zip(chroms0, chrom_lengths0))))

    loci = []
    for (chrom, length) in zip(chroms, chrom_lengths):
        start = 0
        while start + tenkit.constants.PARALLEL_LOCUS_SIZE < length:
            stop = start + tenkit.constants.PARALLEL_LOCUS_SIZE
            loci.append({
                'locus': tk_io.create_locus_info(chrom, start, stop),
                '__mem_gb': 4
            })
            start += tenkit.constants.PARALLEL_LOCUS_SIZE
        loci.append({
            'locus': tk_io.create_locus_info(chrom, start, length),
            '__mem_gb': 4
        })

    return {'chunks': loci, 'join': {'__mem_gb': 8.0}}
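
The lexicographic ordering noted in the comment above, and the fixed-size chunking loop, can be seen in isolation with a minimal self-contained sketch; the 10 Mb locus size below is an assumption standing in for tenkit.constants.PARALLEL_LOCUS_SIZE, and the chromosome lengths are illustrative.

# Minimal sketch of the sort-and-chunk pattern above (illustrative values only).
PARALLEL_LOCUS_SIZE = 10000000  # assumed stand-in for tenkit.constants.PARALLEL_LOCUS_SIZE

chroms0 = ['chr2', 'chr1', 'chr11']
chrom_lengths0 = [243199373, 249250621, 135006516]

# Lexicographic sort keeps each length paired with its chromosome name.
chroms, chrom_lengths = (list(t) for t in zip(*sorted(zip(chroms0, chrom_lengths0))))
assert chroms == ['chr1', 'chr11', 'chr2']

loci = []
for chrom, length in zip(chroms, chrom_lengths):
    start = 0
    while start + PARALLEL_LOCUS_SIZE < length:
        loci.append((chrom, start, start + PARALLEL_LOCUS_SIZE))
        start += PARALLEL_LOCUS_SIZE
    loci.append((chrom, start, length))  # final partial locus up to the chromosome end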
Example 2
def get_sized_bam_chunks(bam_fn, gb_per_chunk, contig_whitelist=None, target_regions=None, extra_args={}):
    '''Divide a BAM file into disjoint loci whose compressed size is roughly gb_per_chunk GB. If contig_whitelist is
       supplied, only contigs on the whitelist are included. If target_regions is supplied, chunk boundaries are
       adjusted so they do not fall inside on-target regions.'''
    total_size = os.path.getsize(bam_fn)
    bam = pysam.Samfile(bam_fn)

    # File offset of the first alignment on each contig; contigs with no reads
    # reuse the previous contig's offset so the per-contig size math still works.
    file_starts = []
    last_pos = 0
    for chrom in bam.references:
        offset = get_voffset(bam, chrom, 0)
        if offset is None:
            offset = last_pos
        else:
            last_pos = offset

        file_starts.append(offset)

    file_sizes = []
    for i in range(len(file_starts) - 1):
        file_sizes.append(file_starts[i+1] - file_starts[i])
    file_sizes.append(total_size - file_starts[-1])

    loci = []
    for (chrom, file_start, file_size, chrom_size) in zip(bam.references, file_starts, file_sizes, bam.lengths):
        if contig_whitelist is None or chrom in contig_whitelist:
        
            n_chunks = max(1, int(math.ceil(float(file_size) / 1e9 / gb_per_chunk)))
            chunk_size = int(file_size / n_chunks)
            chunk_starts = []

            for i in range(n_chunks):
                if i == 0:
                    pos = 0
                else:
                    voffset = file_start + chunk_size * i
                    _pos = find_pos_of_voffset(bam, chrom, voffset, err=chunk_size/20)
                    pos = adjust_start(chrom, _pos, target_regions)

                # Don't create very small chunks
                if len(chunk_starts) > 0 and pos - chunk_starts[-1] < 100:
                    continue

                chunk_starts.append(pos)

            for i in range(len(chunk_starts)):
                if i < len(chunk_starts) - 1:
                    locus = tk_io.create_locus_info(chrom, chunk_starts[i], chunk_starts[i+1])
                else:
                    locus = tk_io.create_locus_info(chrom, chunk_starts[i], chrom_size)

                loci.append(locus)

    validate_loci(bam, loci, contig_whitelist)
    chunks = []
    for l in loci:
        chunk = {'locus': l}
        chunk.update(extra_args)
        chunks.append(chunk)

    return chunks
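
A hedged usage sketch of get_sized_bam_chunks; the BAM path, chunk size, whitelist and extra args below are assumptions rather than values from the original code, and the tenkit helpers it relies on must be importable.

# Hypothetical invocation (all argument values are illustrative).
chunks = get_sized_bam_chunks(
    '/data/sample/possorted.bam',        # assumed coordinate-sorted, indexed BAM
    gb_per_chunk=0.5,                    # target ~0.5 GB of compressed BAM per locus
    contig_whitelist={'chr1', 'chr2'},   # only whitelisted contigs produce chunks
    target_regions=None,                 # no on-target regions to steer around
    extra_args={'__mem_gb': 4},          # merged into every chunk dict
)
# Each element is a dict holding a 'locus' string plus the extra args.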
Example 3
def generate_tiling_windows(input_bam, locus_size, overlap=0):
    ''' Generate a list of (chrom, start, end) loci that tile over all the references in the bam file '''

    chroms = input_bam.references
    chrom_lengths = input_bam.lengths

    loci = []
    for (chrom, length) in zip(chroms, chrom_lengths):
        start = 0
        while start + locus_size + overlap < length:
            stop = start + locus_size + overlap
            loci.append(tk_io.create_locus_info(chrom, start, stop))
            start += locus_size
        loci.append(tk_io.create_locus_info(chrom, start, length))

    return loci
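
To make the window arithmetic concrete, here is a small self-contained illustration; plain (start, end) tuples stand in for the tk_io locus strings and the toy sizes are made up.

# Toy re-implementation of the tiling arithmetic: windows of locus_size + overlap,
# stepping by locus_size, with a final window running to the end of the reference.
def toy_tiling_windows(length, locus_size, overlap=0):
    loci = []
    start = 0
    while start + locus_size + overlap < length:
        loci.append((start, start + locus_size + overlap))
        start += locus_size
    loci.append((start, length))
    return loci

print(toy_tiling_windows(25, 10, overlap=2))
# [(0, 12), (10, 22), (20, 25)] -- consecutive windows share `overlap` bases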
Example 4
def split(args):
    input_bam = tk_bam.create_bam_infile(args.bam_infile)

    chroms = input_bam.references
    chrom_lengths = input_bam.lengths

    loci = []
    for (chrom, length) in zip(chroms, chrom_lengths):
        bad_chrom = ('random' in chrom or 'U' in chrom or 'hap' in chrom)
        if bad_chrom or chrom[:3] != 'chr' or chrom == 'chrM' or chrom == 'chrY':
            continue
        start = 0
        while start + tenkit.constants.PARALLEL_LOCUS_SIZE < length:
            stop = start + tenkit.constants.PARALLEL_LOCUS_SIZE
            loci.append({'locus': tk_io.create_locus_info(chrom, start, stop)})
            start += tenkit.constants.PARALLEL_LOCUS_SIZE
        loci.append({'locus': tk_io.create_locus_info(chrom, start, length)})

    return {'chunks': loci, 'join': {'__mem_gb': 12.0}}
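
The contig filter above is easy to sanity-check in isolation; the helper function and contig names below are illustrative only.

# Self-contained check of the filter logic above (names are hypothetical examples).
def keep_chrom(chrom):
    bad = ('random' in chrom) or ('U' in chrom) or ('hap' in chrom)
    return (not bad) and chrom[:3] == 'chr' and chrom not in ('chrM', 'chrY')

assert keep_chrom('chr1')
assert not keep_chrom('chrY')                   # chrM and chrY are excluded explicitly
assert not keep_chrom('chr1_gl000191_random')   # unplaced 'random' contigs excluded
assert not keep_chrom('chrUn_gl000220')         # 'U' also catches chrUn_* contigs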
Example 5
def split(args):
    input_bam = tk_bam.create_bam_infile(args.bam_infile)

    # chromosomes should be ordered as chr1, chr11, chr12, ..., chr19, chr2
    chroms0 = input_bam.references
    chrom_lengths0 = input_bam.lengths

    chroms, chrom_lengths = (list(t) for t in zip(
        *sorted(zip(chroms0, chrom_lengths0))))

    loci = []
    for (chrom, length) in zip(chroms, chrom_lengths):
        bad_chrom = ('random' in chrom or 'U' in chrom or 'hap' in chrom)
        if bad_chrom or chrom in ('chrM', 'chrY', 'M', 'Y'):
            continue
        start = 0
        while start + tenkit.constants.PARALLEL_LOCUS_SIZE < length:
            stop = start + tenkit.constants.PARALLEL_LOCUS_SIZE
            loci.append({'locus': tk_io.create_locus_info(chrom, start, stop)})
            start += tenkit.constants.PARALLEL_LOCUS_SIZE
        loci.append({'locus': tk_io.create_locus_info(chrom, start, length)})

    return {'chunks': loci, 'join': {'__mem_gb': 12.0}}
Example 6
def too_many_overhang_variants(pb, vfr, max_allowable_overhang_variants):
    # find overhangs
    chrom = pb.chrom
    overhang0_start = min(pb.start_left, pb.start_right)
    overhang0_end = max(pb.start_left, pb.start_right)
    overhang1_start = min(pb.end_left, pb.end_right)
    overhang1_end = max(pb.end_left, pb.end_right)

    if overhang1_start <= overhang0_end:
        overhang_loci = [(chrom, overhang0_start, overhang1_end)]
    else:
        overhang_loci = [(chrom, overhang0_start, overhang0_end),
                         (chrom, overhang1_start, overhang1_end)]

    badness = False
    for loc in overhang_loci:
        locus_string = tk_io.create_locus_info(*loc)
        variants = list(tk_io.get_variant_iterator_pos(vfr, None, locus_string))
        if len(variants) > max_allowable_overhang_variants:
            badness = True
    return badness
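
The subtle step here is deciding whether the two overhang intervals can be merged into one locus; a small self-contained sketch of just that decision, with made-up coordinates.

# Illustration of the overhang merge above: if the second interval starts at or
# before the end of the first, a single merged locus is queried instead of two.
def overhang_loci(chrom, start_left, start_right, end_left, end_right):
    o0 = (min(start_left, start_right), max(start_left, start_right))
    o1 = (min(end_left, end_right), max(end_left, end_right))
    if o1[0] <= o0[1]:
        return [(chrom, o0[0], o1[1])]
    return [(chrom, o0[0], o0[1]), (chrom, o1[0], o1[1])]

print(overhang_loci('chr1', 100, 150, 140, 200))  # overlapping -> one merged locus
print(overhang_loci('chr1', 100, 150, 500, 600))  # disjoint -> two separate loci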
Example 7
def get_variant_iterator(vfr, locus):
    """
    Wrapper to get around the fact that tk_io.get_variant_iterator_pos() takes a locus string instead of a tuple.
    """
    locus_string = tk_io.create_locus_info(locus.chrom, locus.start, locus.end)
    return tk_io.get_variant_iterator_pos(vfr, None, locus_string)
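
A hedged usage sketch; the namedtuple and coordinates are illustrative, and the wrapper only needs an object exposing chrom, start and end.

from collections import namedtuple

# Hypothetical locus object (any object with .chrom, .start and .end would do).
Locus = namedtuple('Locus', ['chrom', 'start', 'end'])

# vfr would be a variant file reader opened elsewhere; the call is shown commented
# out because it needs that reader and the tk_io module to actually run.
# records = list(get_variant_iterator(vfr, Locus('chr1', 1000000, 2000000)))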
Example 8
def generate_chrom_loci(target_regions, chrom, chrom_length, chunk_size, overlap=0):
    starts = [adjust_start(chrom, s, target_regions) for s in range(0, chrom_length, chunk_size)]
    ends = [min(chrom_length, s + overlap) for s in starts[1:]] + [chrom_length]
    chunks = [tk_io.create_locus_info(chrom, s, e) for (s, e) in zip(starts, ends)]
    return chunks
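
A hedged usage sketch of generate_chrom_loci; the contig name, length, chunk size and overlap are assumptions, and adjust_start / tk_io come from the surrounding module, so the call is shown commented out.

# Hypothetical call: tile a ~249 Mb contig into 50 Mb chunks with a 1 kb overlap,
# letting adjust_start() nudge each boundary off any on-target region.
# loci = generate_chrom_loci(target_regions=None, chrom='chr1',
#                            chrom_length=249250621, chunk_size=50000000,
#                            overlap=1000)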